Change to flush and close logic to fix #1760556.
[python.git] / Objects / unicodeobject.c
blob26aa7533bab43440c2cbf028e00f5e9d09f21f16
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
52 /* Limit for the Unicode object free list */
54 #define MAX_UNICODE_FREELIST_SIZE 1024
56 /* Limit for the Unicode object free list stay alive optimization.
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
62 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
66 Setting the limit to 0 effectively turns the feature off.
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
79 #else
80 # define BYTEORDER_IS_LITTLE_ENDIAN
81 #endif
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
91 #ifdef __cplusplus
92 extern "C" {
93 #endif
95 /* Free list for Unicode objects */
96 static PyUnicodeObject *unicode_freelist;
97 static int unicode_freelist_size;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject *unicode_empty;
102 /* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104 static PyUnicodeObject *unicode_latin1[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding[100];
115 Py_UNICODE
116 PyUnicode_GetMax(void)
118 #ifdef Py_UNICODE_WIDE
119 return 0x10FFFF;
120 #else
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
123 return 0xFFFF;
124 #endif
127 /* --- Bloom Filters ----------------------------------------------------- */
129 /* stuff to implement simple "bloom filters" for Unicode characters.
130 to keep things simple, we use a single bitmask, using the least 5
131 bits from each unicode characters as the bit index. */
133 /* the linebreak mask is set up by Unicode_Init below */
/* Integer type holding a bloom mask: one bit for each possible value
   of (ch & 0x1F). */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low five bits of ch is set in
   mask.  The shifted constant is an unsigned long so that bit 31 can be
   reached without shifting into the sign bit of an int. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* Cheap bloom pre-check followed by the exact line break test. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
144 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
146 /* calculate simple bloom-style bitmask for a given unicode string */
148 long mask;
149 Py_ssize_t i;
151 mask = 0;
152 for (i = 0; i < len; i++)
153 mask |= (1 << (ptr[i] & 0x1F));
155 return mask;
158 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
160 Py_ssize_t i;
162 for (i = 0; i < setlen; i++)
163 if (set[i] == chr)
164 return 1;
166 return 0;
/* Bloom pre-check followed by an exact membership test.  The expansion
   is wrapped in parentheses so the macro behaves as a single operand,
   e.g. under a leading '!'. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
172 /* --- Unicode Object ----------------------------------------------------- */
174 static
175 int unicode_resize(register PyUnicodeObject *unicode,
176 Py_ssize_t length)
178 void *oldstr;
180 /* Shortcut if there's nothing much to do. */
181 if (unicode->length == length)
182 goto reset;
184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
186 instead ! */
188 if (unicode == unicode_empty ||
189 (unicode->length == 1 &&
190 unicode->str[0] < 256U &&
191 unicode_latin1[unicode->str[0]] == unicode)) {
192 PyErr_SetString(PyExc_SystemError,
193 "can't resize shared unicode objects");
194 return -1;
197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
199 safe to look at str[length] (without making any assumptions about what
200 it contains). */
202 oldstr = unicode->str;
203 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
204 if (!unicode->str) {
205 unicode->str = (Py_UNICODE *)oldstr;
206 PyErr_NoMemory();
207 return -1;
209 unicode->str[length] = 0;
210 unicode->length = length;
212 reset:
213 /* Reset the object caches */
214 if (unicode->defenc) {
215 Py_DECREF(unicode->defenc);
216 unicode->defenc = NULL;
218 unicode->hash = -1;
220 return 0;
223 /* We allocate one more byte to make sure the string is
224 Ux0000 terminated -- XXX is this needed ?
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
231 static
232 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
234 register PyUnicodeObject *unicode;
236 /* Optimization for empty strings */
237 if (length == 0 && unicode_empty != NULL) {
238 Py_INCREF(unicode_empty);
239 return unicode_empty;
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist) {
244 unicode = unicode_freelist;
245 unicode_freelist = *(PyUnicodeObject **)unicode;
246 unicode_freelist_size--;
247 if (unicode->str) {
248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
250 if ((unicode->length < length) &&
251 unicode_resize(unicode, length) < 0) {
252 PyMem_DEL(unicode->str);
253 goto onError;
256 else {
257 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
259 PyObject_INIT(unicode, &PyUnicode_Type);
261 else {
262 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
263 if (unicode == NULL)
264 return NULL;
265 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
268 if (!unicode->str) {
269 PyErr_NoMemory();
270 goto onError;
272 /* Initialize the first element to guard against cases where
273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
277 * that case.
279 unicode->str[0] = 0;
280 unicode->str[length] = 0;
281 unicode->length = length;
282 unicode->hash = -1;
283 unicode->defenc = NULL;
284 return unicode;
286 onError:
287 _Py_ForgetReference((PyObject *)unicode);
288 PyObject_Del(unicode);
289 return NULL;
292 static
293 void unicode_dealloc(register PyUnicodeObject *unicode)
295 if (PyUnicode_CheckExact(unicode) &&
296 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
297 /* Keep-Alive optimization */
298 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
299 PyMem_DEL(unicode->str);
300 unicode->str = NULL;
301 unicode->length = 0;
303 if (unicode->defenc) {
304 Py_DECREF(unicode->defenc);
305 unicode->defenc = NULL;
307 /* Add to free list */
308 *(PyUnicodeObject **)unicode = unicode_freelist;
309 unicode_freelist = unicode;
310 unicode_freelist_size++;
312 else {
313 PyMem_DEL(unicode->str);
314 Py_XDECREF(unicode->defenc);
315 Py_Type(unicode)->tp_free((PyObject *)unicode);
319 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
321 register PyUnicodeObject *v;
323 /* Argument checks */
324 if (unicode == NULL) {
325 PyErr_BadInternalCall();
326 return -1;
328 v = (PyUnicodeObject *)*unicode;
329 if (v == NULL || !PyUnicode_Check(v) || Py_Refcnt(v) != 1 || length < 0) {
330 PyErr_BadInternalCall();
331 return -1;
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
337 if (v->length != length &&
338 (v == unicode_empty || v->length == 1)) {
339 PyUnicodeObject *w = _PyUnicode_New(length);
340 if (w == NULL)
341 return -1;
342 Py_UNICODE_COPY(w->str, v->str,
343 length < v->length ? length : v->length);
344 Py_DECREF(*unicode);
345 *unicode = (PyObject *)w;
346 return 0;
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v, length);
354 /* Internal API for use in unicodeobject.c only ! */
355 #define _PyUnicode_Resize(unicodevar, length) \
356 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
358 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
359 Py_ssize_t size)
361 PyUnicodeObject *unicode;
363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
365 if (u != NULL) {
367 /* Optimization for empty strings */
368 if (size == 0 && unicode_empty != NULL) {
369 Py_INCREF(unicode_empty);
370 return (PyObject *)unicode_empty;
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size == 1 && *u < 256) {
376 unicode = unicode_latin1[*u];
377 if (!unicode) {
378 unicode = _PyUnicode_New(1);
379 if (!unicode)
380 return NULL;
381 unicode->str[0] = *u;
382 unicode_latin1[*u] = unicode;
384 Py_INCREF(unicode);
385 return (PyObject *)unicode;
389 unicode = _PyUnicode_New(size);
390 if (!unicode)
391 return NULL;
393 /* Copy the Unicode data into the new object */
394 if (u != NULL)
395 Py_UNICODE_COPY(unicode->str, u, size);
397 return (PyObject *)unicode;
400 #ifdef HAVE_WCHAR_H
402 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
403 Py_ssize_t size)
405 PyUnicodeObject *unicode;
407 if (w == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
412 unicode = _PyUnicode_New(size);
413 if (!unicode)
414 return NULL;
416 /* Copy the wchar_t data into the new object */
417 #ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode->str, w, size * sizeof(wchar_t));
419 #else
421 register Py_UNICODE *u;
422 register Py_ssize_t i;
423 u = PyUnicode_AS_UNICODE(unicode);
424 for (i = size; i > 0; i--)
425 *u++ = *w++;
427 #endif
429 return (PyObject *)unicode;
432 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
433 wchar_t *w,
434 Py_ssize_t size)
436 if (unicode == NULL) {
437 PyErr_BadInternalCall();
438 return -1;
441 /* If possible, try to copy the 0-termination as well */
442 if (size > PyUnicode_GET_SIZE(unicode))
443 size = PyUnicode_GET_SIZE(unicode) + 1;
445 #ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w, unicode->str, size * sizeof(wchar_t));
447 #else
449 register Py_UNICODE *u;
450 register Py_ssize_t i;
451 u = PyUnicode_AS_UNICODE(unicode);
452 for (i = size; i > 0; i--)
453 *w++ = *u++;
455 #endif
457 if (size > PyUnicode_GET_SIZE(unicode))
458 return PyUnicode_GET_SIZE(unicode);
459 else
460 return size;
463 #endif
465 PyObject *PyUnicode_FromOrdinal(int ordinal)
467 Py_UNICODE s[1];
469 #ifdef Py_UNICODE_WIDE
470 if (ordinal < 0 || ordinal > 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
474 return NULL;
476 #else
477 if (ordinal < 0 || ordinal > 0xffff) {
478 PyErr_SetString(PyExc_ValueError,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
481 return NULL;
483 #endif
485 s[0] = (Py_UNICODE)ordinal;
486 return PyUnicode_FromUnicode(s, 1);
489 PyObject *PyUnicode_FromObject(register PyObject *obj)
491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj)) {
494 Py_INCREF(obj);
495 return obj;
497 if (PyUnicode_Check(obj)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
501 PyUnicode_GET_SIZE(obj));
503 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
506 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
507 const char *encoding,
508 const char *errors)
510 const char *s = NULL;
511 Py_ssize_t len;
512 PyObject *v;
514 if (obj == NULL) {
515 PyErr_BadInternalCall();
516 return NULL;
519 #if 0
520 /* For b/w compatibility we also accept Unicode objects provided
521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
523 Unicode subclasses.
525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
529 if (PyUnicode_Check(obj)) {
530 if (encoding) {
531 PyErr_SetString(PyExc_TypeError,
532 "decoding Unicode is not supported");
533 return NULL;
535 return PyObject_Unicode(obj);
537 #else
538 if (PyUnicode_Check(obj)) {
539 PyErr_SetString(PyExc_TypeError,
540 "decoding Unicode is not supported");
541 return NULL;
543 #endif
545 /* Coerce object */
546 if (PyString_Check(obj)) {
547 s = PyString_AS_STRING(obj);
548 len = PyString_GET_SIZE(obj);
550 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError))
554 PyErr_Format(PyExc_TypeError,
555 "coercing to Unicode: need string or buffer, "
556 "%.80s found",
557 Py_Type(obj)->tp_name);
558 goto onError;
561 /* Convert to Unicode */
562 if (len == 0) {
563 Py_INCREF(unicode_empty);
564 v = (PyObject *)unicode_empty;
566 else
567 v = PyUnicode_Decode(s, len, encoding, errors);
569 return v;
571 onError:
572 return NULL;
575 PyObject *PyUnicode_Decode(const char *s,
576 Py_ssize_t size,
577 const char *encoding,
578 const char *errors)
580 PyObject *buffer = NULL, *unicode;
582 if (encoding == NULL)
583 encoding = PyUnicode_GetDefaultEncoding();
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding, "utf-8") == 0)
587 return PyUnicode_DecodeUTF8(s, size, errors);
588 else if (strcmp(encoding, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s, size, errors);
590 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s, size, errors);
593 #endif
594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s, size, errors);
597 /* Decode via the codec registry */
598 buffer = PyBuffer_FromMemory((void *)s, size);
599 if (buffer == NULL)
600 goto onError;
601 unicode = PyCodec_Decode(buffer, encoding, errors);
602 if (unicode == NULL)
603 goto onError;
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_Format(PyExc_TypeError,
606 "decoder did not return an unicode object (type=%.400s)",
607 Py_Type(unicode)->tp_name);
608 Py_DECREF(unicode);
609 goto onError;
611 Py_DECREF(buffer);
612 return unicode;
614 onError:
615 Py_XDECREF(buffer);
616 return NULL;
619 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
620 const char *encoding,
621 const char *errors)
623 PyObject *v;
625 if (!PyUnicode_Check(unicode)) {
626 PyErr_BadArgument();
627 goto onError;
630 if (encoding == NULL)
631 encoding = PyUnicode_GetDefaultEncoding();
633 /* Decode via the codec registry */
634 v = PyCodec_Decode(unicode, encoding, errors);
635 if (v == NULL)
636 goto onError;
637 return v;
639 onError:
640 return NULL;
643 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
644 Py_ssize_t size,
645 const char *encoding,
646 const char *errors)
648 PyObject *v, *unicode;
650 unicode = PyUnicode_FromUnicode(s, size);
651 if (unicode == NULL)
652 return NULL;
653 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
654 Py_DECREF(unicode);
655 return v;
658 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
659 const char *encoding,
660 const char *errors)
662 PyObject *v;
664 if (!PyUnicode_Check(unicode)) {
665 PyErr_BadArgument();
666 goto onError;
669 if (encoding == NULL)
670 encoding = PyUnicode_GetDefaultEncoding();
672 /* Encode via the codec registry */
673 v = PyCodec_Encode(unicode, encoding, errors);
674 if (v == NULL)
675 goto onError;
676 return v;
678 onError:
679 return NULL;
682 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
683 const char *encoding,
684 const char *errors)
686 PyObject *v;
688 if (!PyUnicode_Check(unicode)) {
689 PyErr_BadArgument();
690 goto onError;
693 if (encoding == NULL)
694 encoding = PyUnicode_GetDefaultEncoding();
696 /* Shortcuts for common default encodings */
697 if (errors == NULL) {
698 if (strcmp(encoding, "utf-8") == 0)
699 return PyUnicode_AsUTF8String(unicode);
700 else if (strcmp(encoding, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode);
702 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode);
705 #endif
706 else if (strcmp(encoding, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode);
710 /* Encode via the codec registry */
711 v = PyCodec_Encode(unicode, encoding, errors);
712 if (v == NULL)
713 goto onError;
714 if (!PyString_Check(v)) {
715 PyErr_Format(PyExc_TypeError,
716 "encoder did not return a string object (type=%.400s)",
717 Py_Type(v)->tp_name);
718 Py_DECREF(v);
719 goto onError;
721 return v;
723 onError:
724 return NULL;
727 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
728 const char *errors)
730 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
732 if (v)
733 return v;
734 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
735 if (v && errors == NULL)
736 ((PyUnicodeObject *)unicode)->defenc = v;
737 return v;
740 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
742 if (!PyUnicode_Check(unicode)) {
743 PyErr_BadArgument();
744 goto onError;
746 return PyUnicode_AS_UNICODE(unicode);
748 onError:
749 return NULL;
752 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
754 if (!PyUnicode_Check(unicode)) {
755 PyErr_BadArgument();
756 goto onError;
758 return PyUnicode_GET_SIZE(unicode);
760 onError:
761 return -1;
764 const char *PyUnicode_GetDefaultEncoding(void)
766 return unicode_default_encoding;
769 int PyUnicode_SetDefaultEncoding(const char *encoding)
771 PyObject *v;
773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v = _PyCodec_Lookup(encoding);
776 if (v == NULL)
777 goto onError;
778 Py_DECREF(v);
779 strncpy(unicode_default_encoding,
780 encoding,
781 sizeof(unicode_default_encoding));
782 return 0;
784 onError:
785 return -1;
788 /* error handling callback helper:
789 build arguments, call the callback and check the arguments,
790 if no exception occurred, copy the replacement to the output
791 and adjust various state variables.
792 return 0 on success, -1 on error
795 static
796 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
797 const char *encoding, const char *reason,
798 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
799 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
800 PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
802 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
804 PyObject *restuple = NULL;
805 PyObject *repunicode = NULL;
806 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
807 Py_ssize_t requiredsize;
808 Py_ssize_t newpos;
809 Py_UNICODE *repptr;
810 Py_ssize_t repsize;
811 int res = -1;
813 if (*errorHandler == NULL) {
814 *errorHandler = PyCodec_LookupError(errors);
815 if (*errorHandler == NULL)
816 goto onError;
819 if (*exceptionObject == NULL) {
820 *exceptionObject = PyUnicodeDecodeError_Create(
821 encoding, input, insize, *startinpos, *endinpos, reason);
822 if (*exceptionObject == NULL)
823 goto onError;
825 else {
826 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
827 goto onError;
828 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
829 goto onError;
830 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
831 goto onError;
834 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
835 if (restuple == NULL)
836 goto onError;
837 if (!PyTuple_Check(restuple)) {
838 PyErr_Format(PyExc_TypeError, &argparse[4]);
839 goto onError;
841 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
842 goto onError;
843 if (newpos<0)
844 newpos = insize+newpos;
845 if (newpos<0 || newpos>insize) {
846 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
847 goto onError;
850 /* need more space? (at least enough for what we
851 have+the replacement+the rest of the string (starting
852 at the new input position), so we won't have to check space
853 when there are no errors in the rest of the string) */
854 repptr = PyUnicode_AS_UNICODE(repunicode);
855 repsize = PyUnicode_GET_SIZE(repunicode);
856 requiredsize = *outpos + repsize + insize-newpos;
857 if (requiredsize > outsize) {
858 if (requiredsize<2*outsize)
859 requiredsize = 2*outsize;
860 if (PyUnicode_Resize(output, requiredsize) < 0)
861 goto onError;
862 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
864 *endinpos = newpos;
865 *inptr = input + newpos;
866 Py_UNICODE_COPY(*outptr, repptr, repsize);
867 *outptr += repsize;
868 *outpos += repsize;
869 /* we made it! */
870 res = 0;
872 onError:
873 Py_XDECREF(restuple);
874 return res;
877 /* --- UTF-7 Codec -------------------------------------------------------- */
879 /* see RFC2152 for details */
/* Read-only classification table for the 128 ASCII code points. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to the corresponding base64 digit. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  Explicit range checks are
   used instead of isalnum(): the argument may be a value outside the
   range isalnum() is defined for, and the answer must not depend on the
   current locale.  Like UB64 below, this assumes ASCII character codes.
   c is evaluated more than once. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')

/* Map a base64 digit back to its six-bit value. */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?             \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
/* Emit base64 digits through out for every complete group of six bits
   held in ch, most significant group first; bits is reduced by six for
   each digit written. */
#define ENCODE(out, ch, bits) \
    for (; bits >= 6; bits -= 6) { \
        *out++ = B64(ch >> (bits-6)); \
    }

/* Emit one Py_UNICODE through out for every complete group of 16 bits
   held in ch.  Relies on the enclosing function providing an errmsg
   variable and a utf7Error label. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
943 PyObject *PyUnicode_DecodeUTF7(const char *s,
944 Py_ssize_t size,
945 const char *errors)
947 const char *starts = s;
948 Py_ssize_t startinpos;
949 Py_ssize_t endinpos;
950 Py_ssize_t outpos;
951 const char *e;
952 PyUnicodeObject *unicode;
953 Py_UNICODE *p;
954 const char *errmsg = "";
955 int inShift = 0;
956 unsigned int bitsleft = 0;
957 unsigned long charsleft = 0;
958 int surrogate = 0;
959 PyObject *errorHandler = NULL;
960 PyObject *exc = NULL;
962 unicode = _PyUnicode_New(size);
963 if (!unicode)
964 return NULL;
965 if (size == 0)
966 return (PyObject *)unicode;
968 p = unicode->str;
969 e = s + size;
971 while (s < e) {
972 Py_UNICODE ch;
973 restart:
974 ch = *s;
976 if (inShift) {
977 if ((ch == '-') || !B64CHAR(ch)) {
978 inShift = 0;
979 s++;
981 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
982 if (bitsleft >= 6) {
983 /* The shift sequence has a partial character in it. If
984 bitsleft < 6 then we could just classify it as padding
985 but that is not the case here */
987 errmsg = "partial character in shift sequence";
988 goto utf7Error;
990 /* According to RFC2152 the remaining bits should be zero. We
991 choose to signal an error/insert a replacement character
992 here so indicate the potential of a misencoded character. */
994 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
995 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
996 errmsg = "non-zero padding bits in shift sequence";
997 goto utf7Error;
1000 if (ch == '-') {
1001 if ((s < e) && (*(s) == '-')) {
1002 *p++ = '-';
1003 inShift = 1;
1005 } else if (SPECIAL(ch,0,0)) {
1006 errmsg = "unexpected special character";
1007 goto utf7Error;
1008 } else {
1009 *p++ = ch;
1011 } else {
1012 charsleft = (charsleft << 6) | UB64(ch);
1013 bitsleft += 6;
1014 s++;
1015 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1018 else if ( ch == '+' ) {
1019 startinpos = s-starts;
1020 s++;
1021 if (s < e && *s == '-') {
1022 s++;
1023 *p++ = '+';
1024 } else
1026 inShift = 1;
1027 bitsleft = 0;
1030 else if (SPECIAL(ch,0,0)) {
1031 startinpos = s-starts;
1032 errmsg = "unexpected special character";
1033 s++;
1034 goto utf7Error;
1036 else {
1037 *p++ = ch;
1038 s++;
1040 continue;
1041 utf7Error:
1042 outpos = p-PyUnicode_AS_UNICODE(unicode);
1043 endinpos = s-starts;
1044 if (unicode_decode_call_errorhandler(
1045 errors, &errorHandler,
1046 "utf7", errmsg,
1047 starts, size, &startinpos, &endinpos, &exc, &s,
1048 (PyObject **)&unicode, &outpos, &p))
1049 goto onError;
1052 if (inShift) {
1053 outpos = p-PyUnicode_AS_UNICODE(unicode);
1054 endinpos = size;
1055 if (unicode_decode_call_errorhandler(
1056 errors, &errorHandler,
1057 "utf7", "unterminated shift sequence",
1058 starts, size, &startinpos, &endinpos, &exc, &s,
1059 (PyObject **)&unicode, &outpos, &p))
1060 goto onError;
1061 if (s < e)
1062 goto restart;
1065 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1066 goto onError;
1068 Py_XDECREF(errorHandler);
1069 Py_XDECREF(exc);
1070 return (PyObject *)unicode;
1072 onError:
1073 Py_XDECREF(errorHandler);
1074 Py_XDECREF(exc);
1075 Py_DECREF(unicode);
1076 return NULL;
1080 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1081 Py_ssize_t size,
1082 int encodeSetO,
1083 int encodeWhiteSpace,
1084 const char *errors)
1086 PyObject *v;
1087 /* It might be possible to tighten this worst case */
1088 Py_ssize_t cbAllocated = 5 * size;
1089 int inShift = 0;
1090 Py_ssize_t i = 0;
1091 unsigned int bitsleft = 0;
1092 unsigned long charsleft = 0;
1093 char * out;
1094 char * start;
1096 if (size == 0)
1097 return PyString_FromStringAndSize(NULL, 0);
1099 v = PyString_FromStringAndSize(NULL, cbAllocated);
1100 if (v == NULL)
1101 return NULL;
1103 start = out = PyString_AS_STRING(v);
1104 for (;i < size; ++i) {
1105 Py_UNICODE ch = s[i];
1107 if (!inShift) {
1108 if (ch == '+') {
1109 *out++ = '+';
1110 *out++ = '-';
1111 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1112 charsleft = ch;
1113 bitsleft = 16;
1114 *out++ = '+';
1115 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1116 inShift = bitsleft > 0;
1117 } else {
1118 *out++ = (char) ch;
1120 } else {
1121 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1122 *out++ = B64(charsleft << (6-bitsleft));
1123 charsleft = 0;
1124 bitsleft = 0;
1125 /* Characters not in the BASE64 set implicitly unshift the sequence
1126 so no '-' is required, except if the character is itself a '-' */
1127 if (B64CHAR(ch) || ch == '-') {
1128 *out++ = '-';
1130 inShift = 0;
1131 *out++ = (char) ch;
1132 } else {
1133 bitsleft += 16;
1134 charsleft = (charsleft << 16) | ch;
1135 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1137 /* If the next character is special then we dont' need to terminate
1138 the shift sequence. If the next character is not a BASE64 character
1139 or '-' then the shift sequence will be terminated implicitly and we
1140 don't have to insert a '-'. */
1142 if (bitsleft == 0) {
1143 if (i + 1 < size) {
1144 Py_UNICODE ch2 = s[i+1];
1146 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1148 } else if (B64CHAR(ch2) || ch2 == '-') {
1149 *out++ = '-';
1150 inShift = 0;
1151 } else {
1152 inShift = 0;
1156 else {
1157 *out++ = '-';
1158 inShift = 0;
1164 if (bitsleft) {
1165 *out++= B64(charsleft << (6-bitsleft) );
1166 *out++ = '-';
1169 _PyString_Resize(&v, out - start);
1170 return v;
1173 #undef SPECIAL
1174 #undef B64
1175 #undef B64CHAR
1176 #undef UB64
1177 #undef ENCODE
1178 #undef DECODE
1180 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Read-only lookup table indexed by the first byte of a UTF-8 sequence. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1204 PyObject *PyUnicode_DecodeUTF8(const char *s,
1205 Py_ssize_t size,
1206 const char *errors)
1208 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1211 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1212 Py_ssize_t size,
1213 const char *errors,
1214 Py_ssize_t *consumed)
1216 const char *starts = s;
1217 int n;
1218 Py_ssize_t startinpos;
1219 Py_ssize_t endinpos;
1220 Py_ssize_t outpos;
1221 const char *e;
1222 PyUnicodeObject *unicode;
1223 Py_UNICODE *p;
1224 const char *errmsg = "";
1225 PyObject *errorHandler = NULL;
1226 PyObject *exc = NULL;
1228 /* Note: size will always be longer than the resulting Unicode
1229 character count */
1230 unicode = _PyUnicode_New(size);
1231 if (!unicode)
1232 return NULL;
1233 if (size == 0) {
1234 if (consumed)
1235 *consumed = 0;
1236 return (PyObject *)unicode;
1239 /* Unpack UTF-8 encoded data */
1240 p = unicode->str;
1241 e = s + size;
1243 while (s < e) {
1244 Py_UCS4 ch = (unsigned char)*s;
1246 if (ch < 0x80) {
1247 *p++ = (Py_UNICODE)ch;
1248 s++;
1249 continue;
1252 n = utf8_code_length[ch];
1254 if (s + n > e) {
1255 if (consumed)
1256 break;
1257 else {
1258 errmsg = "unexpected end of data";
1259 startinpos = s-starts;
1260 endinpos = size;
1261 goto utf8Error;
1265 switch (n) {
1267 case 0:
1268 errmsg = "unexpected code byte";
1269 startinpos = s-starts;
1270 endinpos = startinpos+1;
1271 goto utf8Error;
1273 case 1:
1274 errmsg = "internal error";
1275 startinpos = s-starts;
1276 endinpos = startinpos+1;
1277 goto utf8Error;
1279 case 2:
1280 if ((s[1] & 0xc0) != 0x80) {
1281 errmsg = "invalid data";
1282 startinpos = s-starts;
1283 endinpos = startinpos+2;
1284 goto utf8Error;
1286 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1287 if (ch < 0x80) {
1288 startinpos = s-starts;
1289 endinpos = startinpos+2;
1290 errmsg = "illegal encoding";
1291 goto utf8Error;
1293 else
1294 *p++ = (Py_UNICODE)ch;
1295 break;
1297 case 3:
1298 if ((s[1] & 0xc0) != 0x80 ||
1299 (s[2] & 0xc0) != 0x80) {
1300 errmsg = "invalid data";
1301 startinpos = s-starts;
1302 endinpos = startinpos+3;
1303 goto utf8Error;
1305 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1306 if (ch < 0x0800) {
1307 /* Note: UTF-8 encodings of surrogates are considered
1308 legal UTF-8 sequences;
1310 XXX For wide builds (UCS-4) we should probably try
1311 to recombine the surrogates into a single code
1312 unit.
1314 errmsg = "illegal encoding";
1315 startinpos = s-starts;
1316 endinpos = startinpos+3;
1317 goto utf8Error;
1319 else
1320 *p++ = (Py_UNICODE)ch;
1321 break;
1323 case 4:
1324 if ((s[1] & 0xc0) != 0x80 ||
1325 (s[2] & 0xc0) != 0x80 ||
1326 (s[3] & 0xc0) != 0x80) {
1327 errmsg = "invalid data";
1328 startinpos = s-starts;
1329 endinpos = startinpos+4;
1330 goto utf8Error;
1332 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1333 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1334 /* validate and convert to UTF-16 */
1335 if ((ch < 0x10000) /* minimum value allowed for 4
1336 byte encoding */
1337 || (ch > 0x10ffff)) /* maximum value allowed for
1338 UTF-16 */
1340 errmsg = "illegal encoding";
1341 startinpos = s-starts;
1342 endinpos = startinpos+4;
1343 goto utf8Error;
1345 #ifdef Py_UNICODE_WIDE
1346 *p++ = (Py_UNICODE)ch;
1347 #else
1348 /* compute and append the two surrogates: */
1350 /* translate from 10000..10FFFF to 0..FFFF */
1351 ch -= 0x10000;
1353 /* high surrogate = top 10 bits added to D800 */
1354 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1356 /* low surrogate = bottom 10 bits added to DC00 */
1357 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1358 #endif
1359 break;
1361 default:
1362 /* Other sizes are only needed for UCS-4 */
1363 errmsg = "unsupported Unicode code range";
1364 startinpos = s-starts;
1365 endinpos = startinpos+n;
1366 goto utf8Error;
1368 s += n;
1369 continue;
1371 utf8Error:
1372 outpos = p-PyUnicode_AS_UNICODE(unicode);
1373 if (unicode_decode_call_errorhandler(
1374 errors, &errorHandler,
1375 "utf8", errmsg,
1376 starts, size, &startinpos, &endinpos, &exc, &s,
1377 (PyObject **)&unicode, &outpos, &p))
1378 goto onError;
1380 if (consumed)
1381 *consumed = s-starts;
1383 /* Adjust length */
1384 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1385 goto onError;
1387 Py_XDECREF(errorHandler);
1388 Py_XDECREF(exc);
1389 return (PyObject *)unicode;
1391 onError:
1392 Py_XDECREF(errorHandler);
1393 Py_XDECREF(exc);
1394 Py_DECREF(unicode);
1395 return NULL;
1398 /* Allocation strategy: if the string is short, convert into a stack buffer
1399 and allocate exactly as much space needed at the end. Else allocate the
1400 maximum possible needed (4 result bytes per Unicode character), and return
1401 the excess memory at the end.
1403 PyObject *
1404 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1405 Py_ssize_t size,
1406 const char *errors)
1408 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1410 Py_ssize_t i; /* index into s of next input byte */
1411 PyObject *v; /* result string object */
1412 char *p; /* next free byte in output buffer */
1413 Py_ssize_t nallocated; /* number of result bytes allocated */
1414 Py_ssize_t nneeded; /* number of result bytes needed */
1415 char stackbuf[MAX_SHORT_UNICHARS * 4];
1417 assert(s != NULL);
1418 assert(size >= 0);
1420 if (size <= MAX_SHORT_UNICHARS) {
1421 /* Write into the stack buffer; nallocated can't overflow.
1422 * At the end, we'll allocate exactly as much heap space as it
1423 * turns out we need.
1425 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1426 v = NULL; /* will allocate after we're done */
1427 p = stackbuf;
1429 else {
1430 /* Overallocate on the heap, and give the excess back at the end. */
1431 nallocated = size * 4;
1432 if (nallocated / 4 != size) /* overflow! */
1433 return PyErr_NoMemory();
1434 v = PyString_FromStringAndSize(NULL, nallocated);
1435 if (v == NULL)
1436 return NULL;
1437 p = PyString_AS_STRING(v);
1440 for (i = 0; i < size;) {
1441 Py_UCS4 ch = s[i++];
1443 if (ch < 0x80)
1444 /* Encode ASCII */
1445 *p++ = (char) ch;
1447 else if (ch < 0x0800) {
1448 /* Encode Latin-1 */
1449 *p++ = (char)(0xc0 | (ch >> 6));
1450 *p++ = (char)(0x80 | (ch & 0x3f));
1452 else {
1453 /* Encode UCS2 Unicode ordinals */
1454 if (ch < 0x10000) {
1455 /* Special case: check for high surrogate */
1456 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1457 Py_UCS4 ch2 = s[i];
1458 /* Check for low surrogate and combine the two to
1459 form a UCS4 value */
1460 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1461 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1462 i++;
1463 goto encodeUCS4;
1465 /* Fall through: handles isolated high surrogates */
1467 *p++ = (char)(0xe0 | (ch >> 12));
1468 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1469 *p++ = (char)(0x80 | (ch & 0x3f));
1470 continue;
1472 encodeUCS4:
1473 /* Encode UCS4 Unicode ordinals */
1474 *p++ = (char)(0xf0 | (ch >> 18));
1475 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1476 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1477 *p++ = (char)(0x80 | (ch & 0x3f));
1481 if (v == NULL) {
1482 /* This was stack allocated. */
1483 nneeded = p - stackbuf;
1484 assert(nneeded <= nallocated);
1485 v = PyString_FromStringAndSize(stackbuf, nneeded);
1487 else {
1488 /* Cut back to size actually needed. */
1489 nneeded = p - PyString_AS_STRING(v);
1490 assert(nneeded <= nallocated);
1491 _PyString_Resize(&v, nneeded);
1493 return v;
1495 #undef MAX_SHORT_UNICHARS
1498 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1500 if (!PyUnicode_Check(unicode)) {
1501 PyErr_BadArgument();
1502 return NULL;
1504 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1505 PyUnicode_GET_SIZE(unicode),
1506 NULL);
1509 /* --- UTF-32 Codec ------------------------------------------------------- */
1511 PyObject *
1512 PyUnicode_DecodeUTF32(const char *s,
1513 Py_ssize_t size,
1514 const char *errors,
1515 int *byteorder)
1517 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
1520 PyObject *
1521 PyUnicode_DecodeUTF32Stateful(const char *s,
1522 Py_ssize_t size,
1523 const char *errors,
1524 int *byteorder,
1525 Py_ssize_t *consumed)
1527 const char *starts = s;
1528 Py_ssize_t startinpos;
1529 Py_ssize_t endinpos;
1530 Py_ssize_t outpos;
1531 PyUnicodeObject *unicode;
1532 Py_UNICODE *p;
1533 #ifndef Py_UNICODE_WIDE
1534 int i, pairs;
1535 #else
1536 const int pairs = 0;
1537 #endif
1538 const unsigned char *q, *e;
1539 int bo = 0; /* assume native ordering by default */
1540 const char *errmsg = "";
1541 /* Offsets from q for retrieving bytes in the right order. */
1542 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1543 int iorder[] = {0, 1, 2, 3};
1544 #else
1545 int iorder[] = {3, 2, 1, 0};
1546 #endif
1547 PyObject *errorHandler = NULL;
1548 PyObject *exc = NULL;
1549 /* On narrow builds we split characters outside the BMP into two
1550 codepoints => count how much extra space we need. */
1551 #ifndef Py_UNICODE_WIDE
1552 for (i = pairs = 0; i < size/4; i++)
1553 if (((Py_UCS4 *)s)[i] >= 0x10000)
1554 pairs++;
1555 #endif
1557 /* This might be one to much, because of a BOM */
1558 unicode = _PyUnicode_New((size+3)/4+pairs);
1559 if (!unicode)
1560 return NULL;
1561 if (size == 0)
1562 return (PyObject *)unicode;
1564 /* Unpack UTF-32 encoded data */
1565 p = unicode->str;
1566 q = (unsigned char *)s;
1567 e = q + size;
1569 if (byteorder)
1570 bo = *byteorder;
1572 /* Check for BOM marks (U+FEFF) in the input and adjust current
1573 byte order setting accordingly. In native mode, the leading BOM
1574 mark is skipped, in all other modes, it is copied to the output
1575 stream as-is (giving a ZWNBSP character). */
1576 if (bo == 0) {
1577 if (size >= 4) {
1578 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
1579 (q[iorder[1]] << 8) | q[iorder[0]];
1580 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1581 if (bom == 0x0000FEFF) {
1582 q += 4;
1583 bo = -1;
1585 else if (bom == 0xFFFE0000) {
1586 q += 4;
1587 bo = 1;
1589 #else
1590 if (bom == 0x0000FEFF) {
1591 q += 4;
1592 bo = 1;
1594 else if (bom == 0xFFFE0000) {
1595 q += 4;
1596 bo = -1;
1598 #endif
1602 if (bo == -1) {
1603 /* force LE */
1604 iorder[0] = 0;
1605 iorder[1] = 1;
1606 iorder[2] = 2;
1607 iorder[3] = 3;
1609 else if (bo == 1) {
1610 /* force BE */
1611 iorder[0] = 3;
1612 iorder[1] = 2;
1613 iorder[2] = 1;
1614 iorder[3] = 0;
1617 while (q < e) {
1618 Py_UCS4 ch;
1619 /* remaining bytes at the end? (size should be divisible by 4) */
1620 if (e-q<4) {
1621 if (consumed)
1622 break;
1623 errmsg = "truncated data";
1624 startinpos = ((const char *)q)-starts;
1625 endinpos = ((const char *)e)-starts;
1626 goto utf32Error;
1627 /* The remaining input chars are ignored if the callback
1628 chooses to skip the input */
1630 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
1631 (q[iorder[1]] << 8) | q[iorder[0]];
1633 if (ch >= 0x110000)
1635 errmsg = "codepoint not in range(0x110000)";
1636 startinpos = ((const char *)q)-starts;
1637 endinpos = startinpos+4;
1638 goto utf32Error;
1640 #ifndef Py_UNICODE_WIDE
1641 if (ch >= 0x10000)
1643 *p++ = 0xD800 | ((ch-0x10000) >> 10);
1644 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
1646 else
1647 #endif
1648 *p++ = ch;
1649 q += 4;
1650 continue;
1651 utf32Error:
1652 outpos = p-PyUnicode_AS_UNICODE(unicode);
1653 if (unicode_decode_call_errorhandler(
1654 errors, &errorHandler,
1655 "utf32", errmsg,
1656 starts, size, &startinpos, &endinpos, &exc, &s,
1657 (PyObject **)&unicode, &outpos, &p))
1658 goto onError;
1661 if (byteorder)
1662 *byteorder = bo;
1664 if (consumed)
1665 *consumed = (const char *)q-starts;
1667 /* Adjust length */
1668 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1669 goto onError;
1671 Py_XDECREF(errorHandler);
1672 Py_XDECREF(exc);
1673 return (PyObject *)unicode;
1675 onError:
1676 Py_DECREF(unicode);
1677 Py_XDECREF(errorHandler);
1678 Py_XDECREF(exc);
1679 return NULL;
1682 PyObject *
1683 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
1684 Py_ssize_t size,
1685 const char *errors,
1686 int byteorder)
1688 PyObject *v;
1689 unsigned char *p;
1690 #ifndef Py_UNICODE_WIDE
1691 int i, pairs;
1692 #else
1693 const int pairs = 0;
1694 #endif
1695 /* Offsets from p for storing byte pairs in the right order. */
1696 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1697 int iorder[] = {0, 1, 2, 3};
1698 #else
1699 int iorder[] = {3, 2, 1, 0};
1700 #endif
1702 #define STORECHAR(CH) \
1703 do { \
1704 p[iorder[3]] = ((CH) >> 24) & 0xff; \
1705 p[iorder[2]] = ((CH) >> 16) & 0xff; \
1706 p[iorder[1]] = ((CH) >> 8) & 0xff; \
1707 p[iorder[0]] = (CH) & 0xff; \
1708 p += 4; \
1709 } while(0)
1711 /* In narrow builds we can output surrogate pairs as one codepoint,
1712 so we need less space. */
1713 #ifndef Py_UNICODE_WIDE
1714 for (i = pairs = 0; i < size-1; i++)
1715 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
1716 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
1717 pairs++;
1718 #endif
1719 v = PyString_FromStringAndSize(NULL,
1720 4 * (size - pairs + (byteorder == 0)));
1721 if (v == NULL)
1722 return NULL;
1724 p = (unsigned char *)PyString_AS_STRING(v);
1725 if (byteorder == 0)
1726 STORECHAR(0xFEFF);
1727 if (size == 0)
1728 return v;
1730 if (byteorder == -1) {
1731 /* force LE */
1732 iorder[0] = 0;
1733 iorder[1] = 1;
1734 iorder[2] = 2;
1735 iorder[3] = 3;
1737 else if (byteorder == 1) {
1738 /* force BE */
1739 iorder[0] = 3;
1740 iorder[1] = 2;
1741 iorder[2] = 1;
1742 iorder[3] = 0;
1745 while (size-- > 0) {
1746 Py_UCS4 ch = *s++;
1747 #ifndef Py_UNICODE_WIDE
1748 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
1749 Py_UCS4 ch2 = *s;
1750 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1751 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1752 s++;
1753 size--;
1756 #endif
1757 STORECHAR(ch);
1759 return v;
1760 #undef STORECHAR
1763 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
1765 if (!PyUnicode_Check(unicode)) {
1766 PyErr_BadArgument();
1767 return NULL;
1769 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
1770 PyUnicode_GET_SIZE(unicode),
1771 NULL,
1775 /* --- UTF-16 Codec ------------------------------------------------------- */
1777 PyObject *
1778 PyUnicode_DecodeUTF16(const char *s,
1779 Py_ssize_t size,
1780 const char *errors,
1781 int *byteorder)
1783 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1786 PyObject *
1787 PyUnicode_DecodeUTF16Stateful(const char *s,
1788 Py_ssize_t size,
1789 const char *errors,
1790 int *byteorder,
1791 Py_ssize_t *consumed)
1793 const char *starts = s;
1794 Py_ssize_t startinpos;
1795 Py_ssize_t endinpos;
1796 Py_ssize_t outpos;
1797 PyUnicodeObject *unicode;
1798 Py_UNICODE *p;
1799 const unsigned char *q, *e;
1800 int bo = 0; /* assume native ordering by default */
1801 const char *errmsg = "";
1802 /* Offsets from q for retrieving byte pairs in the right order. */
1803 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1804 int ihi = 1, ilo = 0;
1805 #else
1806 int ihi = 0, ilo = 1;
1807 #endif
1808 PyObject *errorHandler = NULL;
1809 PyObject *exc = NULL;
1811 /* Note: size will always be longer than the resulting Unicode
1812 character count */
1813 unicode = _PyUnicode_New(size);
1814 if (!unicode)
1815 return NULL;
1816 if (size == 0)
1817 return (PyObject *)unicode;
1819 /* Unpack UTF-16 encoded data */
1820 p = unicode->str;
1821 q = (unsigned char *)s;
1822 e = q + size;
1824 if (byteorder)
1825 bo = *byteorder;
1827 /* Check for BOM marks (U+FEFF) in the input and adjust current
1828 byte order setting accordingly. In native mode, the leading BOM
1829 mark is skipped, in all other modes, it is copied to the output
1830 stream as-is (giving a ZWNBSP character). */
1831 if (bo == 0) {
1832 if (size >= 2) {
1833 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1834 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1835 if (bom == 0xFEFF) {
1836 q += 2;
1837 bo = -1;
1839 else if (bom == 0xFFFE) {
1840 q += 2;
1841 bo = 1;
1843 #else
1844 if (bom == 0xFEFF) {
1845 q += 2;
1846 bo = 1;
1848 else if (bom == 0xFFFE) {
1849 q += 2;
1850 bo = -1;
1852 #endif
1856 if (bo == -1) {
1857 /* force LE */
1858 ihi = 1;
1859 ilo = 0;
1861 else if (bo == 1) {
1862 /* force BE */
1863 ihi = 0;
1864 ilo = 1;
1867 while (q < e) {
1868 Py_UNICODE ch;
1869 /* remaining bytes at the end? (size should be even) */
1870 if (e-q<2) {
1871 if (consumed)
1872 break;
1873 errmsg = "truncated data";
1874 startinpos = ((const char *)q)-starts;
1875 endinpos = ((const char *)e)-starts;
1876 goto utf16Error;
1877 /* The remaining input chars are ignored if the callback
1878 chooses to skip the input */
1880 ch = (q[ihi] << 8) | q[ilo];
1882 q += 2;
1884 if (ch < 0xD800 || ch > 0xDFFF) {
1885 *p++ = ch;
1886 continue;
1889 /* UTF-16 code pair: */
1890 if (q >= e) {
1891 errmsg = "unexpected end of data";
1892 startinpos = (((const char *)q)-2)-starts;
1893 endinpos = ((const char *)e)-starts;
1894 goto utf16Error;
1896 if (0xD800 <= ch && ch <= 0xDBFF) {
1897 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1898 q += 2;
1899 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1900 #ifndef Py_UNICODE_WIDE
1901 *p++ = ch;
1902 *p++ = ch2;
1903 #else
1904 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1905 #endif
1906 continue;
1908 else {
1909 errmsg = "illegal UTF-16 surrogate";
1910 startinpos = (((const char *)q)-4)-starts;
1911 endinpos = startinpos+2;
1912 goto utf16Error;
1916 errmsg = "illegal encoding";
1917 startinpos = (((const char *)q)-2)-starts;
1918 endinpos = startinpos+2;
1919 /* Fall through to report the error */
1921 utf16Error:
1922 outpos = p-PyUnicode_AS_UNICODE(unicode);
1923 if (unicode_decode_call_errorhandler(
1924 errors, &errorHandler,
1925 "utf16", errmsg,
1926 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1927 (PyObject **)&unicode, &outpos, &p))
1928 goto onError;
1931 if (byteorder)
1932 *byteorder = bo;
1934 if (consumed)
1935 *consumed = (const char *)q-starts;
1937 /* Adjust length */
1938 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
1939 goto onError;
1941 Py_XDECREF(errorHandler);
1942 Py_XDECREF(exc);
1943 return (PyObject *)unicode;
1945 onError:
1946 Py_DECREF(unicode);
1947 Py_XDECREF(errorHandler);
1948 Py_XDECREF(exc);
1949 return NULL;
1952 PyObject *
1953 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1954 Py_ssize_t size,
1955 const char *errors,
1956 int byteorder)
1958 PyObject *v;
1959 unsigned char *p;
1960 #ifdef Py_UNICODE_WIDE
1961 int i, pairs;
1962 #else
1963 const int pairs = 0;
1964 #endif
1965 /* Offsets from p for storing byte pairs in the right order. */
1966 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1967 int ihi = 1, ilo = 0;
1968 #else
1969 int ihi = 0, ilo = 1;
1970 #endif
1972 #define STORECHAR(CH) \
1973 do { \
1974 p[ihi] = ((CH) >> 8) & 0xff; \
1975 p[ilo] = (CH) & 0xff; \
1976 p += 2; \
1977 } while(0)
1979 #ifdef Py_UNICODE_WIDE
1980 for (i = pairs = 0; i < size; i++)
1981 if (s[i] >= 0x10000)
1982 pairs++;
1983 #endif
1984 v = PyString_FromStringAndSize(NULL,
1985 2 * (size + pairs + (byteorder == 0)));
1986 if (v == NULL)
1987 return NULL;
1989 p = (unsigned char *)PyString_AS_STRING(v);
1990 if (byteorder == 0)
1991 STORECHAR(0xFEFF);
1992 if (size == 0)
1993 return v;
1995 if (byteorder == -1) {
1996 /* force LE */
1997 ihi = 1;
1998 ilo = 0;
2000 else if (byteorder == 1) {
2001 /* force BE */
2002 ihi = 0;
2003 ilo = 1;
2006 while (size-- > 0) {
2007 Py_UNICODE ch = *s++;
2008 Py_UNICODE ch2 = 0;
2009 #ifdef Py_UNICODE_WIDE
2010 if (ch >= 0x10000) {
2011 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2012 ch = 0xD800 | ((ch-0x10000) >> 10);
2014 #endif
2015 STORECHAR(ch);
2016 if (ch2)
2017 STORECHAR(ch2);
2019 return v;
2020 #undef STORECHAR
2023 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2025 if (!PyUnicode_Check(unicode)) {
2026 PyErr_BadArgument();
2027 return NULL;
2029 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2030 PyUnicode_GET_SIZE(unicode),
2031 NULL,
2035 /* --- Unicode Escape Codec ----------------------------------------------- */
2037 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2039 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2040 Py_ssize_t size,
2041 const char *errors)
2043 const char *starts = s;
2044 Py_ssize_t startinpos;
2045 Py_ssize_t endinpos;
2046 Py_ssize_t outpos;
2047 int i;
2048 PyUnicodeObject *v;
2049 Py_UNICODE *p;
2050 const char *end;
2051 char* message;
2052 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2053 PyObject *errorHandler = NULL;
2054 PyObject *exc = NULL;
2056 /* Escaped strings will always be longer than the resulting
2057 Unicode string, so we start with size here and then reduce the
2058 length after conversion to the true value.
2059 (but if the error callback returns a long replacement string
2060 we'll have to allocate more space) */
2061 v = _PyUnicode_New(size);
2062 if (v == NULL)
2063 goto onError;
2064 if (size == 0)
2065 return (PyObject *)v;
2067 p = PyUnicode_AS_UNICODE(v);
2068 end = s + size;
2070 while (s < end) {
2071 unsigned char c;
2072 Py_UNICODE x;
2073 int digits;
2075 /* Non-escape characters are interpreted as Unicode ordinals */
2076 if (*s != '\\') {
2077 *p++ = (unsigned char) *s++;
2078 continue;
2081 startinpos = s-starts;
2082 /* \ - Escapes */
2083 s++;
2084 switch (*s++) {
2086 /* \x escapes */
2087 case '\n': break;
2088 case '\\': *p++ = '\\'; break;
2089 case '\'': *p++ = '\''; break;
2090 case '\"': *p++ = '\"'; break;
2091 case 'b': *p++ = '\b'; break;
2092 case 'f': *p++ = '\014'; break; /* FF */
2093 case 't': *p++ = '\t'; break;
2094 case 'n': *p++ = '\n'; break;
2095 case 'r': *p++ = '\r'; break;
2096 case 'v': *p++ = '\013'; break; /* VT */
2097 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2099 /* \OOO (octal) escapes */
2100 case '0': case '1': case '2': case '3':
2101 case '4': case '5': case '6': case '7':
2102 x = s[-1] - '0';
2103 if ('0' <= *s && *s <= '7') {
2104 x = (x<<3) + *s++ - '0';
2105 if ('0' <= *s && *s <= '7')
2106 x = (x<<3) + *s++ - '0';
2108 *p++ = x;
2109 break;
2111 /* hex escapes */
2112 /* \xXX */
2113 case 'x':
2114 digits = 2;
2115 message = "truncated \\xXX escape";
2116 goto hexescape;
2118 /* \uXXXX */
2119 case 'u':
2120 digits = 4;
2121 message = "truncated \\uXXXX escape";
2122 goto hexescape;
2124 /* \UXXXXXXXX */
2125 case 'U':
2126 digits = 8;
2127 message = "truncated \\UXXXXXXXX escape";
2128 hexescape:
2129 chr = 0;
2130 outpos = p-PyUnicode_AS_UNICODE(v);
2131 if (s+digits>end) {
2132 endinpos = size;
2133 if (unicode_decode_call_errorhandler(
2134 errors, &errorHandler,
2135 "unicodeescape", "end of string in escape sequence",
2136 starts, size, &startinpos, &endinpos, &exc, &s,
2137 (PyObject **)&v, &outpos, &p))
2138 goto onError;
2139 goto nextByte;
2141 for (i = 0; i < digits; ++i) {
2142 c = (unsigned char) s[i];
2143 if (!isxdigit(c)) {
2144 endinpos = (s+i+1)-starts;
2145 if (unicode_decode_call_errorhandler(
2146 errors, &errorHandler,
2147 "unicodeescape", message,
2148 starts, size, &startinpos, &endinpos, &exc, &s,
2149 (PyObject **)&v, &outpos, &p))
2150 goto onError;
2151 goto nextByte;
2153 chr = (chr<<4) & ~0xF;
2154 if (c >= '0' && c <= '9')
2155 chr += c - '0';
2156 else if (c >= 'a' && c <= 'f')
2157 chr += 10 + c - 'a';
2158 else
2159 chr += 10 + c - 'A';
2161 s += i;
2162 if (chr == 0xffffffff && PyErr_Occurred())
2163 /* _decoding_error will have already written into the
2164 target buffer. */
2165 break;
2166 store:
2167 /* when we get here, chr is a 32-bit unicode character */
2168 if (chr <= 0xffff)
2169 /* UCS-2 character */
2170 *p++ = (Py_UNICODE) chr;
2171 else if (chr <= 0x10ffff) {
2172 /* UCS-4 character. Either store directly, or as
2173 surrogate pair. */
2174 #ifdef Py_UNICODE_WIDE
2175 *p++ = chr;
2176 #else
2177 chr -= 0x10000L;
2178 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2179 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2180 #endif
2181 } else {
2182 endinpos = s-starts;
2183 outpos = p-PyUnicode_AS_UNICODE(v);
2184 if (unicode_decode_call_errorhandler(
2185 errors, &errorHandler,
2186 "unicodeescape", "illegal Unicode character",
2187 starts, size, &startinpos, &endinpos, &exc, &s,
2188 (PyObject **)&v, &outpos, &p))
2189 goto onError;
2191 break;
2193 /* \N{name} */
2194 case 'N':
2195 message = "malformed \\N character escape";
2196 if (ucnhash_CAPI == NULL) {
2197 /* load the unicode data module */
2198 PyObject *m, *api;
2199 m = PyImport_ImportModule("unicodedata");
2200 if (m == NULL)
2201 goto ucnhashError;
2202 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
2203 Py_DECREF(m);
2204 if (api == NULL)
2205 goto ucnhashError;
2206 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
2207 Py_DECREF(api);
2208 if (ucnhash_CAPI == NULL)
2209 goto ucnhashError;
2211 if (*s == '{') {
2212 const char *start = s+1;
2213 /* look for the closing brace */
2214 while (*s != '}' && s < end)
2215 s++;
2216 if (s > start && s < end && *s == '}') {
2217 /* found a name. look it up in the unicode database */
2218 message = "unknown Unicode character name";
2219 s++;
2220 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2221 goto store;
2224 endinpos = s-starts;
2225 outpos = p-PyUnicode_AS_UNICODE(v);
2226 if (unicode_decode_call_errorhandler(
2227 errors, &errorHandler,
2228 "unicodeescape", message,
2229 starts, size, &startinpos, &endinpos, &exc, &s,
2230 (PyObject **)&v, &outpos, &p))
2231 goto onError;
2232 break;
2234 default:
2235 if (s > end) {
2236 message = "\\ at end of string";
2237 s--;
2238 endinpos = s-starts;
2239 outpos = p-PyUnicode_AS_UNICODE(v);
2240 if (unicode_decode_call_errorhandler(
2241 errors, &errorHandler,
2242 "unicodeescape", message,
2243 starts, size, &startinpos, &endinpos, &exc, &s,
2244 (PyObject **)&v, &outpos, &p))
2245 goto onError;
2247 else {
2248 *p++ = '\\';
2249 *p++ = (unsigned char)s[-1];
2251 break;
2253 nextByte:
2256 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2257 goto onError;
2258 Py_XDECREF(errorHandler);
2259 Py_XDECREF(exc);
2260 return (PyObject *)v;
2262 ucnhashError:
2263 PyErr_SetString(
2264 PyExc_UnicodeError,
2265 "\\N escapes not supported (can't load unicodedata module)"
2267 Py_XDECREF(v);
2268 Py_XDECREF(errorHandler);
2269 Py_XDECREF(exc);
2270 return NULL;
2272 onError:
2273 Py_XDECREF(v);
2274 Py_XDECREF(errorHandler);
2275 Py_XDECREF(exc);
2276 return NULL;
2279 /* Return a Unicode-Escape string version of the Unicode object.
2281 If quotes is true, the string is enclosed in u"" or u'' quotes as
2282 appropriate.
2286 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2287 Py_ssize_t size,
2288 Py_UNICODE ch)
2290 /* like wcschr, but doesn't stop at NULL characters */
2292 while (size-- > 0) {
2293 if (*s == ch)
2294 return s;
2295 s++;
2298 return NULL;
2301 static
2302 PyObject *unicodeescape_string(const Py_UNICODE *s,
2303 Py_ssize_t size,
2304 int quotes)
2306 PyObject *repr;
2307 char *p;
2309 static const char *hexdigit = "0123456789abcdef";
2311 /* XXX(nnorwitz): rather than over-allocating, it would be
2312 better to choose a different scheme. Perhaps scan the
2313 first N-chars of the string and allocate based on that size.
2315 /* Initial allocation is based on the longest-possible unichr
2316 escape.
2318 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2319 unichr, so in this case it's the longest unichr escape. In
2320 narrow (UTF-16) builds this is five chars per source unichr
2321 since there are two unichrs in the surrogate pair, so in narrow
2322 (UTF-16) builds it's not the longest unichr escape.
2324 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2325 so in the narrow (UTF-16) build case it's the longest unichr
2326 escape.
2329 repr = PyString_FromStringAndSize(NULL,
2331 #ifdef Py_UNICODE_WIDE
2332 + 10*size
2333 #else
2334 + 6*size
2335 #endif
2336 + 1);
2337 if (repr == NULL)
2338 return NULL;
2340 p = PyString_AS_STRING(repr);
2342 if (quotes) {
2343 *p++ = 'u';
2344 *p++ = (findchar(s, size, '\'') &&
2345 !findchar(s, size, '"')) ? '"' : '\'';
2347 while (size-- > 0) {
2348 Py_UNICODE ch = *s++;
2350 /* Escape quotes and backslashes */
2351 if ((quotes &&
2352 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
2353 *p++ = '\\';
2354 *p++ = (char) ch;
2355 continue;
2358 #ifdef Py_UNICODE_WIDE
2359 /* Map 21-bit characters to '\U00xxxxxx' */
2360 else if (ch >= 0x10000) {
2361 *p++ = '\\';
2362 *p++ = 'U';
2363 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2364 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2365 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2366 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2367 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2368 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2369 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
2370 *p++ = hexdigit[ch & 0x0000000F];
2371 continue;
2373 #else
2374 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2375 else if (ch >= 0xD800 && ch < 0xDC00) {
2376 Py_UNICODE ch2;
2377 Py_UCS4 ucs;
2379 ch2 = *s++;
2380 size--;
2381 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2382 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2383 *p++ = '\\';
2384 *p++ = 'U';
2385 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2386 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2387 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2388 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2389 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2390 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2391 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2392 *p++ = hexdigit[ucs & 0x0000000F];
2393 continue;
2395 /* Fall through: isolated surrogates are copied as-is */
2396 s--;
2397 size++;
2399 #endif
2401 /* Map 16-bit characters to '\uxxxx' */
2402 if (ch >= 256) {
2403 *p++ = '\\';
2404 *p++ = 'u';
2405 *p++ = hexdigit[(ch >> 12) & 0x000F];
2406 *p++ = hexdigit[(ch >> 8) & 0x000F];
2407 *p++ = hexdigit[(ch >> 4) & 0x000F];
2408 *p++ = hexdigit[ch & 0x000F];
2411 /* Map special whitespace to '\t', \n', '\r' */
2412 else if (ch == '\t') {
2413 *p++ = '\\';
2414 *p++ = 't';
2416 else if (ch == '\n') {
2417 *p++ = '\\';
2418 *p++ = 'n';
2420 else if (ch == '\r') {
2421 *p++ = '\\';
2422 *p++ = 'r';
2425 /* Map non-printable US ASCII to '\xhh' */
2426 else if (ch < ' ' || ch >= 0x7F) {
2427 *p++ = '\\';
2428 *p++ = 'x';
2429 *p++ = hexdigit[(ch >> 4) & 0x000F];
2430 *p++ = hexdigit[ch & 0x000F];
2433 /* Copy everything else as-is */
2434 else
2435 *p++ = (char) ch;
2437 if (quotes)
2438 *p++ = PyString_AS_STRING(repr)[1];
2440 *p = '\0';
2441 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
2442 return repr;
2445 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2446 Py_ssize_t size)
2448 return unicodeescape_string(s, size, 0);
2451 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2453 if (!PyUnicode_Check(unicode)) {
2454 PyErr_BadArgument();
2455 return NULL;
2457 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2458 PyUnicode_GET_SIZE(unicode));
2461 /* --- Raw Unicode Escape Codec ------------------------------------------- */
2463 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2464 Py_ssize_t size,
2465 const char *errors)
2467 const char *starts = s;
2468 Py_ssize_t startinpos;
2469 Py_ssize_t endinpos;
2470 Py_ssize_t outpos;
2471 PyUnicodeObject *v;
2472 Py_UNICODE *p;
2473 const char *end;
2474 const char *bs;
2475 PyObject *errorHandler = NULL;
2476 PyObject *exc = NULL;
2478 /* Escaped strings will always be longer than the resulting
2479 Unicode string, so we start with size here and then reduce the
2480 length after conversion to the true value. (But decoding error
2481 handler might have to resize the string) */
2482 v = _PyUnicode_New(size);
2483 if (v == NULL)
2484 goto onError;
2485 if (size == 0)
2486 return (PyObject *)v;
2487 p = PyUnicode_AS_UNICODE(v);
2488 end = s + size;
2489 while (s < end) {
2490 unsigned char c;
2491 Py_UCS4 x;
2492 int i;
2493 int count;
2495 /* Non-escape characters are interpreted as Unicode ordinals */
2496 if (*s != '\\') {
2497 *p++ = (unsigned char)*s++;
2498 continue;
2500 startinpos = s-starts;
2502 /* \u-escapes are only interpreted iff the number of leading
2503 backslashes if odd */
2504 bs = s;
2505 for (;s < end;) {
2506 if (*s != '\\')
2507 break;
2508 *p++ = (unsigned char)*s++;
2510 if (((s - bs) & 1) == 0 ||
2511 s >= end ||
2512 (*s != 'u' && *s != 'U')) {
2513 continue;
2515 p--;
2516 count = *s=='u' ? 4 : 8;
2517 s++;
2519 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2520 outpos = p-PyUnicode_AS_UNICODE(v);
2521 for (x = 0, i = 0; i < count; ++i, ++s) {
2522 c = (unsigned char)*s;
2523 if (!isxdigit(c)) {
2524 endinpos = s-starts;
2525 if (unicode_decode_call_errorhandler(
2526 errors, &errorHandler,
2527 "rawunicodeescape", "truncated \\uXXXX",
2528 starts, size, &startinpos, &endinpos, &exc, &s,
2529 (PyObject **)&v, &outpos, &p))
2530 goto onError;
2531 goto nextByte;
2533 x = (x<<4) & ~0xF;
2534 if (c >= '0' && c <= '9')
2535 x += c - '0';
2536 else if (c >= 'a' && c <= 'f')
2537 x += 10 + c - 'a';
2538 else
2539 x += 10 + c - 'A';
2541 #ifndef Py_UNICODE_WIDE
2542 if (x > 0x10000) {
2543 if (unicode_decode_call_errorhandler(
2544 errors, &errorHandler,
2545 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2546 starts, size, &startinpos, &endinpos, &exc, &s,
2547 (PyObject **)&v, &outpos, &p))
2548 goto onError;
2550 #endif
2551 *p++ = x;
2552 nextByte:
2555 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2556 goto onError;
2557 Py_XDECREF(errorHandler);
2558 Py_XDECREF(exc);
2559 return (PyObject *)v;
2561 onError:
2562 Py_XDECREF(v);
2563 Py_XDECREF(errorHandler);
2564 Py_XDECREF(exc);
2565 return NULL;
2568 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2569 Py_ssize_t size)
2571 PyObject *repr;
2572 char *p;
2573 char *q;
2575 static const char *hexdigit = "0123456789abcdef";
2577 #ifdef Py_UNICODE_WIDE
2578 repr = PyString_FromStringAndSize(NULL, 10 * size);
2579 #else
2580 repr = PyString_FromStringAndSize(NULL, 6 * size);
2581 #endif
2582 if (repr == NULL)
2583 return NULL;
2584 if (size == 0)
2585 return repr;
2587 p = q = PyString_AS_STRING(repr);
2588 while (size-- > 0) {
2589 Py_UNICODE ch = *s++;
2590 #ifdef Py_UNICODE_WIDE
2591 /* Map 32-bit characters to '\Uxxxxxxxx' */
2592 if (ch >= 0x10000) {
2593 *p++ = '\\';
2594 *p++ = 'U';
2595 *p++ = hexdigit[(ch >> 28) & 0xf];
2596 *p++ = hexdigit[(ch >> 24) & 0xf];
2597 *p++ = hexdigit[(ch >> 20) & 0xf];
2598 *p++ = hexdigit[(ch >> 16) & 0xf];
2599 *p++ = hexdigit[(ch >> 12) & 0xf];
2600 *p++ = hexdigit[(ch >> 8) & 0xf];
2601 *p++ = hexdigit[(ch >> 4) & 0xf];
2602 *p++ = hexdigit[ch & 15];
2604 else
2605 #endif
2606 /* Map 16-bit characters to '\uxxxx' */
2607 if (ch >= 256) {
2608 *p++ = '\\';
2609 *p++ = 'u';
2610 *p++ = hexdigit[(ch >> 12) & 0xf];
2611 *p++ = hexdigit[(ch >> 8) & 0xf];
2612 *p++ = hexdigit[(ch >> 4) & 0xf];
2613 *p++ = hexdigit[ch & 15];
2615 /* Copy everything else as-is */
2616 else
2617 *p++ = (char) ch;
2619 *p = '\0';
2620 _PyString_Resize(&repr, p - q);
2621 return repr;
2624 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2626 if (!PyUnicode_Check(unicode)) {
2627 PyErr_BadArgument();
2628 return NULL;
2630 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2631 PyUnicode_GET_SIZE(unicode));
2634 /* --- Unicode Internal Codec ------------------------------------------- */
2636 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2637 Py_ssize_t size,
2638 const char *errors)
2640 const char *starts = s;
2641 Py_ssize_t startinpos;
2642 Py_ssize_t endinpos;
2643 Py_ssize_t outpos;
2644 PyUnicodeObject *v;
2645 Py_UNICODE *p;
2646 const char *end;
2647 const char *reason;
2648 PyObject *errorHandler = NULL;
2649 PyObject *exc = NULL;
2651 #ifdef Py_UNICODE_WIDE
2652 Py_UNICODE unimax = PyUnicode_GetMax();
2653 #endif
2655 /* XXX overflow detection missing */
2656 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2657 if (v == NULL)
2658 goto onError;
2659 if (PyUnicode_GetSize((PyObject *)v) == 0)
2660 return (PyObject *)v;
2661 p = PyUnicode_AS_UNICODE(v);
2662 end = s + size;
2664 while (s < end) {
2665 memcpy(p, s, sizeof(Py_UNICODE));
2666 /* We have to sanity check the raw data, otherwise doom looms for
2667 some malformed UCS-4 data. */
2668 if (
2669 #ifdef Py_UNICODE_WIDE
2670 *p > unimax || *p < 0 ||
2671 #endif
2672 end-s < Py_UNICODE_SIZE
2675 startinpos = s - starts;
2676 if (end-s < Py_UNICODE_SIZE) {
2677 endinpos = end-starts;
2678 reason = "truncated input";
2680 else {
2681 endinpos = s - starts + Py_UNICODE_SIZE;
2682 reason = "illegal code point (> 0x10FFFF)";
2684 outpos = p - PyUnicode_AS_UNICODE(v);
2685 if (unicode_decode_call_errorhandler(
2686 errors, &errorHandler,
2687 "unicode_internal", reason,
2688 starts, size, &startinpos, &endinpos, &exc, &s,
2689 (PyObject **)&v, &outpos, &p)) {
2690 goto onError;
2693 else {
2694 p++;
2695 s += Py_UNICODE_SIZE;
2699 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2700 goto onError;
2701 Py_XDECREF(errorHandler);
2702 Py_XDECREF(exc);
2703 return (PyObject *)v;
2705 onError:
2706 Py_XDECREF(v);
2707 Py_XDECREF(errorHandler);
2708 Py_XDECREF(exc);
2709 return NULL;
2712 /* --- Latin-1 Codec ------------------------------------------------------ */
2714 PyObject *PyUnicode_DecodeLatin1(const char *s,
2715 Py_ssize_t size,
2716 const char *errors)
2718 PyUnicodeObject *v;
2719 Py_UNICODE *p;
2721 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2722 if (size == 1) {
2723 Py_UNICODE r = *(unsigned char*)s;
2724 return PyUnicode_FromUnicode(&r, 1);
2727 v = _PyUnicode_New(size);
2728 if (v == NULL)
2729 goto onError;
2730 if (size == 0)
2731 return (PyObject *)v;
2732 p = PyUnicode_AS_UNICODE(v);
2733 while (size-- > 0)
2734 *p++ = (unsigned char)*s++;
2735 return (PyObject *)v;
2737 onError:
2738 Py_XDECREF(v);
2739 return NULL;
2742 /* create or adjust a UnicodeEncodeError */
2743 static void make_encode_exception(PyObject **exceptionObject,
2744 const char *encoding,
2745 const Py_UNICODE *unicode, Py_ssize_t size,
2746 Py_ssize_t startpos, Py_ssize_t endpos,
2747 const char *reason)
2749 if (*exceptionObject == NULL) {
2750 *exceptionObject = PyUnicodeEncodeError_Create(
2751 encoding, unicode, size, startpos, endpos, reason);
2753 else {
2754 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2755 goto onError;
2756 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2757 goto onError;
2758 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2759 goto onError;
2760 return;
2761 onError:
2762 Py_DECREF(*exceptionObject);
2763 *exceptionObject = NULL;
2767 /* raises a UnicodeEncodeError */
2768 static void raise_encode_exception(PyObject **exceptionObject,
2769 const char *encoding,
2770 const Py_UNICODE *unicode, Py_ssize_t size,
2771 Py_ssize_t startpos, Py_ssize_t endpos,
2772 const char *reason)
2774 make_encode_exception(exceptionObject,
2775 encoding, unicode, size, startpos, endpos, reason);
2776 if (*exceptionObject != NULL)
2777 PyCodec_StrictErrors(*exceptionObject);
2780 /* error handling callback helper:
2781 build arguments, call the callback and check the arguments,
2782 put the result into newpos and return the replacement string, which
2783 has to be freed by the caller */
2784 static PyObject *unicode_encode_call_errorhandler(const char *errors,
2785 PyObject **errorHandler,
2786 const char *encoding, const char *reason,
2787 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
2788 Py_ssize_t startpos, Py_ssize_t endpos,
2789 Py_ssize_t *newpos)
2791 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
2793 PyObject *restuple;
2794 PyObject *resunicode;
2796 if (*errorHandler == NULL) {
2797 *errorHandler = PyCodec_LookupError(errors);
2798 if (*errorHandler == NULL)
2799 return NULL;
2802 make_encode_exception(exceptionObject,
2803 encoding, unicode, size, startpos, endpos, reason);
2804 if (*exceptionObject == NULL)
2805 return NULL;
2807 restuple = PyObject_CallFunctionObjArgs(
2808 *errorHandler, *exceptionObject, NULL);
2809 if (restuple == NULL)
2810 return NULL;
2811 if (!PyTuple_Check(restuple)) {
2812 PyErr_Format(PyExc_TypeError, &argparse[4]);
2813 Py_DECREF(restuple);
2814 return NULL;
2816 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2817 &resunicode, newpos)) {
2818 Py_DECREF(restuple);
2819 return NULL;
2821 if (*newpos<0)
2822 *newpos = size+*newpos;
2823 if (*newpos<0 || *newpos>size) {
2824 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
2825 Py_DECREF(restuple);
2826 return NULL;
2828 Py_INCREF(resunicode);
2829 Py_DECREF(restuple);
2830 return resunicode;
/* Shared encoder for the Latin-1 and ASCII codecs.
   Code units below `limit` are copied as single bytes.  A run of code
   units >= limit is handled according to `errors`: "strict" (or NULL)
   raises, "replace" writes one '?' per code unit, "ignore" drops the
   run, "xmlcharrefreplace" writes "&#N;" references, and any other name
   is dispatched through unicode_encode_call_errorhandler().
   limit == 256 selects the "latin-1" naming; any other value uses the
   "ascii" naming for error reporting.
   Returns a new string object, or NULL with an exception set. */
2833 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2834 Py_ssize_t size,
2835 const char *errors,
2836 int limit)
2838 /* output object */
2839 PyObject *res;
2840 /* pointers to the beginning and end+1 of input */
2841 const Py_UNICODE *startp = p;
2842 const Py_UNICODE *endp = p + size;
2843 /* pointer to the beginning of the unencodable characters */
2844 /* const Py_UNICODE *badp = NULL; */
2845 /* pointer into the output */
2846 char *str;
2847 /* current output position */
2848 Py_ssize_t respos = 0;
2849 Py_ssize_t ressize;
2850 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
2851 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2852 PyObject *errorHandler = NULL;
2853 PyObject *exc = NULL;
2854 /* the following variable is used for caching string comparisons
2855 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2856 int known_errorHandler = -1;
2858 /* allocate enough for a simple encoding without
2859 replacements, if we need more, we'll resize */
2860 res = PyString_FromStringAndSize(NULL, size);
2861 if (res == NULL)
2862 goto onError;
2863 if (size == 0)
2864 return res;
2865 str = PyString_AS_STRING(res);
2866 ressize = size;
2868 while (p<endp) {
2869 Py_UNICODE c = *p;
2871 /* can we encode this? */
2872 if (c<limit) {
2873 /* no overflow check, because we know that the space is enough */
2874 *str++ = (char)c;
2875 ++p;
2877 else {
2878 Py_ssize_t unicodepos = p-startp;
2879 Py_ssize_t requiredsize;
2880 PyObject *repunicode;
2881 Py_ssize_t repsize;
2882 Py_ssize_t newpos;
/* NOTE: this declaration shadows the function-level respos for the
   rest of the error branch. */
2883 Py_ssize_t respos;
2884 Py_UNICODE *uni2;
2885 /* startpos for collecting unencodable chars */
2886 const Py_UNICODE *collstart = p;
2887 const Py_UNICODE *collend = p;
2888 /* find all unencodable characters */
2889 while ((collend < endp) && ((*collend)>=limit))
2890 ++collend;
2891 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2892 if (known_errorHandler==-1) {
2893 if ((errors==NULL) || (!strcmp(errors, "strict")))
2894 known_errorHandler = 1;
2895 else if (!strcmp(errors, "replace"))
2896 known_errorHandler = 2;
2897 else if (!strcmp(errors, "ignore"))
2898 known_errorHandler = 3;
2899 else if (!strcmp(errors, "xmlcharrefreplace"))
2900 known_errorHandler = 4;
2901 else
2902 known_errorHandler = 0;
2904 switch (known_errorHandler) {
2905 case 1: /* strict */
2906 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2907 goto onError;
2908 case 2: /* replace */
2909 while (collstart++<collend)
2910 *str++ = '?'; /* fall through */
2911 case 3: /* ignore */
2912 p = collend;
2913 break;
2914 case 4: /* xmlcharrefreplace */
2915 respos = str-PyString_AS_STRING(res);
2916 /* determine replacement size (temporarily (mis)uses p) */
/* each reference is "&#" + decimal digits + ";" */
2917 for (p = collstart, repsize = 0; p < collend; ++p) {
2918 if (*p<10)
2919 repsize += 2+1+1;
2920 else if (*p<100)
2921 repsize += 2+2+1;
2922 else if (*p<1000)
2923 repsize += 2+3+1;
2924 else if (*p<10000)
2925 repsize += 2+4+1;
2926 #ifndef Py_UNICODE_WIDE
2927 else
2928 repsize += 2+5+1;
2929 #else
2930 else if (*p<100000)
2931 repsize += 2+5+1;
2932 else if (*p<1000000)
2933 repsize += 2+6+1;
2934 else
2935 repsize += 2+7+1;
2936 #endif
/* room for what is already written + the references + one byte for
   every remaining input code unit; grow at least geometrically */
2938 requiredsize = respos+repsize+(endp-collend);
2939 if (requiredsize > ressize) {
2940 if (requiredsize<2*ressize)
2941 requiredsize = 2*ressize;
2942 if (_PyString_Resize(&res, requiredsize))
2943 goto onError;
2944 str = PyString_AS_STRING(res) + respos;
2945 ressize = requiredsize;
2947 /* generate replacement (temporarily (mis)uses p) */
2948 for (p = collstart; p < collend; ++p) {
2949 str += sprintf(str, "&#%d;", (int)*p);
2951 p = collend;
2952 break;
2953 default:
2954 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2955 encoding, reason, startp, size, &exc,
2956 collstart-startp, collend-startp, &newpos);
2957 if (repunicode == NULL)
2958 goto onError;
2959 /* need more space? (at least enough for what we
2960 have+the replacement+the rest of the string, so
2961 we won't have to check space for encodable characters) */
2962 respos = str-PyString_AS_STRING(res);
2963 repsize = PyUnicode_GET_SIZE(repunicode);
2964 requiredsize = respos+repsize+(endp-collend);
2965 if (requiredsize > ressize) {
2966 if (requiredsize<2*ressize)
2967 requiredsize = 2*ressize;
2968 if (_PyString_Resize(&res, requiredsize)) {
2969 Py_DECREF(repunicode);
2970 goto onError;
2972 str = PyString_AS_STRING(res) + respos;
2973 ressize = requiredsize;
2975 /* check if there is anything unencodable in the replacement
2976 and copy it to the output */
2977 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2978 c = *uni2;
2979 if (c >= limit) {
2980 raise_encode_exception(&exc, encoding, startp, size,
2981 unicodepos, unicodepos+1, reason);
2982 Py_DECREF(repunicode);
2983 goto onError;
2985 *str = (char)c;
/* resume at the input position chosen by the error handler */
2987 p = startp + newpos;
2988 Py_DECREF(repunicode);
2992 /* Resize if we allocated too much */
2993 respos = str-PyString_AS_STRING(res);
2994 if (respos<ressize)
2995 /* If this fails res will be NULL */
2996 _PyString_Resize(&res, respos);
2997 Py_XDECREF(errorHandler);
2998 Py_XDECREF(exc);
2999 return res;
3001 onError:
3002 Py_XDECREF(res);
3003 Py_XDECREF(errorHandler);
3004 Py_XDECREF(exc);
3005 return NULL;
3008 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3009 Py_ssize_t size,
3010 const char *errors)
3012 return unicode_encode_ucs1(p, size, errors, 256);
3015 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3017 if (!PyUnicode_Check(unicode)) {
3018 PyErr_BadArgument();
3019 return NULL;
3021 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3022 PyUnicode_GET_SIZE(unicode),
3023 NULL);
3026 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3028 PyObject *PyUnicode_DecodeASCII(const char *s,
3029 Py_ssize_t size,
3030 const char *errors)
3032 const char *starts = s;
3033 PyUnicodeObject *v;
3034 Py_UNICODE *p;
3035 Py_ssize_t startinpos;
3036 Py_ssize_t endinpos;
3037 Py_ssize_t outpos;
3038 const char *e;
3039 PyObject *errorHandler = NULL;
3040 PyObject *exc = NULL;
3042 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3043 if (size == 1 && *(unsigned char*)s < 128) {
3044 Py_UNICODE r = *(unsigned char*)s;
3045 return PyUnicode_FromUnicode(&r, 1);
3048 v = _PyUnicode_New(size);
3049 if (v == NULL)
3050 goto onError;
3051 if (size == 0)
3052 return (PyObject *)v;
3053 p = PyUnicode_AS_UNICODE(v);
3054 e = s + size;
3055 while (s < e) {
3056 register unsigned char c = (unsigned char)*s;
3057 if (c < 128) {
3058 *p++ = c;
3059 ++s;
3061 else {
3062 startinpos = s-starts;
3063 endinpos = startinpos + 1;
3064 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3065 if (unicode_decode_call_errorhandler(
3066 errors, &errorHandler,
3067 "ascii", "ordinal not in range(128)",
3068 starts, size, &startinpos, &endinpos, &exc, &s,
3069 (PyObject **)&v, &outpos, &p))
3070 goto onError;
3073 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3074 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3075 goto onError;
3076 Py_XDECREF(errorHandler);
3077 Py_XDECREF(exc);
3078 return (PyObject *)v;
3080 onError:
3081 Py_XDECREF(v);
3082 Py_XDECREF(errorHandler);
3083 Py_XDECREF(exc);
3084 return NULL;
3087 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3088 Py_ssize_t size,
3089 const char *errors)
3091 return unicode_encode_ucs1(p, size, errors, 128);
3094 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3096 if (!PyUnicode_Check(unicode)) {
3097 PyErr_BadArgument();
3098 return NULL;
3100 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3101 PyUnicode_GET_SIZE(unicode),
3102 NULL);
3105 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3107 /* --- MBCS codecs for Windows -------------------------------------------- */
3109 #if SIZEOF_INT < SIZEOF_SSIZE_T
3110 #define NEED_RETRY
3111 #endif
3113 /* XXX This code is limited to "true" double-byte encodings, as
3114 a) it assumes an incomplete character consists of a single byte, and
3115 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3116 encodings, see IsDBCSLeadByteEx documentation. */
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  A byte that IsDBCSLeadByte() accepts counts only
   if CharPrev() could not step back, or the previous character does not
   start with a lead byte, or the previous character is two bytes
   long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3130 * Decode MBCS string into unicode object. If 'final' is set, converts
3131 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3133 static int decode_mbcs(PyUnicodeObject **v,
3134 const char *s, /* MBCS string */
3135 int size, /* sizeof MBCS string */
3136 int final)
3138 Py_UNICODE *p;
3139 Py_ssize_t n = 0;
3140 int usize = 0;
3142 assert(size >= 0);
3144 /* Skip trailing lead-byte unless 'final' is set */
3145 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3146 --size;
3148 /* First get the size of the result */
3149 if (size > 0) {
3150 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3151 if (usize == 0) {
3152 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3153 return -1;
3157 if (*v == NULL) {
3158 /* Create unicode object */
3159 *v = _PyUnicode_New(usize);
3160 if (*v == NULL)
3161 return -1;
3163 else {
3164 /* Extend unicode object */
3165 n = PyUnicode_GET_SIZE(*v);
3166 if (_PyUnicode_Resize(v, n + usize) < 0)
3167 return -1;
3170 /* Do the conversion */
3171 if (size > 0) {
3172 p = PyUnicode_AS_UNICODE(*v) + n;
3173 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3174 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3175 return -1;
3179 return size;
3182 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3183 Py_ssize_t size,
3184 const char *errors,
3185 Py_ssize_t *consumed)
3187 PyUnicodeObject *v = NULL;
3188 int done;
3190 if (consumed)
3191 *consumed = 0;
3193 #ifdef NEED_RETRY
3194 retry:
3195 if (size > INT_MAX)
3196 done = decode_mbcs(&v, s, INT_MAX, 0);
3197 else
3198 #endif
3199 done = decode_mbcs(&v, s, (int)size, !consumed);
3201 if (done < 0) {
3202 Py_XDECREF(v);
3203 return NULL;
3206 if (consumed)
3207 *consumed += done;
3209 #ifdef NEED_RETRY
3210 if (size > INT_MAX) {
3211 s += done;
3212 size -= done;
3213 goto retry;
3215 #endif
3217 return (PyObject *)v;
3220 PyObject *PyUnicode_DecodeMBCS(const char *s,
3221 Py_ssize_t size,
3222 const char *errors)
3224 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3228 * Convert unicode into string object (MBCS).
3229 * Returns 0 if succeed, -1 otherwise.
3231 static int encode_mbcs(PyObject **repr,
3232 const Py_UNICODE *p, /* unicode */
3233 int size) /* size of unicode */
3235 int mbcssize = 0;
3236 Py_ssize_t n = 0;
3238 assert(size >= 0);
3240 /* First get the size of the result */
3241 if (size > 0) {
3242 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3243 if (mbcssize == 0) {
3244 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3245 return -1;
3249 if (*repr == NULL) {
3250 /* Create string object */
3251 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3252 if (*repr == NULL)
3253 return -1;
3255 else {
3256 /* Extend string object */
3257 n = PyString_Size(*repr);
3258 if (_PyString_Resize(repr, n + mbcssize) < 0)
3259 return -1;
3262 /* Do the conversion */
3263 if (size > 0) {
3264 char *s = PyString_AS_STRING(*repr) + n;
3265 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3266 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3267 return -1;
3271 return 0;
3274 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
3275 Py_ssize_t size,
3276 const char *errors)
3278 PyObject *repr = NULL;
3279 int ret;
3281 #ifdef NEED_RETRY
3282 retry:
3283 if (size > INT_MAX)
3284 ret = encode_mbcs(&repr, p, INT_MAX);
3285 else
3286 #endif
3287 ret = encode_mbcs(&repr, p, (int)size);
3289 if (ret < 0) {
3290 Py_XDECREF(repr);
3291 return NULL;
3294 #ifdef NEED_RETRY
3295 if (size > INT_MAX) {
3296 p += INT_MAX;
3297 size -= INT_MAX;
3298 goto retry;
3300 #endif
3302 return repr;
3305 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3307 if (!PyUnicode_Check(unicode)) {
3308 PyErr_BadArgument();
3309 return NULL;
3311 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3312 PyUnicode_GET_SIZE(unicode),
3313 NULL);
3316 #undef NEED_RETRY
3318 #endif /* MS_WINDOWS */
3320 /* --- Character Mapping Codec -------------------------------------------- */
/* Decode `size` bytes at `s` through a character mapping.
   mapping == NULL falls back to Latin-1.  If `mapping` is an exact
   unicode object it is used as a lookup string indexed by byte value,
   with 0xfffe (or an index past its end) meaning "undefined".
   Otherwise mapping[byte] is looked up and may yield an integer in
   range(65536), None (undefined), or a unicode string of any length.
   Undefined bytes go through the `errors` handler.
   Returns a new Unicode object, or NULL with an exception set. */
3322 PyObject *PyUnicode_DecodeCharmap(const char *s,
3323 Py_ssize_t size,
3324 PyObject *mapping,
3325 const char *errors)
3327 const char *starts = s;
3328 Py_ssize_t startinpos;
3329 Py_ssize_t endinpos;
3330 Py_ssize_t outpos;
3331 const char *e;
3332 PyUnicodeObject *v;
3333 Py_UNICODE *p;
/* spare output slots left over from earlier 1-n resizes */
3334 Py_ssize_t extrachars = 0;
3335 PyObject *errorHandler = NULL;
3336 PyObject *exc = NULL;
3337 Py_UNICODE *mapstring = NULL;
3338 Py_ssize_t maplen = 0;
3340 /* Default to Latin-1 */
3341 if (mapping == NULL)
3342 return PyUnicode_DecodeLatin1(s, size, errors);
3344 v = _PyUnicode_New(size);
3345 if (v == NULL)
3346 goto onError;
3347 if (size == 0)
3348 return (PyObject *)v;
3349 p = PyUnicode_AS_UNICODE(v);
3350 e = s + size;
/* fast path: the mapping is a plain unicode string used as a table */
3351 if (PyUnicode_CheckExact(mapping)) {
3352 mapstring = PyUnicode_AS_UNICODE(mapping);
3353 maplen = PyUnicode_GET_SIZE(mapping);
3354 while (s < e) {
3355 unsigned char ch = *s;
3356 Py_UNICODE x = 0xfffe; /* illegal value */
3358 if (ch < maplen)
3359 x = mapstring[ch];
3361 if (x == 0xfffe) {
3362 /* undefined mapping */
3363 outpos = p-PyUnicode_AS_UNICODE(v);
3364 startinpos = s-starts;
3365 endinpos = startinpos+1;
3366 if (unicode_decode_call_errorhandler(
3367 errors, &errorHandler,
3368 "charmap", "character maps to <undefined>",
3369 starts, size, &startinpos, &endinpos, &exc, &s,
3370 (PyObject **)&v, &outpos, &p)) {
3371 goto onError;
3373 continue;
3375 *p++ = x;
3376 ++s;
/* general path: the mapping is looked up with PyObject_GetItem() */
3379 else {
3380 while (s < e) {
3381 unsigned char ch = *s;
3382 PyObject *w, *x;
3384 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3385 w = PyInt_FromLong((long)ch);
3386 if (w == NULL)
3387 goto onError;
3388 x = PyObject_GetItem(mapping, w);
3389 Py_DECREF(w);
3390 if (x == NULL) {
3391 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3392 /* No mapping found means: mapping is undefined. */
3393 PyErr_Clear();
3394 x = Py_None;
3395 Py_INCREF(x);
3396 } else
3397 goto onError;
3400 /* Apply mapping */
3401 if (PyInt_Check(x)) {
3402 long value = PyInt_AS_LONG(x);
3403 if (value < 0 || value > 65535) {
3404 PyErr_SetString(PyExc_TypeError,
3405 "character mapping must be in range(65536)");
3406 Py_DECREF(x);
3407 goto onError;
3409 *p++ = (Py_UNICODE)value;
3411 else if (x == Py_None) {
3412 /* undefined mapping */
3413 outpos = p-PyUnicode_AS_UNICODE(v);
3414 startinpos = s-starts;
3415 endinpos = startinpos+1;
3416 if (unicode_decode_call_errorhandler(
3417 errors, &errorHandler,
3418 "charmap", "character maps to <undefined>",
3419 starts, size, &startinpos, &endinpos, &exc, &s,
3420 (PyObject **)&v, &outpos, &p)) {
3421 Py_DECREF(x);
3422 goto onError;
3424 Py_DECREF(x);
3425 continue;
3427 else if (PyUnicode_Check(x)) {
3428 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
3430 if (targetsize == 1)
3431 /* 1-1 mapping */
3432 *p++ = *PyUnicode_AS_UNICODE(x);
3434 else if (targetsize > 1) {
3435 /* 1-n mapping */
3436 if (targetsize > extrachars) {
3437 /* resize first */
/* p is re-derived from oldpos because the resize may move the buffer */
3438 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
3439 Py_ssize_t needed = (targetsize - extrachars) + \
3440 (targetsize << 2);
3441 extrachars += needed;
3442 /* XXX overflow detection missing */
3443 if (_PyUnicode_Resize(&v,
3444 PyUnicode_GET_SIZE(v) + needed) < 0) {
3445 Py_DECREF(x);
3446 goto onError;
3448 p = PyUnicode_AS_UNICODE(v) + oldpos;
3450 Py_UNICODE_COPY(p,
3451 PyUnicode_AS_UNICODE(x),
3452 targetsize);
3453 p += targetsize;
3454 extrachars -= targetsize;
3456 /* 1-0 mapping: skip the character */
3458 else {
3459 /* wrong return value */
3460 PyErr_SetString(PyExc_TypeError,
3461 "character mapping must return integer, None or unicode");
3462 Py_DECREF(x);
3463 goto onError;
3465 Py_DECREF(x);
3466 ++s;
/* shrink to the number of code units actually written */
3469 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
3470 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3471 goto onError;
3472 Py_XDECREF(errorHandler);
3473 Py_XDECREF(exc);
3474 return (PyObject *)v;
3476 onError:
3477 Py_XDECREF(errorHandler);
3478 Py_XDECREF(exc);
3479 Py_XDECREF(v);
3480 return NULL;
3483 /* Charmap encoding: the lookup table */
/* Three-level lookup table from a 16-bit code unit to an output byte.
   level1 is indexed by the top 5 bits of the code unit; level23 is a
   variable-length tail holding count2 level-2 blocks of 16 bytes
   followed by count3 level-3 blocks of 128 bytes.  It is declared with
   one element and over-allocated by the builder. */
3485 struct encoding_map{
3486 PyObject_HEAD
3487 unsigned char level1[32];
3488 int count2, count3;
3489 unsigned char level23[1];
3492 static PyObject*
3493 encoding_map_size(PyObject *obj, PyObject* args)
3495 struct encoding_map *map = (struct encoding_map*)obj;
3496 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
3497 128*map->count3);
/* Method table for EncodingMap objects: a single no-argument "size"
   method, followed by the zeroed sentinel entry. */
3500 static PyMethodDef encoding_map_methods[] = {
3501 {"size", encoding_map_size, METH_NOARGS,
3502 PyDoc_STR("Return the size (in bytes) of this object") },
3503 { 0 }
/* tp_dealloc for EncodingMap objects: releases the object's memory with
   PyObject_FREE(); the object holds no references to other objects. */
3506 static void
3507 encoding_map_dealloc(PyObject* o)
3509 PyObject_FREE(o);
/* Type object for EncodingMap.  Only tp_name, tp_basicsize, tp_dealloc,
   tp_flags and tp_methods are filled in; every other slot is 0.  tp_new
   is 0, so instances are created directly by the C code in this file
   (PyObject_MALLOC + PyObject_Init) rather than through the type. */
3512 static PyTypeObject EncodingMapType = {
3513 PyVarObject_HEAD_INIT(NULL, 0)
3514 "EncodingMap", /*tp_name*/
3515 sizeof(struct encoding_map), /*tp_basicsize*/
3516 0, /*tp_itemsize*/
3517 /* methods */
3518 encoding_map_dealloc, /*tp_dealloc*/
3519 0, /*tp_print*/
3520 0, /*tp_getattr*/
3521 0, /*tp_setattr*/
3522 0, /*tp_compare*/
3523 0, /*tp_repr*/
3524 0, /*tp_as_number*/
3525 0, /*tp_as_sequence*/
3526 0, /*tp_as_mapping*/
3527 0, /*tp_hash*/
3528 0, /*tp_call*/
3529 0, /*tp_str*/
3530 0, /*tp_getattro*/
3531 0, /*tp_setattro*/
3532 0, /*tp_as_buffer*/
3533 Py_TPFLAGS_DEFAULT, /*tp_flags*/
3534 0, /*tp_doc*/
3535 0, /*tp_traverse*/
3536 0, /*tp_clear*/
3537 0, /*tp_richcompare*/
3538 0, /*tp_weaklistoffset*/
3539 0, /*tp_iter*/
3540 0, /*tp_iternext*/
3541 encoding_map_methods, /*tp_methods*/
3542 0, /*tp_members*/
3543 0, /*tp_getset*/
3544 0, /*tp_base*/
3545 0, /*tp_dict*/
3546 0, /*tp_descr_get*/
3547 0, /*tp_descr_set*/
3548 0, /*tp_dictoffset*/
3549 0, /*tp_init*/
3550 0, /*tp_alloc*/
3551 0, /*tp_new*/
3552 0, /*tp_free*/
3553 0, /*tp_is_gc*/
/* Build an encoding table from a 256-character decoding string, where
   string[i] is the code unit that byte i decodes to and 0xFFFE marks an
   unmapped byte.
   Returns either a dict {code unit: byte} (when byte 0 does not map to
   0, another byte maps to 0, a non-BMP value is present, or the trie
   would need 0xFF or more blocks at a level) or an EncodingMap object
   holding a three-level trie.
   Returns NULL with an exception set on a bad argument or on
   allocation failure. */
3556 PyObject*
3557 PyUnicode_BuildEncodingMap(PyObject* string)
3559 Py_UNICODE *decode;
3560 PyObject *result;
3561 struct encoding_map *mresult;
3562 int i;
3563 int need_dict = 0;
/* scratch tables: 0xFF means "no block assigned yet" */
3564 unsigned char level1[32];
3565 unsigned char level2[512];
3566 unsigned char *mlevel1, *mlevel2, *mlevel3;
3567 int count2 = 0, count3 = 0;
3569 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
3570 PyErr_BadArgument();
3571 return NULL;
3573 decode = PyUnicode_AS_UNICODE(string);
3574 memset(level1, 0xFF, sizeof level1);
3575 memset(level2, 0xFF, sizeof level2);
3577 /* If there isn't a one-to-one mapping of NULL to \0,
3578 or if there are non-BMP characters, we need to use
3579 a mapping dictionary. */
3580 if (decode[0] != 0)
3581 need_dict = 1;
/* first pass: count how many level-2 and level-3 blocks are needed */
3582 for (i = 1; i < 256; i++) {
3583 int l1, l2;
3584 if (decode[i] == 0
3585 #ifdef Py_UNICODE_WIDE
3586 || decode[i] > 0xFFFF
3587 #endif
3589 need_dict = 1;
3590 break;
3592 if (decode[i] == 0xFFFE)
3593 /* unmapped character */
3594 continue;
3595 l1 = decode[i] >> 11;
3596 l2 = decode[i] >> 7;
3597 if (level1[l1] == 0xFF)
3598 level1[l1] = count2++;
3599 if (level2[l2] == 0xFF)
3600 level2[l2] = count3++;
/* block numbers are stored in one byte and 0xFF is the "unassigned" marker */
3603 if (count2 >= 0xFF || count3 >= 0xFF)
3604 need_dict = 1;
3606 if (need_dict) {
/* NOTE: this `result` shadows the function-level variable */
3607 PyObject *result = PyDict_New();
3608 PyObject *key, *value;
3609 if (!result)
3610 return NULL;
3611 for (i = 0; i < 256; i++) {
3612 key = value = NULL;
3613 key = PyInt_FromLong(decode[i]);
3614 value = PyInt_FromLong(i);
3615 if (!key || !value)
3616 goto failed1;
3617 if (PyDict_SetItem(result, key, value) == -1)
3618 goto failed1;
3619 Py_DECREF(key);
3620 Py_DECREF(value);
3622 return result;
3623 failed1:
3624 Py_XDECREF(key);
3625 Py_XDECREF(value);
3626 Py_DECREF(result);
3627 return NULL;
3630 /* Create a three-level trie */
/* the -1 accounts for the one level23 byte already inside the struct */
3631 result = PyObject_MALLOC(sizeof(struct encoding_map) +
3632 16*count2 + 128*count3 - 1);
3633 if (!result)
3634 return PyErr_NoMemory();
3635 PyObject_Init(result, &EncodingMapType);
3636 mresult = (struct encoding_map*)result;
3637 mresult->count2 = count2;
3638 mresult->count3 = count3;
3639 mlevel1 = mresult->level1;
3640 mlevel2 = mresult->level23;
3641 mlevel3 = mresult->level23 + 16*count2;
3642 memcpy(mlevel1, level1, 32);
3643 memset(mlevel2, 0xFF, 16*count2);
3644 memset(mlevel3, 0, 128*count3);
/* second pass: assign level-3 blocks again from 0 and fill in the bytes */
3645 count3 = 0;
3646 for (i = 1; i < 256; i++) {
3647 int o1, o2, o3, i2, i3;
3648 if (decode[i] == 0xFFFE)
3649 /* unmapped character */
3650 continue;
3651 o1 = decode[i]>>11;
3652 o2 = (decode[i]>>7) & 0xF;
3653 i2 = 16*mlevel1[o1] + o2;
3654 if (mlevel2[i2] == 0xFF)
3655 mlevel2[i2] = count3++;
3656 o3 = decode[i] & 0x7F;
3657 i3 = 128*mlevel2[i2] + o3;
3658 mlevel3[i3] = i;
3660 return result;
3663 static int
3664 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
3666 struct encoding_map *map = (struct encoding_map*)mapping;
3667 int l1 = c>>11;
3668 int l2 = (c>>7) & 0xF;
3669 int l3 = c & 0x7F;
3670 int i;
3672 #ifdef Py_UNICODE_WIDE
3673 if (c > 0xFFFF) {
3674 return -1;
3676 #endif
3677 if (c == 0)
3678 return 0;
3679 /* level 1*/
3680 i = map->level1[l1];
3681 if (i == 0xFF) {
3682 return -1;
3684 /* level 2*/
3685 i = map->level23[16*i+l2];
3686 if (i == 0xFF) {
3687 return -1;
3689 /* level 3 */
3690 i = map->level23[16*map->count2 + 128*i + l3];
3691 if (i == 0) {
3692 return -1;
3694 return i;
3697 /* Lookup the character ch in the mapping. If the character
3698 can't be found, Py_None is returned (or NULL, if another
3699 error occurred). */
3700 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
3702 PyObject *w = PyInt_FromLong((long)c);
3703 PyObject *x;
3705 if (w == NULL)
3706 return NULL;
3707 x = PyObject_GetItem(mapping, w);
3708 Py_DECREF(w);
3709 if (x == NULL) {
3710 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3711 /* No mapping found means: mapping is undefined. */
3712 PyErr_Clear();
3713 x = Py_None;
3714 Py_INCREF(x);
3715 return x;
3716 } else
3717 return NULL;
3719 else if (x == Py_None)
3720 return x;
3721 else if (PyInt_Check(x)) {
3722 long value = PyInt_AS_LONG(x);
3723 if (value < 0 || value > 255) {
3724 PyErr_SetString(PyExc_TypeError,
3725 "character mapping must be in range(256)");
3726 Py_DECREF(x);
3727 return NULL;
3729 return x;
3731 else if (PyString_Check(x))
3732 return x;
3733 else {
3734 /* wrong return value */
3735 PyErr_SetString(PyExc_TypeError,
3736 "character mapping must return integer, None or str");
3737 Py_DECREF(x);
3738 return NULL;
3742 static int
3743 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
3745 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3746 /* exponentially overallocate to minimize reallocations */
3747 if (requiredsize < 2*outsize)
3748 requiredsize = 2*outsize;
3749 if (_PyString_Resize(outobj, requiredsize)) {
3750 return 0;
3752 return 1;
/* Outcome of encoding one character through a charmap, as returned by
   charmapencode_output():
   enc_SUCCESS   - the character was written to the output;
   enc_FAILED    - the mapping is undefined for the character;
   enc_EXCEPTION - the lookup or a buffer resize failed. */
3755 typedef enum charmapencode_result {
3756 enc_SUCCESS, enc_FAILED, enc_EXCEPTION
3757 }charmapencode_result;
3758 /* lookup the character, put the result in the output string and adjust
3759 various state variables. Reallocate the output string if not enough
3760 space is available. Return a new reference to the object that
3761 was put in the output buffer, or Py_None, if the mapping was undefined
3762 (in which case no character was written) or NULL, if a
3763 reallocation error occurred. The caller must decref the result */
3764 static
3765 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
3766 PyObject **outobj, Py_ssize_t *outpos)
3768 PyObject *rep;
3769 char *outstart;
3770 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
3772 if (Py_Type(mapping) == &EncodingMapType) {
3773 int res = encoding_map_lookup(c, mapping);
3774 Py_ssize_t requiredsize = *outpos+1;
3775 if (res == -1)
3776 return enc_FAILED;
3777 if (outsize<requiredsize)
3778 if (!charmapencode_resize(outobj, outpos, requiredsize))
3779 return enc_EXCEPTION;
3780 outstart = PyString_AS_STRING(*outobj);
3781 outstart[(*outpos)++] = (char)res;
3782 return enc_SUCCESS;
3785 rep = charmapencode_lookup(c, mapping);
3786 if (rep==NULL)
3787 return enc_EXCEPTION;
3788 else if (rep==Py_None) {
3789 Py_DECREF(rep);
3790 return enc_FAILED;
3791 } else {
3792 if (PyInt_Check(rep)) {
3793 Py_ssize_t requiredsize = *outpos+1;
3794 if (outsize<requiredsize)
3795 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3796 Py_DECREF(rep);
3797 return enc_EXCEPTION;
3799 outstart = PyString_AS_STRING(*outobj);
3800 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3802 else {
3803 const char *repchars = PyString_AS_STRING(rep);
3804 Py_ssize_t repsize = PyString_GET_SIZE(rep);
3805 Py_ssize_t requiredsize = *outpos+repsize;
3806 if (outsize<requiredsize)
3807 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
3808 Py_DECREF(rep);
3809 return enc_EXCEPTION;
3811 outstart = PyString_AS_STRING(*outobj);
3812 memcpy(outstart + *outpos, repchars, repsize);
3813 *outpos += repsize;
3816 Py_DECREF(rep);
3817 return enc_SUCCESS;
3820 /* handle an error in PyUnicode_EncodeCharmap
3821 Return 0 on success, -1 on error */
3822 static
3823 int charmap_encoding_error(
3824 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
3825 PyObject **exceptionObject,
3826 int *known_errorHandler, PyObject **errorHandler, const char *errors,
3827 PyObject **res, Py_ssize_t *respos)
3829 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3830 Py_ssize_t repsize;
3831 Py_ssize_t newpos;
3832 Py_UNICODE *uni2;
3833 /* startpos for collecting unencodable chars */
3834 Py_ssize_t collstartpos = *inpos;
3835 Py_ssize_t collendpos = *inpos+1;
3836 Py_ssize_t collpos;
3837 char *encoding = "charmap";
3838 char *reason = "character maps to <undefined>";
3839 charmapencode_result x;
3841 /* find all unencodable characters */
3842 while (collendpos < size) {
3843 PyObject *rep;
3844 if (Py_Type(mapping) == &EncodingMapType) {
3845 int res = encoding_map_lookup(p[collendpos], mapping);
3846 if (res != -1)
3847 break;
3848 ++collendpos;
3849 continue;
3852 rep = charmapencode_lookup(p[collendpos], mapping);
3853 if (rep==NULL)
3854 return -1;
3855 else if (rep!=Py_None) {
3856 Py_DECREF(rep);
3857 break;
3859 Py_DECREF(rep);
3860 ++collendpos;
3862 /* cache callback name lookup
3863 * (if not done yet, i.e. it's the first error) */
3864 if (*known_errorHandler==-1) {
3865 if ((errors==NULL) || (!strcmp(errors, "strict")))
3866 *known_errorHandler = 1;
3867 else if (!strcmp(errors, "replace"))
3868 *known_errorHandler = 2;
3869 else if (!strcmp(errors, "ignore"))
3870 *known_errorHandler = 3;
3871 else if (!strcmp(errors, "xmlcharrefreplace"))
3872 *known_errorHandler = 4;
3873 else
3874 *known_errorHandler = 0;
3876 switch (*known_errorHandler) {
3877 case 1: /* strict */
3878 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3879 return -1;
3880 case 2: /* replace */
3881 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3882 x = charmapencode_output('?', mapping, res, respos);
3883 if (x==enc_EXCEPTION) {
3884 return -1;
3886 else if (x==enc_FAILED) {
3887 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3888 return -1;
3891 /* fall through */
3892 case 3: /* ignore */
3893 *inpos = collendpos;
3894 break;
3895 case 4: /* xmlcharrefreplace */
3896 /* generate replacement (temporarily (mis)uses p) */
3897 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3898 char buffer[2+29+1+1];
3899 char *cp;
3900 sprintf(buffer, "&#%d;", (int)p[collpos]);
3901 for (cp = buffer; *cp; ++cp) {
3902 x = charmapencode_output(*cp, mapping, res, respos);
3903 if (x==enc_EXCEPTION)
3904 return -1;
3905 else if (x==enc_FAILED) {
3906 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3907 return -1;
3911 *inpos = collendpos;
3912 break;
3913 default:
3914 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
3915 encoding, reason, p, size, exceptionObject,
3916 collstartpos, collendpos, &newpos);
3917 if (repunicode == NULL)
3918 return -1;
3919 /* generate replacement */
3920 repsize = PyUnicode_GET_SIZE(repunicode);
3921 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3922 x = charmapencode_output(*uni2, mapping, res, respos);
3923 if (x==enc_EXCEPTION) {
3924 return -1;
3926 else if (x==enc_FAILED) {
3927 Py_DECREF(repunicode);
3928 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3929 return -1;
3932 *inpos = newpos;
3933 Py_DECREF(repunicode);
3935 return 0;
3938 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3939 Py_ssize_t size,
3940 PyObject *mapping,
3941 const char *errors)
3943 /* output object */
3944 PyObject *res = NULL;
3945 /* current input position */
3946 Py_ssize_t inpos = 0;
3947 /* current output position */
3948 Py_ssize_t respos = 0;
3949 PyObject *errorHandler = NULL;
3950 PyObject *exc = NULL;
3951 /* the following variable is used for caching string comparisons
3952 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3953 * 3=ignore, 4=xmlcharrefreplace */
3954 int known_errorHandler = -1;
3956 /* Default to Latin-1 */
3957 if (mapping == NULL)
3958 return PyUnicode_EncodeLatin1(p, size, errors);
3960 /* allocate enough for a simple encoding without
3961 replacements, if we need more, we'll resize */
3962 res = PyString_FromStringAndSize(NULL, size);
3963 if (res == NULL)
3964 goto onError;
3965 if (size == 0)
3966 return res;
3968 while (inpos<size) {
3969 /* try to encode it */
3970 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
3971 if (x==enc_EXCEPTION) /* error */
3972 goto onError;
3973 if (x==enc_FAILED) { /* unencodable character */
3974 if (charmap_encoding_error(p, size, &inpos, mapping,
3975 &exc,
3976 &known_errorHandler, &errorHandler, errors,
3977 &res, &respos)) {
3978 goto onError;
3981 else
3982 /* done with this character => adjust input position */
3983 ++inpos;
3986 /* Resize if we allocated to much */
3987 if (respos<PyString_GET_SIZE(res)) {
3988 if (_PyString_Resize(&res, respos))
3989 goto onError;
3991 Py_XDECREF(exc);
3992 Py_XDECREF(errorHandler);
3993 return res;
3995 onError:
3996 Py_XDECREF(res);
3997 Py_XDECREF(exc);
3998 Py_XDECREF(errorHandler);
3999 return NULL;
4002 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4003 PyObject *mapping)
4005 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4006 PyErr_BadArgument();
4007 return NULL;
4009 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4010 PyUnicode_GET_SIZE(unicode),
4011 mapping,
4012 NULL);
4015 /* create or adjust a UnicodeTranslateError */
4016 static void make_translate_exception(PyObject **exceptionObject,
4017 const Py_UNICODE *unicode, Py_ssize_t size,
4018 Py_ssize_t startpos, Py_ssize_t endpos,
4019 const char *reason)
4021 if (*exceptionObject == NULL) {
4022 *exceptionObject = PyUnicodeTranslateError_Create(
4023 unicode, size, startpos, endpos, reason);
4025 else {
4026 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4027 goto onError;
4028 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4029 goto onError;
4030 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4031 goto onError;
4032 return;
4033 onError:
4034 Py_DECREF(*exceptionObject);
4035 *exceptionObject = NULL;
4039 /* raises a UnicodeTranslateError */
4040 static void raise_translate_exception(PyObject **exceptionObject,
4041 const Py_UNICODE *unicode, Py_ssize_t size,
4042 Py_ssize_t startpos, Py_ssize_t endpos,
4043 const char *reason)
4045 make_translate_exception(exceptionObject,
4046 unicode, size, startpos, endpos, reason);
4047 if (*exceptionObject != NULL)
4048 PyCodec_StrictErrors(*exceptionObject);
4051 /* error handling callback helper:
4052 build arguments, call the callback and check the arguments,
4053 put the result into newpos and return the replacement string, which
4054 has to be freed by the caller */
4055 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4056 PyObject **errorHandler,
4057 const char *reason,
4058 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4059 Py_ssize_t startpos, Py_ssize_t endpos,
4060 Py_ssize_t *newpos)
4062 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4064 Py_ssize_t i_newpos;
4065 PyObject *restuple;
4066 PyObject *resunicode;
4068 if (*errorHandler == NULL) {
4069 *errorHandler = PyCodec_LookupError(errors);
4070 if (*errorHandler == NULL)
4071 return NULL;
4074 make_translate_exception(exceptionObject,
4075 unicode, size, startpos, endpos, reason);
4076 if (*exceptionObject == NULL)
4077 return NULL;
4079 restuple = PyObject_CallFunctionObjArgs(
4080 *errorHandler, *exceptionObject, NULL);
4081 if (restuple == NULL)
4082 return NULL;
4083 if (!PyTuple_Check(restuple)) {
4084 PyErr_Format(PyExc_TypeError, &argparse[4]);
4085 Py_DECREF(restuple);
4086 return NULL;
4088 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4089 &resunicode, &i_newpos)) {
4090 Py_DECREF(restuple);
4091 return NULL;
4093 if (i_newpos<0)
4094 *newpos = size+i_newpos;
4095 else
4096 *newpos = i_newpos;
4097 if (*newpos<0 || *newpos>size) {
4098 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4099 Py_DECREF(restuple);
4100 return NULL;
4102 Py_INCREF(resunicode);
4103 Py_DECREF(restuple);
4104 return resunicode;
4107 /* Lookup the character ch in the mapping and put the result in result,
4108 which must be decrefed by the caller.
4109 Return 0 on success, -1 on error */
4110 static
4111 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4113 PyObject *w = PyInt_FromLong((long)c);
4114 PyObject *x;
4116 if (w == NULL)
4117 return -1;
4118 x = PyObject_GetItem(mapping, w);
4119 Py_DECREF(w);
4120 if (x == NULL) {
4121 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4122 /* No mapping found means: use 1:1 mapping. */
4123 PyErr_Clear();
4124 *result = NULL;
4125 return 0;
4126 } else
4127 return -1;
4129 else if (x == Py_None) {
4130 *result = x;
4131 return 0;
4133 else if (PyInt_Check(x)) {
4134 long value = PyInt_AS_LONG(x);
4135 long max = PyUnicode_GetMax();
4136 if (value < 0 || value > max) {
4137 PyErr_Format(PyExc_TypeError,
4138 "character mapping must be in range(0x%lx)", max+1);
4139 Py_DECREF(x);
4140 return -1;
4142 *result = x;
4143 return 0;
4145 else if (PyUnicode_Check(x)) {
4146 *result = x;
4147 return 0;
4149 else {
4150 /* wrong return value */
4151 PyErr_SetString(PyExc_TypeError,
4152 "character mapping must return integer, None or unicode");
4153 Py_DECREF(x);
4154 return -1;
4157 /* ensure that *outobj is at least requiredsize characters long,
4158 if not reallocate and adjust various state variables.
4159 Return 0 on success, -1 on error */
4160 static
4161 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4162 Py_ssize_t requiredsize)
4164 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4165 if (requiredsize > oldsize) {
4166 /* remember old output position */
4167 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4168 /* exponentially overallocate to minimize reallocations */
4169 if (requiredsize < 2 * oldsize)
4170 requiredsize = 2 * oldsize;
4171 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
4172 return -1;
4173 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4175 return 0;
4177 /* lookup the character, put the result in the output string and adjust
4178 various state variables. Return a new reference to the object that
4179 was put in the output buffer in *result, or Py_None, if the mapping was
4180 undefined (in which case no character was written).
4181 The called must decref result.
4182 Return 0 on success, -1 on error. */
4183 static
4184 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4185 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4186 PyObject **res)
4188 if (charmaptranslate_lookup(*curinp, mapping, res))
4189 return -1;
4190 if (*res==NULL) {
4191 /* not found => default to 1:1 mapping */
4192 *(*outp)++ = *curinp;
4194 else if (*res==Py_None)
4196 else if (PyInt_Check(*res)) {
4197 /* no overflow check, because we know that the space is enough */
4198 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4200 else if (PyUnicode_Check(*res)) {
4201 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4202 if (repsize==1) {
4203 /* no overflow check, because we know that the space is enough */
4204 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4206 else if (repsize!=0) {
4207 /* more than one character */
4208 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4209 (insize - (curinp-startinp)) +
4210 repsize - 1;
4211 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4212 return -1;
4213 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4214 *outp += repsize;
4217 else
4218 return -1;
4219 return 0;
4222 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4223 Py_ssize_t size,
4224 PyObject *mapping,
4225 const char *errors)
4227 /* output object */
4228 PyObject *res = NULL;
4229 /* pointers to the beginning and end+1 of input */
4230 const Py_UNICODE *startp = p;
4231 const Py_UNICODE *endp = p + size;
4232 /* pointer into the output */
4233 Py_UNICODE *str;
4234 /* current output position */
4235 Py_ssize_t respos = 0;
4236 char *reason = "character maps to <undefined>";
4237 PyObject *errorHandler = NULL;
4238 PyObject *exc = NULL;
4239 /* the following variable is used for caching string comparisons
4240 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4241 * 3=ignore, 4=xmlcharrefreplace */
4242 int known_errorHandler = -1;
4244 if (mapping == NULL) {
4245 PyErr_BadArgument();
4246 return NULL;
4249 /* allocate enough for a simple 1:1 translation without
4250 replacements, if we need more, we'll resize */
4251 res = PyUnicode_FromUnicode(NULL, size);
4252 if (res == NULL)
4253 goto onError;
4254 if (size == 0)
4255 return res;
4256 str = PyUnicode_AS_UNICODE(res);
4258 while (p<endp) {
4259 /* try to encode it */
4260 PyObject *x = NULL;
4261 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4262 Py_XDECREF(x);
4263 goto onError;
4265 Py_XDECREF(x);
4266 if (x!=Py_None) /* it worked => adjust input pointer */
4267 ++p;
4268 else { /* untranslatable character */
4269 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4270 Py_ssize_t repsize;
4271 Py_ssize_t newpos;
4272 Py_UNICODE *uni2;
4273 /* startpos for collecting untranslatable chars */
4274 const Py_UNICODE *collstart = p;
4275 const Py_UNICODE *collend = p+1;
4276 const Py_UNICODE *coll;
4278 /* find all untranslatable characters */
4279 while (collend < endp) {
4280 if (charmaptranslate_lookup(*collend, mapping, &x))
4281 goto onError;
4282 Py_XDECREF(x);
4283 if (x!=Py_None)
4284 break;
4285 ++collend;
4287 /* cache callback name lookup
4288 * (if not done yet, i.e. it's the first error) */
4289 if (known_errorHandler==-1) {
4290 if ((errors==NULL) || (!strcmp(errors, "strict")))
4291 known_errorHandler = 1;
4292 else if (!strcmp(errors, "replace"))
4293 known_errorHandler = 2;
4294 else if (!strcmp(errors, "ignore"))
4295 known_errorHandler = 3;
4296 else if (!strcmp(errors, "xmlcharrefreplace"))
4297 known_errorHandler = 4;
4298 else
4299 known_errorHandler = 0;
4301 switch (known_errorHandler) {
4302 case 1: /* strict */
4303 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
4304 goto onError;
4305 case 2: /* replace */
4306 /* No need to check for space, this is a 1:1 replacement */
4307 for (coll = collstart; coll<collend; ++coll)
4308 *str++ = '?';
4309 /* fall through */
4310 case 3: /* ignore */
4311 p = collend;
4312 break;
4313 case 4: /* xmlcharrefreplace */
4314 /* generate replacement (temporarily (mis)uses p) */
4315 for (p = collstart; p < collend; ++p) {
4316 char buffer[2+29+1+1];
4317 char *cp;
4318 sprintf(buffer, "&#%d;", (int)*p);
4319 if (charmaptranslate_makespace(&res, &str,
4320 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4321 goto onError;
4322 for (cp = buffer; *cp; ++cp)
4323 *str++ = *cp;
4325 p = collend;
4326 break;
4327 default:
4328 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4329 reason, startp, size, &exc,
4330 collstart-startp, collend-startp, &newpos);
4331 if (repunicode == NULL)
4332 goto onError;
4333 /* generate replacement */
4334 repsize = PyUnicode_GET_SIZE(repunicode);
4335 if (charmaptranslate_makespace(&res, &str,
4336 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4337 Py_DECREF(repunicode);
4338 goto onError;
4340 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4341 *str++ = *uni2;
4342 p = startp + newpos;
4343 Py_DECREF(repunicode);
4347 /* Resize if we allocated to much */
4348 respos = str-PyUnicode_AS_UNICODE(res);
4349 if (respos<PyUnicode_GET_SIZE(res)) {
4350 if (_PyUnicode_Resize(&res, respos) < 0)
4351 goto onError;
4353 Py_XDECREF(exc);
4354 Py_XDECREF(errorHandler);
4355 return res;
4357 onError:
4358 Py_XDECREF(res);
4359 Py_XDECREF(exc);
4360 Py_XDECREF(errorHandler);
4361 return NULL;
4364 PyObject *PyUnicode_Translate(PyObject *str,
4365 PyObject *mapping,
4366 const char *errors)
4368 PyObject *result;
4370 str = PyUnicode_FromObject(str);
4371 if (str == NULL)
4372 goto onError;
4373 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
4374 PyUnicode_GET_SIZE(str),
4375 mapping,
4376 errors);
4377 Py_DECREF(str);
4378 return result;
4380 onError:
4381 Py_XDECREF(str);
4382 return NULL;
4385 /* --- Decimal Encoder ---------------------------------------------------- */
4387 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
4388 Py_ssize_t length,
4389 char *output,
4390 const char *errors)
4392 Py_UNICODE *p, *end;
4393 PyObject *errorHandler = NULL;
4394 PyObject *exc = NULL;
4395 const char *encoding = "decimal";
4396 const char *reason = "invalid decimal Unicode string";
4397 /* the following variable is used for caching string comparisons
4398 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4399 int known_errorHandler = -1;
4401 if (output == NULL) {
4402 PyErr_BadArgument();
4403 return -1;
4406 p = s;
4407 end = s + length;
4408 while (p < end) {
4409 register Py_UNICODE ch = *p;
4410 int decimal;
4411 PyObject *repunicode;
4412 Py_ssize_t repsize;
4413 Py_ssize_t newpos;
4414 Py_UNICODE *uni2;
4415 Py_UNICODE *collstart;
4416 Py_UNICODE *collend;
4418 if (Py_UNICODE_ISSPACE(ch)) {
4419 *output++ = ' ';
4420 ++p;
4421 continue;
4423 decimal = Py_UNICODE_TODECIMAL(ch);
4424 if (decimal >= 0) {
4425 *output++ = '0' + decimal;
4426 ++p;
4427 continue;
4429 if (0 < ch && ch < 256) {
4430 *output++ = (char)ch;
4431 ++p;
4432 continue;
4434 /* All other characters are considered unencodable */
4435 collstart = p;
4436 collend = p+1;
4437 while (collend < end) {
4438 if ((0 < *collend && *collend < 256) ||
4439 !Py_UNICODE_ISSPACE(*collend) ||
4440 Py_UNICODE_TODECIMAL(*collend))
4441 break;
4443 /* cache callback name lookup
4444 * (if not done yet, i.e. it's the first error) */
4445 if (known_errorHandler==-1) {
4446 if ((errors==NULL) || (!strcmp(errors, "strict")))
4447 known_errorHandler = 1;
4448 else if (!strcmp(errors, "replace"))
4449 known_errorHandler = 2;
4450 else if (!strcmp(errors, "ignore"))
4451 known_errorHandler = 3;
4452 else if (!strcmp(errors, "xmlcharrefreplace"))
4453 known_errorHandler = 4;
4454 else
4455 known_errorHandler = 0;
4457 switch (known_errorHandler) {
4458 case 1: /* strict */
4459 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
4460 goto onError;
4461 case 2: /* replace */
4462 for (p = collstart; p < collend; ++p)
4463 *output++ = '?';
4464 /* fall through */
4465 case 3: /* ignore */
4466 p = collend;
4467 break;
4468 case 4: /* xmlcharrefreplace */
4469 /* generate replacement (temporarily (mis)uses p) */
4470 for (p = collstart; p < collend; ++p)
4471 output += sprintf(output, "&#%d;", (int)*p);
4472 p = collend;
4473 break;
4474 default:
4475 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4476 encoding, reason, s, length, &exc,
4477 collstart-s, collend-s, &newpos);
4478 if (repunicode == NULL)
4479 goto onError;
4480 /* generate replacement */
4481 repsize = PyUnicode_GET_SIZE(repunicode);
4482 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4483 Py_UNICODE ch = *uni2;
4484 if (Py_UNICODE_ISSPACE(ch))
4485 *output++ = ' ';
4486 else {
4487 decimal = Py_UNICODE_TODECIMAL(ch);
4488 if (decimal >= 0)
4489 *output++ = '0' + decimal;
4490 else if (0 < ch && ch < 256)
4491 *output++ = (char)ch;
4492 else {
4493 Py_DECREF(repunicode);
4494 raise_encode_exception(&exc, encoding,
4495 s, length, collstart-s, collend-s, reason);
4496 goto onError;
4500 p = s + newpos;
4501 Py_DECREF(repunicode);
4504 /* 0-terminate the output string */
4505 *output++ = '\0';
4506 Py_XDECREF(exc);
4507 Py_XDECREF(errorHandler);
4508 return 0;
4510 onError:
4511 Py_XDECREF(exc);
4512 Py_XDECREF(errorHandler);
4513 return -1;
4516 /* --- Helpers ------------------------------------------------------------ */
4518 #define STRINGLIB_CHAR Py_UNICODE
4520 #define STRINGLIB_LEN PyUnicode_GET_SIZE
4521 #define STRINGLIB_NEW PyUnicode_FromUnicode
4522 #define STRINGLIB_STR PyUnicode_AS_UNICODE
4524 Py_LOCAL_INLINE(int)
4525 STRINGLIB_CMP(const Py_UNICODE* str, const Py_UNICODE* other, Py_ssize_t len)
4527 if (str[0] != other[0])
4528 return 1;
4529 return memcmp((void*) str, (void*) other, len * sizeof(Py_UNICODE));
4532 #define STRINGLIB_EMPTY unicode_empty
4534 #include "stringlib/fastsearch.h"
4536 #include "stringlib/count.h"
4537 #include "stringlib/find.h"
4538 #include "stringlib/partition.h"
/* Helper macro to fix up start/end slice values.

   Operates on the variables `start` and `end` that must be in scope at
   the point of use: a negative start is taken relative to
   (obj)->length and then clamped to 0; end is clamped to
   (obj)->length, a negative end is taken relative to the length and
   then clamped to 0.

   Wrapped in do { ... } while (0) so that the expansion is a single
   statement and stays correct under an unbraced if/else. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
4553 Py_ssize_t PyUnicode_Count(PyObject *str,
4554 PyObject *substr,
4555 Py_ssize_t start,
4556 Py_ssize_t end)
4558 Py_ssize_t result;
4559 PyUnicodeObject* str_obj;
4560 PyUnicodeObject* sub_obj;
4562 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
4563 if (!str_obj)
4564 return -1;
4565 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
4566 if (!sub_obj) {
4567 Py_DECREF(str_obj);
4568 return -1;
4571 FIX_START_END(str_obj);
4573 result = stringlib_count(
4574 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
4577 Py_DECREF(sub_obj);
4578 Py_DECREF(str_obj);
4580 return result;
4583 Py_ssize_t PyUnicode_Find(PyObject *str,
4584 PyObject *sub,
4585 Py_ssize_t start,
4586 Py_ssize_t end,
4587 int direction)
4589 Py_ssize_t result;
4591 str = PyUnicode_FromObject(str);
4592 if (!str)
4593 return -2;
4594 sub = PyUnicode_FromObject(sub);
4595 if (!sub) {
4596 Py_DECREF(str);
4597 return -2;
4600 if (direction > 0)
4601 result = stringlib_find_slice(
4602 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4603 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4604 start, end
4606 else
4607 result = stringlib_rfind_slice(
4608 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
4609 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
4610 start, end
4613 Py_DECREF(str);
4614 Py_DECREF(sub);
4616 return result;
4619 static
4620 int tailmatch(PyUnicodeObject *self,
4621 PyUnicodeObject *substring,
4622 Py_ssize_t start,
4623 Py_ssize_t end,
4624 int direction)
4626 if (substring->length == 0)
4627 return 1;
4629 FIX_START_END(self);
4631 end -= substring->length;
4632 if (end < start)
4633 return 0;
4635 if (direction > 0) {
4636 if (Py_UNICODE_MATCH(self, end, substring))
4637 return 1;
4638 } else {
4639 if (Py_UNICODE_MATCH(self, start, substring))
4640 return 1;
4643 return 0;
4646 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
4647 PyObject *substr,
4648 Py_ssize_t start,
4649 Py_ssize_t end,
4650 int direction)
4652 Py_ssize_t result;
4654 str = PyUnicode_FromObject(str);
4655 if (str == NULL)
4656 return -1;
4657 substr = PyUnicode_FromObject(substr);
4658 if (substr == NULL) {
4659 Py_DECREF(str);
4660 return -1;
4663 result = tailmatch((PyUnicodeObject *)str,
4664 (PyUnicodeObject *)substr,
4665 start, end, direction);
4666 Py_DECREF(str);
4667 Py_DECREF(substr);
4668 return result;
4671 /* Apply fixfct filter to the Unicode object self and return a
4672 reference to the modified object */
4674 static
4675 PyObject *fixup(PyUnicodeObject *self,
4676 int (*fixfct)(PyUnicodeObject *s))
4679 PyUnicodeObject *u;
4681 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
4682 if (u == NULL)
4683 return NULL;
4685 Py_UNICODE_COPY(u->str, self->str, self->length);
4687 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
4688 /* fixfct should return TRUE if it modified the buffer. If
4689 FALSE, return a reference to the original buffer instead
4690 (to save space, not time) */
4691 Py_INCREF(self);
4692 Py_DECREF(u);
4693 return (PyObject*) self;
4695 return (PyObject*) u;
4698 static
4699 int fixupper(PyUnicodeObject *self)
4701 Py_ssize_t len = self->length;
4702 Py_UNICODE *s = self->str;
4703 int status = 0;
4705 while (len-- > 0) {
4706 register Py_UNICODE ch;
4708 ch = Py_UNICODE_TOUPPER(*s);
4709 if (ch != *s) {
4710 status = 1;
4711 *s = ch;
4713 s++;
4716 return status;
4719 static
4720 int fixlower(PyUnicodeObject *self)
4722 Py_ssize_t len = self->length;
4723 Py_UNICODE *s = self->str;
4724 int status = 0;
4726 while (len-- > 0) {
4727 register Py_UNICODE ch;
4729 ch = Py_UNICODE_TOLOWER(*s);
4730 if (ch != *s) {
4731 status = 1;
4732 *s = ch;
4734 s++;
4737 return status;
4740 static
4741 int fixswapcase(PyUnicodeObject *self)
4743 Py_ssize_t len = self->length;
4744 Py_UNICODE *s = self->str;
4745 int status = 0;
4747 while (len-- > 0) {
4748 if (Py_UNICODE_ISUPPER(*s)) {
4749 *s = Py_UNICODE_TOLOWER(*s);
4750 status = 1;
4751 } else if (Py_UNICODE_ISLOWER(*s)) {
4752 *s = Py_UNICODE_TOUPPER(*s);
4753 status = 1;
4755 s++;
4758 return status;
4761 static
4762 int fixcapitalize(PyUnicodeObject *self)
4764 Py_ssize_t len = self->length;
4765 Py_UNICODE *s = self->str;
4766 int status = 0;
4768 if (len == 0)
4769 return 0;
4770 if (Py_UNICODE_ISLOWER(*s)) {
4771 *s = Py_UNICODE_TOUPPER(*s);
4772 status = 1;
4774 s++;
4775 while (--len > 0) {
4776 if (Py_UNICODE_ISUPPER(*s)) {
4777 *s = Py_UNICODE_TOLOWER(*s);
4778 status = 1;
4780 s++;
4782 return status;
4785 static
4786 int fixtitle(PyUnicodeObject *self)
4788 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4789 register Py_UNICODE *e;
4790 int previous_is_cased;
4792 /* Shortcut for single character strings */
4793 if (PyUnicode_GET_SIZE(self) == 1) {
4794 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4795 if (*p != ch) {
4796 *p = ch;
4797 return 1;
4799 else
4800 return 0;
4803 e = p + PyUnicode_GET_SIZE(self);
4804 previous_is_cased = 0;
4805 for (; p < e; p++) {
4806 register const Py_UNICODE ch = *p;
4808 if (previous_is_cased)
4809 *p = Py_UNICODE_TOLOWER(ch);
4810 else
4811 *p = Py_UNICODE_TOTITLE(ch);
4813 if (Py_UNICODE_ISLOWER(ch) ||
4814 Py_UNICODE_ISUPPER(ch) ||
4815 Py_UNICODE_ISTITLE(ch))
4816 previous_is_cased = 1;
4817 else
4818 previous_is_cased = 0;
4820 return 1;
4823 PyObject *
4824 PyUnicode_Join(PyObject *separator, PyObject *seq)
4826 PyObject *internal_separator = NULL;
4827 const Py_UNICODE blank = ' ';
4828 const Py_UNICODE *sep = &blank;
4829 Py_ssize_t seplen = 1;
4830 PyUnicodeObject *res = NULL; /* the result */
4831 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
4832 Py_ssize_t res_used; /* # used bytes */
4833 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4834 PyObject *fseq; /* PySequence_Fast(seq) */
4835 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
4836 PyObject *item;
4837 Py_ssize_t i;
4839 fseq = PySequence_Fast(seq, "");
4840 if (fseq == NULL) {
4841 return NULL;
4844 /* Grrrr. A codec may be invoked to convert str objects to
4845 * Unicode, and so it's possible to call back into Python code
4846 * during PyUnicode_FromObject(), and so it's possible for a sick
4847 * codec to change the size of fseq (if seq is a list). Therefore
4848 * we have to keep refetching the size -- can't assume seqlen
4849 * is invariant.
4851 seqlen = PySequence_Fast_GET_SIZE(fseq);
4852 /* If empty sequence, return u"". */
4853 if (seqlen == 0) {
4854 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4855 goto Done;
4857 /* If singleton sequence with an exact Unicode, return that. */
4858 if (seqlen == 1) {
4859 item = PySequence_Fast_GET_ITEM(fseq, 0);
4860 if (PyUnicode_CheckExact(item)) {
4861 Py_INCREF(item);
4862 res = (PyUnicodeObject *)item;
4863 goto Done;
4867 /* At least two items to join, or one that isn't exact Unicode. */
4868 if (seqlen > 1) {
4869 /* Set up sep and seplen -- they're needed. */
4870 if (separator == NULL) {
4871 sep = &blank;
4872 seplen = 1;
4874 else {
4875 internal_separator = PyUnicode_FromObject(separator);
4876 if (internal_separator == NULL)
4877 goto onError;
4878 sep = PyUnicode_AS_UNICODE(internal_separator);
4879 seplen = PyUnicode_GET_SIZE(internal_separator);
4880 /* In case PyUnicode_FromObject() mutated seq. */
4881 seqlen = PySequence_Fast_GET_SIZE(fseq);
4885 /* Get space. */
4886 res = _PyUnicode_New(res_alloc);
4887 if (res == NULL)
4888 goto onError;
4889 res_p = PyUnicode_AS_UNICODE(res);
4890 res_used = 0;
4892 for (i = 0; i < seqlen; ++i) {
4893 Py_ssize_t itemlen;
4894 Py_ssize_t new_res_used;
4896 item = PySequence_Fast_GET_ITEM(fseq, i);
4897 /* Convert item to Unicode. */
4898 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4899 PyErr_Format(PyExc_TypeError,
4900 "sequence item %zd: expected string or Unicode,"
4901 " %.80s found",
4902 i, Py_Type(item)->tp_name);
4903 goto onError;
4905 item = PyUnicode_FromObject(item);
4906 if (item == NULL)
4907 goto onError;
4908 /* We own a reference to item from here on. */
4910 /* In case PyUnicode_FromObject() mutated seq. */
4911 seqlen = PySequence_Fast_GET_SIZE(fseq);
4913 /* Make sure we have enough space for the separator and the item. */
4914 itemlen = PyUnicode_GET_SIZE(item);
4915 new_res_used = res_used + itemlen;
4916 if (new_res_used < 0)
4917 goto Overflow;
4918 if (i < seqlen - 1) {
4919 new_res_used += seplen;
4920 if (new_res_used < 0)
4921 goto Overflow;
4923 if (new_res_used > res_alloc) {
4924 /* double allocated size until it's big enough */
4925 do {
4926 res_alloc += res_alloc;
4927 if (res_alloc <= 0)
4928 goto Overflow;
4929 } while (new_res_used > res_alloc);
4930 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
4931 Py_DECREF(item);
4932 goto onError;
4934 res_p = PyUnicode_AS_UNICODE(res) + res_used;
4937 /* Copy item, and maybe the separator. */
4938 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
4939 res_p += itemlen;
4940 if (i < seqlen - 1) {
4941 Py_UNICODE_COPY(res_p, sep, seplen);
4942 res_p += seplen;
4944 Py_DECREF(item);
4945 res_used = new_res_used;
4948 /* Shrink res to match the used area; this probably can't fail,
4949 * but it's cheap to check.
4951 if (_PyUnicode_Resize(&res, res_used) < 0)
4952 goto onError;
4954 Done:
4955 Py_XDECREF(internal_separator);
4956 Py_DECREF(fseq);
4957 return (PyObject *)res;
4959 Overflow:
4960 PyErr_SetString(PyExc_OverflowError,
4961 "join() result is too long for a Python string");
4962 Py_DECREF(item);
4963 /* fall through */
4965 onError:
4966 Py_XDECREF(internal_separator);
4967 Py_DECREF(fseq);
4968 Py_XDECREF(res);
4969 return NULL;
4972 static
4973 PyUnicodeObject *pad(PyUnicodeObject *self,
4974 Py_ssize_t left,
4975 Py_ssize_t right,
4976 Py_UNICODE fill)
4978 PyUnicodeObject *u;
4980 if (left < 0)
4981 left = 0;
4982 if (right < 0)
4983 right = 0;
4985 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
4986 Py_INCREF(self);
4987 return self;
4990 u = _PyUnicode_New(left + self->length + right);
4991 if (u) {
4992 if (left)
4993 Py_UNICODE_FILL(u->str, fill, left);
4994 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4995 if (right)
4996 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4999 return u;
/* Append the slice data[left:right] to `list` as a new unicode object.

   The expanding function must provide a `PyObject *str` scratch variable,
   a `list` variable and an `onError` label; control jumps to `onError` if
   either the slice creation or the append fails.

   The body is wrapped in do { } while (0) so that the macro followed by a
   semicolon behaves as exactly one statement, even as the unbraced body of
   an if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
5013 static
5014 PyObject *split_whitespace(PyUnicodeObject *self,
5015 PyObject *list,
5016 Py_ssize_t maxcount)
5018 register Py_ssize_t i;
5019 register Py_ssize_t j;
5020 Py_ssize_t len = self->length;
5021 PyObject *str;
5023 for (i = j = 0; i < len; ) {
5024 /* find a token */
5025 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5026 i++;
5027 j = i;
5028 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
5029 i++;
5030 if (j < i) {
5031 if (maxcount-- <= 0)
5032 break;
5033 SPLIT_APPEND(self->str, j, i);
5034 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
5035 i++;
5036 j = i;
5039 if (j < len) {
5040 SPLIT_APPEND(self->str, j, len);
5042 return list;
5044 onError:
5045 Py_DECREF(list);
5046 return NULL;
5049 PyObject *PyUnicode_Splitlines(PyObject *string,
5050 int keepends)
5052 register Py_ssize_t i;
5053 register Py_ssize_t j;
5054 Py_ssize_t len;
5055 PyObject *list;
5056 PyObject *str;
5057 Py_UNICODE *data;
5059 string = PyUnicode_FromObject(string);
5060 if (string == NULL)
5061 return NULL;
5062 data = PyUnicode_AS_UNICODE(string);
5063 len = PyUnicode_GET_SIZE(string);
5065 list = PyList_New(0);
5066 if (!list)
5067 goto onError;
5069 for (i = j = 0; i < len; ) {
5070 Py_ssize_t eol;
5072 /* Find a line and append it */
5073 while (i < len && !BLOOM_LINEBREAK(data[i]))
5074 i++;
5076 /* Skip the line break reading CRLF as one line break */
5077 eol = i;
5078 if (i < len) {
5079 if (data[i] == '\r' && i + 1 < len &&
5080 data[i+1] == '\n')
5081 i += 2;
5082 else
5083 i++;
5084 if (keepends)
5085 eol = i;
5087 SPLIT_APPEND(data, j, eol);
5088 j = i;
5090 if (j < len) {
5091 SPLIT_APPEND(data, j, len);
5094 Py_DECREF(string);
5095 return list;
5097 onError:
5098 Py_XDECREF(list);
5099 Py_DECREF(string);
5100 return NULL;
5103 static
5104 PyObject *split_char(PyUnicodeObject *self,
5105 PyObject *list,
5106 Py_UNICODE ch,
5107 Py_ssize_t maxcount)
5109 register Py_ssize_t i;
5110 register Py_ssize_t j;
5111 Py_ssize_t len = self->length;
5112 PyObject *str;
5114 for (i = j = 0; i < len; ) {
5115 if (self->str[i] == ch) {
5116 if (maxcount-- <= 0)
5117 break;
5118 SPLIT_APPEND(self->str, j, i);
5119 i = j = i + 1;
5120 } else
5121 i++;
5123 if (j <= len) {
5124 SPLIT_APPEND(self->str, j, len);
5126 return list;
5128 onError:
5129 Py_DECREF(list);
5130 return NULL;
5133 static
5134 PyObject *split_substring(PyUnicodeObject *self,
5135 PyObject *list,
5136 PyUnicodeObject *substring,
5137 Py_ssize_t maxcount)
5139 register Py_ssize_t i;
5140 register Py_ssize_t j;
5141 Py_ssize_t len = self->length;
5142 Py_ssize_t sublen = substring->length;
5143 PyObject *str;
5145 for (i = j = 0; i <= len - sublen; ) {
5146 if (Py_UNICODE_MATCH(self, i, substring)) {
5147 if (maxcount-- <= 0)
5148 break;
5149 SPLIT_APPEND(self->str, j, i);
5150 i = j = i + sublen;
5151 } else
5152 i++;
5154 if (j <= len) {
5155 SPLIT_APPEND(self->str, j, len);
5157 return list;
5159 onError:
5160 Py_DECREF(list);
5161 return NULL;
5164 static
5165 PyObject *rsplit_whitespace(PyUnicodeObject *self,
5166 PyObject *list,
5167 Py_ssize_t maxcount)
5169 register Py_ssize_t i;
5170 register Py_ssize_t j;
5171 Py_ssize_t len = self->length;
5172 PyObject *str;
5174 for (i = j = len - 1; i >= 0; ) {
5175 /* find a token */
5176 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5177 i--;
5178 j = i;
5179 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
5180 i--;
5181 if (j > i) {
5182 if (maxcount-- <= 0)
5183 break;
5184 SPLIT_APPEND(self->str, i + 1, j + 1);
5185 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
5186 i--;
5187 j = i;
5190 if (j >= 0) {
5191 SPLIT_APPEND(self->str, 0, j + 1);
5193 if (PyList_Reverse(list) < 0)
5194 goto onError;
5195 return list;
5197 onError:
5198 Py_DECREF(list);
5199 return NULL;
5202 static
5203 PyObject *rsplit_char(PyUnicodeObject *self,
5204 PyObject *list,
5205 Py_UNICODE ch,
5206 Py_ssize_t maxcount)
5208 register Py_ssize_t i;
5209 register Py_ssize_t j;
5210 Py_ssize_t len = self->length;
5211 PyObject *str;
5213 for (i = j = len - 1; i >= 0; ) {
5214 if (self->str[i] == ch) {
5215 if (maxcount-- <= 0)
5216 break;
5217 SPLIT_APPEND(self->str, i + 1, j + 1);
5218 j = i = i - 1;
5219 } else
5220 i--;
5222 if (j >= -1) {
5223 SPLIT_APPEND(self->str, 0, j + 1);
5225 if (PyList_Reverse(list) < 0)
5226 goto onError;
5227 return list;
5229 onError:
5230 Py_DECREF(list);
5231 return NULL;
5234 static
5235 PyObject *rsplit_substring(PyUnicodeObject *self,
5236 PyObject *list,
5237 PyUnicodeObject *substring,
5238 Py_ssize_t maxcount)
5240 register Py_ssize_t i;
5241 register Py_ssize_t j;
5242 Py_ssize_t len = self->length;
5243 Py_ssize_t sublen = substring->length;
5244 PyObject *str;
5246 for (i = len - sublen, j = len; i >= 0; ) {
5247 if (Py_UNICODE_MATCH(self, i, substring)) {
5248 if (maxcount-- <= 0)
5249 break;
5250 SPLIT_APPEND(self->str, i + sublen, j);
5251 j = i;
5252 i -= sublen;
5253 } else
5254 i--;
5256 if (j >= 0) {
5257 SPLIT_APPEND(self->str, 0, j);
5259 if (PyList_Reverse(list) < 0)
5260 goto onError;
5261 return list;
5263 onError:
5264 Py_DECREF(list);
5265 return NULL;
5268 #undef SPLIT_APPEND
5270 static
5271 PyObject *split(PyUnicodeObject *self,
5272 PyUnicodeObject *substring,
5273 Py_ssize_t maxcount)
5275 PyObject *list;
5277 if (maxcount < 0)
5278 maxcount = PY_SSIZE_T_MAX;
5280 list = PyList_New(0);
5281 if (!list)
5282 return NULL;
5284 if (substring == NULL)
5285 return split_whitespace(self,list,maxcount);
5287 else if (substring->length == 1)
5288 return split_char(self,list,substring->str[0],maxcount);
5290 else if (substring->length == 0) {
5291 Py_DECREF(list);
5292 PyErr_SetString(PyExc_ValueError, "empty separator");
5293 return NULL;
5295 else
5296 return split_substring(self,list,substring,maxcount);
5299 static
5300 PyObject *rsplit(PyUnicodeObject *self,
5301 PyUnicodeObject *substring,
5302 Py_ssize_t maxcount)
5304 PyObject *list;
5306 if (maxcount < 0)
5307 maxcount = PY_SSIZE_T_MAX;
5309 list = PyList_New(0);
5310 if (!list)
5311 return NULL;
5313 if (substring == NULL)
5314 return rsplit_whitespace(self,list,maxcount);
5316 else if (substring->length == 1)
5317 return rsplit_char(self,list,substring->str[0],maxcount);
5319 else if (substring->length == 0) {
5320 Py_DECREF(list);
5321 PyErr_SetString(PyExc_ValueError, "empty separator");
5322 return NULL;
5324 else
5325 return rsplit_substring(self,list,substring,maxcount);
5328 static
5329 PyObject *replace(PyUnicodeObject *self,
5330 PyUnicodeObject *str1,
5331 PyUnicodeObject *str2,
5332 Py_ssize_t maxcount)
5334 PyUnicodeObject *u;
5336 if (maxcount < 0)
5337 maxcount = PY_SSIZE_T_MAX;
5339 if (str1->length == str2->length) {
5340 /* same length */
5341 Py_ssize_t i;
5342 if (str1->length == 1) {
5343 /* replace characters */
5344 Py_UNICODE u1, u2;
5345 if (!findchar(self->str, self->length, str1->str[0]))
5346 goto nothing;
5347 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5348 if (!u)
5349 return NULL;
5350 Py_UNICODE_COPY(u->str, self->str, self->length);
5351 u1 = str1->str[0];
5352 u2 = str2->str[0];
5353 for (i = 0; i < u->length; i++)
5354 if (u->str[i] == u1) {
5355 if (--maxcount < 0)
5356 break;
5357 u->str[i] = u2;
5359 } else {
5360 i = fastsearch(
5361 self->str, self->length, str1->str, str1->length, FAST_SEARCH
5363 if (i < 0)
5364 goto nothing;
5365 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5366 if (!u)
5367 return NULL;
5368 Py_UNICODE_COPY(u->str, self->str, self->length);
5369 while (i <= self->length - str1->length)
5370 if (Py_UNICODE_MATCH(self, i, str1)) {
5371 if (--maxcount < 0)
5372 break;
5373 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5374 i += str1->length;
5375 } else
5376 i++;
5378 } else {
5380 Py_ssize_t n, i, j, e;
5381 Py_ssize_t product, new_size, delta;
5382 Py_UNICODE *p;
5384 /* replace strings */
5385 n = stringlib_count(self->str, self->length, str1->str, str1->length);
5386 if (n > maxcount)
5387 n = maxcount;
5388 if (n == 0)
5389 goto nothing;
5390 /* new_size = self->length + n * (str2->length - str1->length)); */
5391 delta = (str2->length - str1->length);
5392 if (delta == 0) {
5393 new_size = self->length;
5394 } else {
5395 product = n * (str2->length - str1->length);
5396 if ((product / (str2->length - str1->length)) != n) {
5397 PyErr_SetString(PyExc_OverflowError,
5398 "replace string is too long");
5399 return NULL;
5401 new_size = self->length + product;
5402 if (new_size < 0) {
5403 PyErr_SetString(PyExc_OverflowError,
5404 "replace string is too long");
5405 return NULL;
5408 u = _PyUnicode_New(new_size);
5409 if (!u)
5410 return NULL;
5411 i = 0;
5412 p = u->str;
5413 e = self->length - str1->length;
5414 if (str1->length > 0) {
5415 while (n-- > 0) {
5416 /* look for next match */
5417 j = i;
5418 while (j <= e) {
5419 if (Py_UNICODE_MATCH(self, j, str1))
5420 break;
5421 j++;
5423 if (j > i) {
5424 if (j > e)
5425 break;
5426 /* copy unchanged part [i:j] */
5427 Py_UNICODE_COPY(p, self->str+i, j-i);
5428 p += j - i;
5430 /* copy substitution string */
5431 if (str2->length > 0) {
5432 Py_UNICODE_COPY(p, str2->str, str2->length);
5433 p += str2->length;
5435 i = j + str1->length;
5437 if (i < self->length)
5438 /* copy tail [i:] */
5439 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5440 } else {
5441 /* interleave */
5442 while (n > 0) {
5443 Py_UNICODE_COPY(p, str2->str, str2->length);
5444 p += str2->length;
5445 if (--n <= 0)
5446 break;
5447 *p++ = self->str[i++];
5449 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5452 return (PyObject *) u;
5454 nothing:
5455 /* nothing to replace; return original string (when possible) */
5456 if (PyUnicode_CheckExact(self)) {
5457 Py_INCREF(self);
5458 return (PyObject *) self;
5460 return PyUnicode_FromUnicode(self->str, self->length);
5463 /* --- Unicode Object Methods --------------------------------------------- */
5465 PyDoc_STRVAR(title__doc__,
5466 "S.title() -> unicode\n\
5468 Return a titlecased version of S, i.e. words start with title case\n\
5469 characters, all remaining cased characters have lower case.");
5471 static PyObject*
5472 unicode_title(PyUnicodeObject *self)
5474 return fixup(self, fixtitle);
5477 PyDoc_STRVAR(capitalize__doc__,
5478 "S.capitalize() -> unicode\n\
5480 Return a capitalized version of S, i.e. make the first character\n\
5481 have upper case.");
5483 static PyObject*
5484 unicode_capitalize(PyUnicodeObject *self)
5486 return fixup(self, fixcapitalize);
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out) implementation of S.capwords(): split on
   whitespace, capitalize every word, then join the words again with the
   default separator. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t k;

    /* Break the string up into its words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Replace each word by its capitalized form */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                       fixcapitalize);
        if (result == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, result);
    }

    /* Glue the words back together */
    result = PyUnicode_Join(NULL, words);

 onError:
    /* result is NULL here when a word could not be capitalized */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5527 /* Argument converter. Coerces to a single unicode character */
5529 static int
5530 convert_uc(PyObject *obj, void *addr)
5532 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5533 PyObject *uniobj;
5534 Py_UNICODE *unistr;
5536 uniobj = PyUnicode_FromObject(obj);
5537 if (uniobj == NULL) {
5538 PyErr_SetString(PyExc_TypeError,
5539 "The fill character cannot be converted to Unicode");
5540 return 0;
5542 if (PyUnicode_GET_SIZE(uniobj) != 1) {
5543 PyErr_SetString(PyExc_TypeError,
5544 "The fill character must be exactly one character long");
5545 Py_DECREF(uniobj);
5546 return 0;
5548 unistr = PyUnicode_AS_UNICODE(uniobj);
5549 *fillcharloc = unistr[0];
5550 Py_DECREF(uniobj);
5551 return 1;
5554 PyDoc_STRVAR(center__doc__,
5555 "S.center(width[, fillchar]) -> unicode\n\
5557 Return S centered in a Unicode string of length width. Padding is\n\
5558 done using the specified fill character (default is a space)");
5560 static PyObject *
5561 unicode_center(PyUnicodeObject *self, PyObject *args)
5563 Py_ssize_t marg, left;
5564 Py_ssize_t width;
5565 Py_UNICODE fillchar = ' ';
5567 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
5568 return NULL;
5570 if (self->length >= width && PyUnicode_CheckExact(self)) {
5571 Py_INCREF(self);
5572 return (PyObject*) self;
5575 marg = width - self->length;
5576 left = marg / 2 + (marg & width & 1);
5578 return (PyObject*) pad(self, left, marg - left, fillchar);
5581 #if 0
5583 /* This code should go into some future Unicode collation support
5584 module. The basic comparison should compare ordinals on a naive
5585 basis (this is what Java does and thus JPython too). */
5587 /* speedy UTF-16 code point order comparison */
5588 /* gleaned from: */
5589 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5591 static short utf16Fixup[32] =
5593 0, 0, 0, 0, 0, 0, 0, 0,
5594 0, 0, 0, 0, 0, 0, 0, 0,
5595 0, 0, 0, 0, 0, 0, 0, 0,
5596 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
5599 static int
5600 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5602 Py_ssize_t len1, len2;
5604 Py_UNICODE *s1 = str1->str;
5605 Py_UNICODE *s2 = str2->str;
5607 len1 = str1->length;
5608 len2 = str2->length;
5610 while (len1 > 0 && len2 > 0) {
5611 Py_UNICODE c1, c2;
5613 c1 = *s1++;
5614 c2 = *s2++;
5616 if (c1 > (1<<11) * 26)
5617 c1 += utf16Fixup[c1>>11];
5618 if (c2 > (1<<11) * 26)
5619 c2 += utf16Fixup[c2>>11];
5620 /* now c1 and c2 are in UTF-32-compatible order */
5622 if (c1 != c2)
5623 return (c1 < c2) ? -1 : 1;
5625 len1--; len2--;
5628 return (len1 < len2) ? -1 : (len1 != len2);
5631 #else
5633 static int
5634 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
5636 register Py_ssize_t len1, len2;
5638 Py_UNICODE *s1 = str1->str;
5639 Py_UNICODE *s2 = str2->str;
5641 len1 = str1->length;
5642 len2 = str2->length;
5644 while (len1 > 0 && len2 > 0) {
5645 Py_UNICODE c1, c2;
5647 c1 = *s1++;
5648 c2 = *s2++;
5650 if (c1 != c2)
5651 return (c1 < c2) ? -1 : 1;
5653 len1--; len2--;
5656 return (len1 < len2) ? -1 : (len1 != len2);
5659 #endif
5661 int PyUnicode_Compare(PyObject *left,
5662 PyObject *right)
5664 PyUnicodeObject *u = NULL, *v = NULL;
5665 int result;
5667 /* Coerce the two arguments */
5668 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5669 if (u == NULL)
5670 goto onError;
5671 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5672 if (v == NULL)
5673 goto onError;
5675 /* Shortcut for empty or interned objects */
5676 if (v == u) {
5677 Py_DECREF(u);
5678 Py_DECREF(v);
5679 return 0;
5682 result = unicode_compare(u, v);
5684 Py_DECREF(u);
5685 Py_DECREF(v);
5686 return result;
5688 onError:
5689 Py_XDECREF(u);
5690 Py_XDECREF(v);
5691 return -1;
5694 PyObject *PyUnicode_RichCompare(PyObject *left,
5695 PyObject *right,
5696 int op)
5698 int result;
5700 result = PyUnicode_Compare(left, right);
5701 if (result == -1 && PyErr_Occurred())
5702 goto onError;
5704 /* Convert the return value to a Boolean */
5705 switch (op) {
5706 case Py_EQ:
5707 result = (result == 0);
5708 break;
5709 case Py_NE:
5710 result = (result != 0);
5711 break;
5712 case Py_LE:
5713 result = (result <= 0);
5714 break;
5715 case Py_GE:
5716 result = (result >= 0);
5717 break;
5718 case Py_LT:
5719 result = (result == -1);
5720 break;
5721 case Py_GT:
5722 result = (result == 1);
5723 break;
5725 return PyBool_FromLong(result);
5727 onError:
5729 /* Standard case
5731 Type errors mean that PyUnicode_FromObject() could not convert
5732 one of the arguments (usually the right hand side) to Unicode,
5733 ie. we can't handle the comparison request. However, it is
5734 possible that the other object knows a comparison method, which
5735 is why we return Py_NotImplemented to give the other object a
5736 chance.
5739 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
5740 PyErr_Clear();
5741 Py_INCREF(Py_NotImplemented);
5742 return Py_NotImplemented;
5744 if (op != Py_EQ && op != Py_NE)
5745 return NULL;
5747 /* Equality comparison.
5749 This is a special case: we silence any PyExc_UnicodeDecodeError
5750 and instead turn it into a PyErr_UnicodeWarning.
5753 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
5754 return NULL;
5755 PyErr_Clear();
5756 if (PyErr_Warn(PyExc_UnicodeWarning,
5757 (op == Py_EQ) ?
5758 "Unicode equal comparison "
5759 "failed to convert both arguments to Unicode - "
5760 "interpreting them as being unequal" :
5761 "Unicode unequal comparison "
5762 "failed to convert both arguments to Unicode - "
5763 "interpreting them as being unequal"
5764 ) < 0)
5765 return NULL;
5766 result = (op == Py_NE);
5767 return PyBool_FromLong(result);
5770 int PyUnicode_Contains(PyObject *container,
5771 PyObject *element)
5773 PyObject *str, *sub;
5774 int result;
5776 /* Coerce the two arguments */
5777 sub = PyUnicode_FromObject(element);
5778 if (!sub) {
5779 PyErr_SetString(PyExc_TypeError,
5780 "'in <string>' requires string as left operand");
5781 return -1;
5784 str = PyUnicode_FromObject(container);
5785 if (!str) {
5786 Py_DECREF(sub);
5787 return -1;
5790 result = stringlib_contains_obj(str, sub);
5792 Py_DECREF(str);
5793 Py_DECREF(sub);
5795 return result;
5798 /* Concat to string or Unicode object giving a new Unicode object. */
5800 PyObject *PyUnicode_Concat(PyObject *left,
5801 PyObject *right)
5803 PyUnicodeObject *u = NULL, *v = NULL, *w;
5805 /* Coerce the two arguments */
5806 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5807 if (u == NULL)
5808 goto onError;
5809 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5810 if (v == NULL)
5811 goto onError;
5813 /* Shortcuts */
5814 if (v == unicode_empty) {
5815 Py_DECREF(v);
5816 return (PyObject *)u;
5818 if (u == unicode_empty) {
5819 Py_DECREF(u);
5820 return (PyObject *)v;
5823 /* Concat the two Unicode strings */
5824 w = _PyUnicode_New(u->length + v->length);
5825 if (w == NULL)
5826 goto onError;
5827 Py_UNICODE_COPY(w->str, u->str, u->length);
5828 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5830 Py_DECREF(u);
5831 Py_DECREF(v);
5832 return (PyObject *)w;
5834 onError:
5835 Py_XDECREF(u);
5836 Py_XDECREF(v);
5837 return NULL;
5840 PyDoc_STRVAR(count__doc__,
5841 "S.count(sub[, start[, end]]) -> int\n\
5843 Return the number of non-overlapping occurrences of substring sub in\n\
5844 Unicode string S[start:end]. Optional arguments start and end are\n\
5845 interpreted as in slice notation.");
5847 static PyObject *
5848 unicode_count(PyUnicodeObject *self, PyObject *args)
5850 PyUnicodeObject *substring;
5851 Py_ssize_t start = 0;
5852 Py_ssize_t end = PY_SSIZE_T_MAX;
5853 PyObject *result;
5855 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5856 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5857 return NULL;
5859 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5860 (PyObject *)substring);
5861 if (substring == NULL)
5862 return NULL;
5864 FIX_START_END(self);
5866 result = PyInt_FromSsize_t(
5867 stringlib_count(self->str + start, end - start,
5868 substring->str, substring->length)
5871 Py_DECREF(substring);
5873 return result;
5876 PyDoc_STRVAR(encode__doc__,
5877 "S.encode([encoding[,errors]]) -> string or unicode\n\
5879 Encodes S using the codec registered for encoding. encoding defaults\n\
5880 to the default encoding. errors may be given to set a different error\n\
5881 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5882 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5883 'xmlcharrefreplace' as well as any other name registered with\n\
5884 codecs.register_error that can handle UnicodeEncodeErrors.");
5886 static PyObject *
5887 unicode_encode(PyUnicodeObject *self, PyObject *args)
5889 char *encoding = NULL;
5890 char *errors = NULL;
5891 PyObject *v;
5893 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5894 return NULL;
5895 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
5896 if (v == NULL)
5897 goto onError;
5898 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5899 PyErr_Format(PyExc_TypeError,
5900 "encoder did not return a string/unicode object "
5901 "(type=%.400s)",
5902 Py_Type(v)->tp_name);
5903 Py_DECREF(v);
5904 return NULL;
5906 return v;
5908 onError:
5909 return NULL;
5912 PyDoc_STRVAR(decode__doc__,
5913 "S.decode([encoding[,errors]]) -> string or unicode\n\
5915 Decodes S using the codec registered for encoding. encoding defaults\n\
5916 to the default encoding. errors may be given to set a different error\n\
5917 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5918 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5919 as well as any other name registerd with codecs.register_error that is\n\
5920 able to handle UnicodeDecodeErrors.");
5922 static PyObject *
5923 unicode_decode(PyUnicodeObject *self, PyObject *args)
5925 char *encoding = NULL;
5926 char *errors = NULL;
5927 PyObject *v;
5929 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5930 return NULL;
5931 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
5932 if (v == NULL)
5933 goto onError;
5934 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5935 PyErr_Format(PyExc_TypeError,
5936 "decoder did not return a string/unicode object "
5937 "(type=%.400s)",
5938 Py_Type(v)->tp_name);
5939 Py_DECREF(v);
5940 return NULL;
5942 return v;
5944 onError:
5945 return NULL;
5948 PyDoc_STRVAR(expandtabs__doc__,
5949 "S.expandtabs([tabsize]) -> unicode\n\
5951 Return a copy of S where all tab characters are expanded using spaces.\n\
5952 If tabsize is not given, a tab size of 8 characters is assumed.");
5954 static PyObject*
5955 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5957 Py_UNICODE *e;
5958 Py_UNICODE *p;
5959 Py_UNICODE *q;
5960 Py_ssize_t i, j, old_j;
5961 PyUnicodeObject *u;
5962 int tabsize = 8;
5964 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5965 return NULL;
5967 /* First pass: determine size of output string */
5968 i = j = old_j = 0;
5969 e = self->str + self->length;
5970 for (p = self->str; p < e; p++)
5971 if (*p == '\t') {
5972 if (tabsize > 0) {
5973 j += tabsize - (j % tabsize);
5974 if (old_j > j) {
5975 PyErr_SetString(PyExc_OverflowError,
5976 "new string is too long");
5977 return NULL;
5979 old_j = j;
5982 else {
5983 j++;
5984 if (*p == '\n' || *p == '\r') {
5985 i += j;
5986 old_j = j = 0;
5987 if (i < 0) {
5988 PyErr_SetString(PyExc_OverflowError,
5989 "new string is too long");
5990 return NULL;
5995 if ((i + j) < 0) {
5996 PyErr_SetString(PyExc_OverflowError, "new string is too long");
5997 return NULL;
6000 /* Second pass: create output string and fill it */
6001 u = _PyUnicode_New(i + j);
6002 if (!u)
6003 return NULL;
6005 j = 0;
6006 q = u->str;
6008 for (p = self->str; p < e; p++)
6009 if (*p == '\t') {
6010 if (tabsize > 0) {
6011 i = tabsize - (j % tabsize);
6012 j += i;
6013 while (i--)
6014 *q++ = ' ';
6017 else {
6018 j++;
6019 *q++ = *p;
6020 if (*p == '\n' || *p == '\r')
6021 j = 0;
6024 return (PyObject*) u;
6027 PyDoc_STRVAR(find__doc__,
6028 "S.find(sub [,start [,end]]) -> int\n\
6030 Return the lowest index in S where substring sub is found,\n\
6031 such that sub is contained within s[start:end]. Optional\n\
6032 arguments start and end are interpreted as in slice notation.\n\
6034 Return -1 on failure.");
6036 static PyObject *
6037 unicode_find(PyUnicodeObject *self, PyObject *args)
6039 PyObject *substring;
6040 Py_ssize_t start = 0;
6041 Py_ssize_t end = PY_SSIZE_T_MAX;
6042 Py_ssize_t result;
6044 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
6045 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6046 return NULL;
6047 substring = PyUnicode_FromObject(substring);
6048 if (!substring)
6049 return NULL;
6051 result = stringlib_find_slice(
6052 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6053 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6054 start, end
6057 Py_DECREF(substring);
6059 return PyInt_FromSsize_t(result);
6062 static PyObject *
6063 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6065 if (index < 0 || index >= self->length) {
6066 PyErr_SetString(PyExc_IndexError, "string index out of range");
6067 return NULL;
6070 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6073 static long
6074 unicode_hash(PyUnicodeObject *self)
6076 /* Since Unicode objects compare equal to their ASCII string
6077 counterparts, they should use the individual character values
6078 as basis for their hash value. This is needed to assure that
6079 strings and Unicode objects behave in the same way as
6080 dictionary keys. */
6082 register Py_ssize_t len;
6083 register Py_UNICODE *p;
6084 register long x;
6086 if (self->hash != -1)
6087 return self->hash;
6088 len = PyUnicode_GET_SIZE(self);
6089 p = PyUnicode_AS_UNICODE(self);
6090 x = *p << 7;
6091 while (--len >= 0)
6092 x = (1000003*x) ^ *p++;
6093 x ^= PyUnicode_GET_SIZE(self);
6094 if (x == -1)
6095 x = -2;
6096 self->hash = x;
6097 return x;
6100 PyDoc_STRVAR(index__doc__,
6101 "S.index(sub [,start [,end]]) -> int\n\
6103 Like S.find() but raise ValueError when the substring is not found.");
6105 static PyObject *
6106 unicode_index(PyUnicodeObject *self, PyObject *args)
6108 Py_ssize_t result;
6109 PyObject *substring;
6110 Py_ssize_t start = 0;
6111 Py_ssize_t end = PY_SSIZE_T_MAX;
6113 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
6114 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6115 return NULL;
6116 substring = PyUnicode_FromObject(substring);
6117 if (!substring)
6118 return NULL;
6120 result = stringlib_find_slice(
6121 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6122 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6123 start, end
6126 Py_DECREF(substring);
6128 if (result < 0) {
6129 PyErr_SetString(PyExc_ValueError, "substring not found");
6130 return NULL;
6133 return PyInt_FromSsize_t(result);
6136 PyDoc_STRVAR(islower__doc__,
6137 "S.islower() -> bool\n\
6139 Return True if all cased characters in S are lowercase and there is\n\
6140 at least one cased character in S, False otherwise.");
6142 static PyObject*
6143 unicode_islower(PyUnicodeObject *self)
6145 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6146 register const Py_UNICODE *e;
6147 int cased;
6149 /* Shortcut for single character strings */
6150 if (PyUnicode_GET_SIZE(self) == 1)
6151 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6153 /* Special case for empty strings */
6154 if (PyUnicode_GET_SIZE(self) == 0)
6155 return PyBool_FromLong(0);
6157 e = p + PyUnicode_GET_SIZE(self);
6158 cased = 0;
6159 for (; p < e; p++) {
6160 register const Py_UNICODE ch = *p;
6162 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6163 return PyBool_FromLong(0);
6164 else if (!cased && Py_UNICODE_ISLOWER(ch))
6165 cased = 1;
6167 return PyBool_FromLong(cased);
6170 PyDoc_STRVAR(isupper__doc__,
6171 "S.isupper() -> bool\n\
6173 Return True if all cased characters in S are uppercase and there is\n\
6174 at least one cased character in S, False otherwise.");
6176 static PyObject*
6177 unicode_isupper(PyUnicodeObject *self)
6179 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6180 register const Py_UNICODE *e;
6181 int cased;
6183 /* Shortcut for single character strings */
6184 if (PyUnicode_GET_SIZE(self) == 1)
6185 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6187 /* Special case for empty strings */
6188 if (PyUnicode_GET_SIZE(self) == 0)
6189 return PyBool_FromLong(0);
6191 e = p + PyUnicode_GET_SIZE(self);
6192 cased = 0;
6193 for (; p < e; p++) {
6194 register const Py_UNICODE ch = *p;
6196 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6197 return PyBool_FromLong(0);
6198 else if (!cased && Py_UNICODE_ISUPPER(ch))
6199 cased = 1;
6201 return PyBool_FromLong(cased);
6204 PyDoc_STRVAR(istitle__doc__,
6205 "S.istitle() -> bool\n\
6207 Return True if S is a titlecased string and there is at least one\n\
6208 character in S, i.e. upper- and titlecase characters may only\n\
6209 follow uncased characters and lowercase characters only cased ones.\n\
6210 Return False otherwise.");
6212 static PyObject*
6213 unicode_istitle(PyUnicodeObject *self)
6215 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6216 register const Py_UNICODE *e;
6217 int cased, previous_is_cased;
6219 /* Shortcut for single character strings */
6220 if (PyUnicode_GET_SIZE(self) == 1)
6221 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6222 (Py_UNICODE_ISUPPER(*p) != 0));
6224 /* Special case for empty strings */
6225 if (PyUnicode_GET_SIZE(self) == 0)
6226 return PyBool_FromLong(0);
6228 e = p + PyUnicode_GET_SIZE(self);
6229 cased = 0;
6230 previous_is_cased = 0;
6231 for (; p < e; p++) {
6232 register const Py_UNICODE ch = *p;
6234 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6235 if (previous_is_cased)
6236 return PyBool_FromLong(0);
6237 previous_is_cased = 1;
6238 cased = 1;
6240 else if (Py_UNICODE_ISLOWER(ch)) {
6241 if (!previous_is_cased)
6242 return PyBool_FromLong(0);
6243 previous_is_cased = 1;
6244 cased = 1;
6246 else
6247 previous_is_cased = 0;
6249 return PyBool_FromLong(cased);
6252 PyDoc_STRVAR(isspace__doc__,
6253 "S.isspace() -> bool\n\
6255 Return True if all characters in S are whitespace\n\
6256 and there is at least one character in S, False otherwise.");
6258 static PyObject*
6259 unicode_isspace(PyUnicodeObject *self)
6261 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6262 register const Py_UNICODE *e;
6264 /* Shortcut for single character strings */
6265 if (PyUnicode_GET_SIZE(self) == 1 &&
6266 Py_UNICODE_ISSPACE(*p))
6267 return PyBool_FromLong(1);
6269 /* Special case for empty strings */
6270 if (PyUnicode_GET_SIZE(self) == 0)
6271 return PyBool_FromLong(0);
6273 e = p + PyUnicode_GET_SIZE(self);
6274 for (; p < e; p++) {
6275 if (!Py_UNICODE_ISSPACE(*p))
6276 return PyBool_FromLong(0);
6278 return PyBool_FromLong(1);
6281 PyDoc_STRVAR(isalpha__doc__,
6282 "S.isalpha() -> bool\n\
6284 Return True if all characters in S are alphabetic\n\
6285 and there is at least one character in S, False otherwise.");
6287 static PyObject*
6288 unicode_isalpha(PyUnicodeObject *self)
6290 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6291 register const Py_UNICODE *e;
6293 /* Shortcut for single character strings */
6294 if (PyUnicode_GET_SIZE(self) == 1 &&
6295 Py_UNICODE_ISALPHA(*p))
6296 return PyBool_FromLong(1);
6298 /* Special case for empty strings */
6299 if (PyUnicode_GET_SIZE(self) == 0)
6300 return PyBool_FromLong(0);
6302 e = p + PyUnicode_GET_SIZE(self);
6303 for (; p < e; p++) {
6304 if (!Py_UNICODE_ISALPHA(*p))
6305 return PyBool_FromLong(0);
6307 return PyBool_FromLong(1);
6310 PyDoc_STRVAR(isalnum__doc__,
6311 "S.isalnum() -> bool\n\
6313 Return True if all characters in S are alphanumeric\n\
6314 and there is at least one character in S, False otherwise.");
6316 static PyObject*
6317 unicode_isalnum(PyUnicodeObject *self)
6319 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6320 register const Py_UNICODE *e;
6322 /* Shortcut for single character strings */
6323 if (PyUnicode_GET_SIZE(self) == 1 &&
6324 Py_UNICODE_ISALNUM(*p))
6325 return PyBool_FromLong(1);
6327 /* Special case for empty strings */
6328 if (PyUnicode_GET_SIZE(self) == 0)
6329 return PyBool_FromLong(0);
6331 e = p + PyUnicode_GET_SIZE(self);
6332 for (; p < e; p++) {
6333 if (!Py_UNICODE_ISALNUM(*p))
6334 return PyBool_FromLong(0);
6336 return PyBool_FromLong(1);
6339 PyDoc_STRVAR(isdecimal__doc__,
6340 "S.isdecimal() -> bool\n\
6342 Return True if there are only decimal characters in S,\n\
6343 False otherwise.");
6345 static PyObject*
6346 unicode_isdecimal(PyUnicodeObject *self)
6348 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6349 register const Py_UNICODE *e;
6351 /* Shortcut for single character strings */
6352 if (PyUnicode_GET_SIZE(self) == 1 &&
6353 Py_UNICODE_ISDECIMAL(*p))
6354 return PyBool_FromLong(1);
6356 /* Special case for empty strings */
6357 if (PyUnicode_GET_SIZE(self) == 0)
6358 return PyBool_FromLong(0);
6360 e = p + PyUnicode_GET_SIZE(self);
6361 for (; p < e; p++) {
6362 if (!Py_UNICODE_ISDECIMAL(*p))
6363 return PyBool_FromLong(0);
6365 return PyBool_FromLong(1);
6368 PyDoc_STRVAR(isdigit__doc__,
6369 "S.isdigit() -> bool\n\
6371 Return True if all characters in S are digits\n\
6372 and there is at least one character in S, False otherwise.");
6374 static PyObject*
6375 unicode_isdigit(PyUnicodeObject *self)
6377 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6378 register const Py_UNICODE *e;
6380 /* Shortcut for single character strings */
6381 if (PyUnicode_GET_SIZE(self) == 1 &&
6382 Py_UNICODE_ISDIGIT(*p))
6383 return PyBool_FromLong(1);
6385 /* Special case for empty strings */
6386 if (PyUnicode_GET_SIZE(self) == 0)
6387 return PyBool_FromLong(0);
6389 e = p + PyUnicode_GET_SIZE(self);
6390 for (; p < e; p++) {
6391 if (!Py_UNICODE_ISDIGIT(*p))
6392 return PyBool_FromLong(0);
6394 return PyBool_FromLong(1);
6397 PyDoc_STRVAR(isnumeric__doc__,
6398 "S.isnumeric() -> bool\n\
6400 Return True if there are only numeric characters in S,\n\
6401 False otherwise.");
6403 static PyObject*
6404 unicode_isnumeric(PyUnicodeObject *self)
6406 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6407 register const Py_UNICODE *e;
6409 /* Shortcut for single character strings */
6410 if (PyUnicode_GET_SIZE(self) == 1 &&
6411 Py_UNICODE_ISNUMERIC(*p))
6412 return PyBool_FromLong(1);
6414 /* Special case for empty strings */
6415 if (PyUnicode_GET_SIZE(self) == 0)
6416 return PyBool_FromLong(0);
6418 e = p + PyUnicode_GET_SIZE(self);
6419 for (; p < e; p++) {
6420 if (!Py_UNICODE_ISNUMERIC(*p))
6421 return PyBool_FromLong(0);
6423 return PyBool_FromLong(1);
6426 PyDoc_STRVAR(join__doc__,
6427 "S.join(sequence) -> unicode\n\
6429 Return a string which is the concatenation of the strings in the\n\
6430 sequence. The separator between elements is S.");
6432 static PyObject*
6433 unicode_join(PyObject *self, PyObject *data)
6435 return PyUnicode_Join(self, data);
6438 static Py_ssize_t
6439 unicode_length(PyUnicodeObject *self)
6441 return self->length;
6444 PyDoc_STRVAR(ljust__doc__,
6445 "S.ljust(width[, fillchar]) -> int\n\
6447 Return S left justified in a Unicode string of length width. Padding is\n\
6448 done using the specified fill character (default is a space).");
6450 static PyObject *
6451 unicode_ljust(PyUnicodeObject *self, PyObject *args)
6453 Py_ssize_t width;
6454 Py_UNICODE fillchar = ' ';
6456 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6457 return NULL;
6459 if (self->length >= width && PyUnicode_CheckExact(self)) {
6460 Py_INCREF(self);
6461 return (PyObject*) self;
6464 return (PyObject*) pad(self, 0, width - self->length, fillchar);
6467 PyDoc_STRVAR(lower__doc__,
6468 "S.lower() -> unicode\n\
6470 Return a copy of the string S converted to lowercase.");
6472 static PyObject*
6473 unicode_lower(PyUnicodeObject *self)
6475 return fixup(self, fixlower);
/* Selector values for the strip helpers below */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selector values above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the format string with its leading
   "|O:" (three characters) skipped */
#define STRIPNAME(i) (stripformat[i]+3)
6487 /* externally visible for str.strip(unicode) */
6488 PyObject *
6489 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6491 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6492 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6493 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6494 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6495 Py_ssize_t i, j;
6497 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6499 i = 0;
6500 if (striptype != RIGHTSTRIP) {
6501 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6502 i++;
6506 j = len;
6507 if (striptype != LEFTSTRIP) {
6508 do {
6509 j--;
6510 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6511 j++;
6514 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6515 Py_INCREF(self);
6516 return (PyObject*)self;
6518 else
6519 return PyUnicode_FromUnicode(s+i, j-i);
6523 static PyObject *
6524 do_strip(PyUnicodeObject *self, int striptype)
6526 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6527 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6529 i = 0;
6530 if (striptype != RIGHTSTRIP) {
6531 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6532 i++;
6536 j = len;
6537 if (striptype != LEFTSTRIP) {
6538 do {
6539 j--;
6540 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6541 j++;
6544 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6545 Py_INCREF(self);
6546 return (PyObject*)self;
6548 else
6549 return PyUnicode_FromUnicode(s+i, j-i);
6553 static PyObject *
6554 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
6556 PyObject *sep = NULL;
6558 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
6559 return NULL;
6561 if (sep != NULL && sep != Py_None) {
6562 if (PyUnicode_Check(sep))
6563 return _PyUnicode_XStrip(self, striptype, sep);
6564 else if (PyString_Check(sep)) {
6565 PyObject *res;
6566 sep = PyUnicode_FromObject(sep);
6567 if (sep==NULL)
6568 return NULL;
6569 res = _PyUnicode_XStrip(self, striptype, sep);
6570 Py_DECREF(sep);
6571 return res;
6573 else {
6574 PyErr_Format(PyExc_TypeError,
6575 "%s arg must be None, unicode or str",
6576 STRIPNAME(striptype));
6577 return NULL;
6581 return do_strip(self, striptype);
6585 PyDoc_STRVAR(strip__doc__,
6586 "S.strip([chars]) -> unicode\n\
6588 Return a copy of the string S with leading and trailing\n\
6589 whitespace removed.\n\
6590 If chars is given and not None, remove characters in chars instead.\n\
6591 If chars is a str, it will be converted to unicode before stripping");
6593 static PyObject *
6594 unicode_strip(PyUnicodeObject *self, PyObject *args)
6596 if (PyTuple_GET_SIZE(args) == 0)
6597 return do_strip(self, BOTHSTRIP); /* Common case */
6598 else
6599 return do_argstrip(self, BOTHSTRIP, args);
6603 PyDoc_STRVAR(lstrip__doc__,
6604 "S.lstrip([chars]) -> unicode\n\
6606 Return a copy of the string S with leading whitespace removed.\n\
6607 If chars is given and not None, remove characters in chars instead.\n\
6608 If chars is a str, it will be converted to unicode before stripping");
6610 static PyObject *
6611 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
6613 if (PyTuple_GET_SIZE(args) == 0)
6614 return do_strip(self, LEFTSTRIP); /* Common case */
6615 else
6616 return do_argstrip(self, LEFTSTRIP, args);
6620 PyDoc_STRVAR(rstrip__doc__,
6621 "S.rstrip([chars]) -> unicode\n\
6623 Return a copy of the string S with trailing whitespace removed.\n\
6624 If chars is given and not None, remove characters in chars instead.\n\
6625 If chars is a str, it will be converted to unicode before stripping");
6627 static PyObject *
6628 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
6630 if (PyTuple_GET_SIZE(args) == 0)
6631 return do_strip(self, RIGHTSTRIP); /* Common case */
6632 else
6633 return do_argstrip(self, RIGHTSTRIP, args);
6637 static PyObject*
6638 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
6640 PyUnicodeObject *u;
6641 Py_UNICODE *p;
6642 Py_ssize_t nchars;
6643 size_t nbytes;
6645 if (len < 0)
6646 len = 0;
6648 if (len == 1 && PyUnicode_CheckExact(str)) {
6649 /* no repeat, return original string */
6650 Py_INCREF(str);
6651 return (PyObject*) str;
6654 /* ensure # of chars needed doesn't overflow int and # of bytes
6655 * needed doesn't overflow size_t
6657 nchars = len * str->length;
6658 if (len && nchars / len != str->length) {
6659 PyErr_SetString(PyExc_OverflowError,
6660 "repeated string is too long");
6661 return NULL;
6663 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
6664 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
6665 PyErr_SetString(PyExc_OverflowError,
6666 "repeated string is too long");
6667 return NULL;
6669 u = _PyUnicode_New(nchars);
6670 if (!u)
6671 return NULL;
6673 p = u->str;
6675 if (str->length == 1 && len > 0) {
6676 Py_UNICODE_FILL(p, str->str[0], len);
6677 } else {
6678 Py_ssize_t done = 0; /* number of characters copied this far */
6679 if (done < nchars) {
6680 Py_UNICODE_COPY(p, str->str, str->length);
6681 done = str->length;
6683 while (done < nchars) {
6684 int n = (done <= nchars-done) ? done : nchars-done;
6685 Py_UNICODE_COPY(p+done, p, n);
6686 done += n;
6690 return (PyObject*) u;
6693 PyObject *PyUnicode_Replace(PyObject *obj,
6694 PyObject *subobj,
6695 PyObject *replobj,
6696 Py_ssize_t maxcount)
6698 PyObject *self;
6699 PyObject *str1;
6700 PyObject *str2;
6701 PyObject *result;
6703 self = PyUnicode_FromObject(obj);
6704 if (self == NULL)
6705 return NULL;
6706 str1 = PyUnicode_FromObject(subobj);
6707 if (str1 == NULL) {
6708 Py_DECREF(self);
6709 return NULL;
6711 str2 = PyUnicode_FromObject(replobj);
6712 if (str2 == NULL) {
6713 Py_DECREF(self);
6714 Py_DECREF(str1);
6715 return NULL;
6717 result = replace((PyUnicodeObject *)self,
6718 (PyUnicodeObject *)str1,
6719 (PyUnicodeObject *)str2,
6720 maxcount);
6721 Py_DECREF(self);
6722 Py_DECREF(str1);
6723 Py_DECREF(str2);
6724 return result;
6727 PyDoc_STRVAR(replace__doc__,
6728 "S.replace (old, new[, maxsplit]) -> unicode\n\
6730 Return a copy of S with all occurrences of substring\n\
6731 old replaced by new. If the optional argument maxsplit is\n\
6732 given, only the first maxsplit occurrences are replaced.");
6734 static PyObject*
6735 unicode_replace(PyUnicodeObject *self, PyObject *args)
6737 PyUnicodeObject *str1;
6738 PyUnicodeObject *str2;
6739 Py_ssize_t maxcount = -1;
6740 PyObject *result;
6742 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
6743 return NULL;
6744 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
6745 if (str1 == NULL)
6746 return NULL;
6747 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
6748 if (str2 == NULL) {
6749 Py_DECREF(str1);
6750 return NULL;
6753 result = replace(self, str1, str2, maxcount);
6755 Py_DECREF(str1);
6756 Py_DECREF(str2);
6757 return result;
/* Builds the result by delegating to unicodeescape_string(), passing
   the object's character buffer and its size.
   NOTE(review): the remainder of this call (any further argument and
   the closing parenthesis) is not visible in this excerpt; confirm
   against the full file before changing this function. */
6760 static
6761 PyObject *unicode_repr(PyObject *unicode)
6763 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
6764 PyUnicode_GET_SIZE(unicode),
6768 PyDoc_STRVAR(rfind__doc__,
6769 "S.rfind(sub [,start [,end]]) -> int\n\
6771 Return the highest index in S where substring sub is found,\n\
6772 such that sub is contained within s[start:end]. Optional\n\
6773 arguments start and end are interpreted as in slice notation.\n\
6775 Return -1 on failure.");
6777 static PyObject *
6778 unicode_rfind(PyUnicodeObject *self, PyObject *args)
6780 PyObject *substring;
6781 Py_ssize_t start = 0;
6782 Py_ssize_t end = PY_SSIZE_T_MAX;
6783 Py_ssize_t result;
6785 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
6786 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6787 return NULL;
6788 substring = PyUnicode_FromObject(substring);
6789 if (!substring)
6790 return NULL;
6792 result = stringlib_rfind_slice(
6793 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6794 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6795 start, end
6798 Py_DECREF(substring);
6800 return PyInt_FromSsize_t(result);
6803 PyDoc_STRVAR(rindex__doc__,
6804 "S.rindex(sub [,start [,end]]) -> int\n\
6806 Like S.rfind() but raise ValueError when the substring is not found.");
6808 static PyObject *
6809 unicode_rindex(PyUnicodeObject *self, PyObject *args)
6811 PyObject *substring;
6812 Py_ssize_t start = 0;
6813 Py_ssize_t end = PY_SSIZE_T_MAX;
6814 Py_ssize_t result;
6816 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6817 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
6818 return NULL;
6819 substring = PyUnicode_FromObject(substring);
6820 if (!substring)
6821 return NULL;
6823 result = stringlib_rfind_slice(
6824 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6825 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6826 start, end
6829 Py_DECREF(substring);
6831 if (result < 0) {
6832 PyErr_SetString(PyExc_ValueError, "substring not found");
6833 return NULL;
6835 return PyInt_FromSsize_t(result);
6838 PyDoc_STRVAR(rjust__doc__,
6839 "S.rjust(width[, fillchar]) -> unicode\n\
6841 Return S right justified in a Unicode string of length width. Padding is\n\
6842 done using the specified fill character (default is a space).");
6844 static PyObject *
6845 unicode_rjust(PyUnicodeObject *self, PyObject *args)
6847 Py_ssize_t width;
6848 Py_UNICODE fillchar = ' ';
6850 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
6851 return NULL;
6853 if (self->length >= width && PyUnicode_CheckExact(self)) {
6854 Py_INCREF(self);
6855 return (PyObject*) self;
6858 return (PyObject*) pad(self, width - self->length, 0, fillchar);
6861 static PyObject*
6862 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
6864 /* standard clamping */
6865 if (start < 0)
6866 start = 0;
6867 if (end < 0)
6868 end = 0;
6869 if (end > self->length)
6870 end = self->length;
6871 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
6872 /* full slice, return original string */
6873 Py_INCREF(self);
6874 return (PyObject*) self;
6876 if (start > end)
6877 start = end;
6878 /* copy slice */
6879 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6880 end - start);
6883 PyObject *PyUnicode_Split(PyObject *s,
6884 PyObject *sep,
6885 Py_ssize_t maxsplit)
6887 PyObject *result;
6889 s = PyUnicode_FromObject(s);
6890 if (s == NULL)
6891 return NULL;
6892 if (sep != NULL) {
6893 sep = PyUnicode_FromObject(sep);
6894 if (sep == NULL) {
6895 Py_DECREF(s);
6896 return NULL;
6900 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6902 Py_DECREF(s);
6903 Py_XDECREF(sep);
6904 return result;
6907 PyDoc_STRVAR(split__doc__,
6908 "S.split([sep [,maxsplit]]) -> list of strings\n\
6910 Return a list of the words in S, using sep as the\n\
6911 delimiter string. If maxsplit is given, at most maxsplit\n\
6912 splits are done. If sep is not specified or is None,\n\
6913 any whitespace string is a separator.");
6915 static PyObject*
6916 unicode_split(PyUnicodeObject *self, PyObject *args)
6918 PyObject *substring = Py_None;
6919 Py_ssize_t maxcount = -1;
6921 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
6922 return NULL;
6924 if (substring == Py_None)
6925 return split(self, NULL, maxcount);
6926 else if (PyUnicode_Check(substring))
6927 return split(self, (PyUnicodeObject *)substring, maxcount);
6928 else
6929 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6932 PyObject *
6933 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
6935 PyObject* str_obj;
6936 PyObject* sep_obj;
6937 PyObject* out;
6939 str_obj = PyUnicode_FromObject(str_in);
6940 if (!str_obj)
6941 return NULL;
6942 sep_obj = PyUnicode_FromObject(sep_in);
6943 if (!sep_obj) {
6944 Py_DECREF(str_obj);
6945 return NULL;
6948 out = stringlib_partition(
6949 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6950 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6953 Py_DECREF(sep_obj);
6954 Py_DECREF(str_obj);
6956 return out;
6960 PyObject *
6961 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
6963 PyObject* str_obj;
6964 PyObject* sep_obj;
6965 PyObject* out;
6967 str_obj = PyUnicode_FromObject(str_in);
6968 if (!str_obj)
6969 return NULL;
6970 sep_obj = PyUnicode_FromObject(sep_in);
6971 if (!sep_obj) {
6972 Py_DECREF(str_obj);
6973 return NULL;
6976 out = stringlib_rpartition(
6977 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
6978 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
6981 Py_DECREF(sep_obj);
6982 Py_DECREF(str_obj);
6984 return out;
6987 PyDoc_STRVAR(partition__doc__,
6988 "S.partition(sep) -> (head, sep, tail)\n\
6990 Searches for the separator sep in S, and returns the part before it,\n\
6991 the separator itself, and the part after it. If the separator is not\n\
6992 found, returns S and two empty strings.");
6994 static PyObject*
6995 unicode_partition(PyUnicodeObject *self, PyObject *separator)
6997 return PyUnicode_Partition((PyObject *)self, separator);
7000 PyDoc_STRVAR(rpartition__doc__,
7001 "S.rpartition(sep) -> (tail, sep, head)\n\
7003 Searches for the separator sep in S, starting at the end of S, and returns\n\
7004 the part before it, the separator itself, and the part after it. If the\n\
7005 separator is not found, returns two empty strings and S.");
7007 static PyObject*
7008 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7010 return PyUnicode_RPartition((PyObject *)self, separator);
7013 PyObject *PyUnicode_RSplit(PyObject *s,
7014 PyObject *sep,
7015 Py_ssize_t maxsplit)
7017 PyObject *result;
7019 s = PyUnicode_FromObject(s);
7020 if (s == NULL)
7021 return NULL;
7022 if (sep != NULL) {
7023 sep = PyUnicode_FromObject(sep);
7024 if (sep == NULL) {
7025 Py_DECREF(s);
7026 return NULL;
7030 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7032 Py_DECREF(s);
7033 Py_XDECREF(sep);
7034 return result;
7037 PyDoc_STRVAR(rsplit__doc__,
7038 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7040 Return a list of the words in S, using sep as the\n\
7041 delimiter string, starting at the end of the string and\n\
7042 working to the front. If maxsplit is given, at most maxsplit\n\
7043 splits are done. If sep is not specified, any whitespace string\n\
7044 is a separator.");
7046 static PyObject*
7047 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7049 PyObject *substring = Py_None;
7050 Py_ssize_t maxcount = -1;
7052 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7053 return NULL;
7055 if (substring == Py_None)
7056 return rsplit(self, NULL, maxcount);
7057 else if (PyUnicode_Check(substring))
7058 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7059 else
7060 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7063 PyDoc_STRVAR(splitlines__doc__,
7064 "S.splitlines([keepends]]) -> list of strings\n\
7066 Return a list of the lines in S, breaking at line boundaries.\n\
7067 Line breaks are not included in the resulting list unless keepends\n\
7068 is given and true.");
7070 static PyObject*
7071 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7073 int keepends = 0;
7075 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7076 return NULL;
7078 return PyUnicode_Splitlines((PyObject *)self, keepends);
7081 static
7082 PyObject *unicode_str(PyUnicodeObject *self)
7084 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7087 PyDoc_STRVAR(swapcase__doc__,
7088 "S.swapcase() -> unicode\n\
7090 Return a copy of S with uppercase characters converted to lowercase\n\
7091 and vice versa.");
7093 static PyObject*
7094 unicode_swapcase(PyUnicodeObject *self)
7096 return fixup(self, fixswapcase);
7099 PyDoc_STRVAR(translate__doc__,
7100 "S.translate(table) -> unicode\n\
7102 Return a copy of the string S, where all characters have been mapped\n\
7103 through the given translation table, which must be a mapping of\n\
7104 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7105 Unmapped characters are left untouched. Characters mapped to None\n\
7106 are deleted.");
7108 static PyObject*
7109 unicode_translate(PyUnicodeObject *self, PyObject *table)
7111 return PyUnicode_TranslateCharmap(self->str,
7112 self->length,
7113 table,
7114 "ignore");
7117 PyDoc_STRVAR(upper__doc__,
7118 "S.upper() -> unicode\n\
7120 Return a copy of S converted to uppercase.");
7122 static PyObject*
7123 unicode_upper(PyUnicodeObject *self)
7125 return fixup(self, fixupper);
7128 PyDoc_STRVAR(zfill__doc__,
7129 "S.zfill(width) -> unicode\n\
7131 Pad a numeric string x with zeros on the left, to fill a field\n\
7132 of the specified width. The string x is never truncated.");
7134 static PyObject *
7135 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7137 Py_ssize_t fill;
7138 PyUnicodeObject *u;
7140 Py_ssize_t width;
7141 if (!PyArg_ParseTuple(args, "n:zfill", &width))
7142 return NULL;
7144 if (self->length >= width) {
7145 if (PyUnicode_CheckExact(self)) {
7146 Py_INCREF(self);
7147 return (PyObject*) self;
7149 else
7150 return PyUnicode_FromUnicode(
7151 PyUnicode_AS_UNICODE(self),
7152 PyUnicode_GET_SIZE(self)
7156 fill = width - self->length;
7158 u = pad(self, fill, 0, '0');
7160 if (u == NULL)
7161 return NULL;
7163 if (u->str[fill] == '+' || u->str[fill] == '-') {
7164 /* move sign to beginning of string */
7165 u->str[0] = u->str[fill];
7166 u->str[fill] = '0';
7169 return (PyObject*) u;
#if 0
/* Compiled out.  Would return the value of unicode_freelist_size as an
   int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
7180 PyDoc_STRVAR(startswith__doc__,
7181 "S.startswith(prefix[, start[, end]]) -> bool\n\
7183 Return True if S starts with the specified prefix, False otherwise.\n\
7184 With optional start, test S beginning at that position.\n\
7185 With optional end, stop comparing S at that position.\n\
7186 prefix can also be a tuple of strings to try.");
7188 static PyObject *
7189 unicode_startswith(PyUnicodeObject *self,
7190 PyObject *args)
7192 PyObject *subobj;
7193 PyUnicodeObject *substring;
7194 Py_ssize_t start = 0;
7195 Py_ssize_t end = PY_SSIZE_T_MAX;
7196 int result;
7198 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
7199 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7200 return NULL;
7201 if (PyTuple_Check(subobj)) {
7202 Py_ssize_t i;
7203 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7204 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7205 PyTuple_GET_ITEM(subobj, i));
7206 if (substring == NULL)
7207 return NULL;
7208 result = tailmatch(self, substring, start, end, -1);
7209 Py_DECREF(substring);
7210 if (result) {
7211 Py_RETURN_TRUE;
7214 /* nothing matched */
7215 Py_RETURN_FALSE;
7217 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7218 if (substring == NULL)
7219 return NULL;
7220 result = tailmatch(self, substring, start, end, -1);
7221 Py_DECREF(substring);
7222 return PyBool_FromLong(result);
7226 PyDoc_STRVAR(endswith__doc__,
7227 "S.endswith(suffix[, start[, end]]) -> bool\n\
7229 Return True if S ends with the specified suffix, False otherwise.\n\
7230 With optional start, test S beginning at that position.\n\
7231 With optional end, stop comparing S at that position.\n\
7232 suffix can also be a tuple of strings to try.");
7234 static PyObject *
7235 unicode_endswith(PyUnicodeObject *self,
7236 PyObject *args)
7238 PyObject *subobj;
7239 PyUnicodeObject *substring;
7240 Py_ssize_t start = 0;
7241 Py_ssize_t end = PY_SSIZE_T_MAX;
7242 int result;
7244 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
7245 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7246 return NULL;
7247 if (PyTuple_Check(subobj)) {
7248 Py_ssize_t i;
7249 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7250 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7251 PyTuple_GET_ITEM(subobj, i));
7252 if (substring == NULL)
7253 return NULL;
7254 result = tailmatch(self, substring, start, end, +1);
7255 Py_DECREF(substring);
7256 if (result) {
7257 Py_RETURN_TRUE;
7260 Py_RETURN_FALSE;
7262 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7263 if (substring == NULL)
7264 return NULL;
7266 result = tailmatch(self, substring, start, end, +1);
7267 Py_DECREF(substring);
7268 return PyBool_FromLong(result);
7273 static PyObject *
7274 unicode_getnewargs(PyUnicodeObject *v)
7276 return Py_BuildValue("(u#)", v->str, v->length);
7280 static PyMethodDef unicode_methods[] = {
7282 /* Order is according to common usage: often used methods should
7283 appear first, since lookup is done sequentially. */
7285 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
7286 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7287 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7288 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7289 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7290 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7291 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7292 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7293 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7294 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7295 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7296 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7297 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7298 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7299 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7300 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7301 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
7302 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7303 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7304 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7305 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7306 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7307 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7308 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7309 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7310 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7311 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7312 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7313 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7314 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7315 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7316 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7317 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7318 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7319 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7320 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7321 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7322 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7323 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7324 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7325 #if 0
7326 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7327 #endif
7329 #if 0
7330 /* This one is just used for debugging the implementation. */
7331 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
7332 #endif
7334 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
7335 {NULL, NULL}
7338 static PyObject *
7339 unicode_mod(PyObject *v, PyObject *w)
7341 if (!PyUnicode_Check(v)) {
7342 Py_INCREF(Py_NotImplemented);
7343 return Py_NotImplemented;
7345 return PyUnicode_Format(v, w);
7348 static PyNumberMethods unicode_as_number = {
7349 0, /*nb_add*/
7350 0, /*nb_subtract*/
7351 0, /*nb_multiply*/
7352 0, /*nb_divide*/
7353 unicode_mod, /*nb_remainder*/
7356 static PySequenceMethods unicode_as_sequence = {
7357 (lenfunc) unicode_length, /* sq_length */
7358 PyUnicode_Concat, /* sq_concat */
7359 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7360 (ssizeargfunc) unicode_getitem, /* sq_item */
7361 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7362 0, /* sq_ass_item */
7363 0, /* sq_ass_slice */
7364 PyUnicode_Contains, /* sq_contains */
7367 static PyObject*
7368 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7370 if (PyIndex_Check(item)) {
7371 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7372 if (i == -1 && PyErr_Occurred())
7373 return NULL;
7374 if (i < 0)
7375 i += PyUnicode_GET_SIZE(self);
7376 return unicode_getitem(self, i);
7377 } else if (PySlice_Check(item)) {
7378 Py_ssize_t start, stop, step, slicelength, cur, i;
7379 Py_UNICODE* source_buf;
7380 Py_UNICODE* result_buf;
7381 PyObject* result;
7383 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7384 &start, &stop, &step, &slicelength) < 0) {
7385 return NULL;
7388 if (slicelength <= 0) {
7389 return PyUnicode_FromUnicode(NULL, 0);
7390 } else if (start == 0 && step == 1 && slicelength == self->length &&
7391 PyUnicode_CheckExact(self)) {
7392 Py_INCREF(self);
7393 return (PyObject *)self;
7394 } else if (step == 1) {
7395 return PyUnicode_FromUnicode(self->str + start, slicelength);
7396 } else {
7397 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7398 result_buf = (Py_UNICODE *)PyMem_MALLOC(slicelength*
7399 sizeof(Py_UNICODE));
7401 if (result_buf == NULL)
7402 return PyErr_NoMemory();
7404 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7405 result_buf[i] = source_buf[cur];
7408 result = PyUnicode_FromUnicode(result_buf, slicelength);
7409 PyMem_FREE(result_buf);
7410 return result;
7412 } else {
7413 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7414 return NULL;
7418 static PyMappingMethods unicode_as_mapping = {
7419 (lenfunc)unicode_length, /* mp_length */
7420 (binaryfunc)unicode_subscript, /* mp_subscript */
7421 (objobjargproc)0, /* mp_ass_subscript */
7424 static Py_ssize_t
7425 unicode_buffer_getreadbuf(PyUnicodeObject *self,
7426 Py_ssize_t index,
7427 const void **ptr)
7429 if (index != 0) {
7430 PyErr_SetString(PyExc_SystemError,
7431 "accessing non-existent unicode segment");
7432 return -1;
7434 *ptr = (void *) self->str;
7435 return PyUnicode_GET_DATA_SIZE(self);
7438 static Py_ssize_t
7439 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7440 const void **ptr)
7442 PyErr_SetString(PyExc_TypeError,
7443 "cannot use unicode as modifiable buffer");
7444 return -1;
7447 static int
7448 unicode_buffer_getsegcount(PyUnicodeObject *self,
7449 Py_ssize_t *lenp)
7451 if (lenp)
7452 *lenp = PyUnicode_GET_DATA_SIZE(self);
7453 return 1;
7456 static Py_ssize_t
7457 unicode_buffer_getcharbuf(PyUnicodeObject *self,
7458 Py_ssize_t index,
7459 const void **ptr)
7461 PyObject *str;
7463 if (index != 0) {
7464 PyErr_SetString(PyExc_SystemError,
7465 "accessing non-existent unicode segment");
7466 return -1;
7468 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7469 if (str == NULL)
7470 return -1;
7471 *ptr = (void *) PyString_AS_STRING(str);
7472 return PyString_GET_SIZE(str);
7475 /* Helpers for PyUnicode_Format() */
7477 static PyObject *
7478 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
7480 Py_ssize_t argidx = *p_argidx;
7481 if (argidx < arglen) {
7482 (*p_argidx)++;
7483 if (arglen < 0)
7484 return args;
7485 else
7486 return PyTuple_GetItem(args, argidx);
7488 PyErr_SetString(PyExc_TypeError,
7489 "not enough arguments for format string");
7490 return NULL;
/* Bits of the `flags` word built by PyUnicode_Format() from the
   conversion flag characters '-', '+', ' ', '#' and '0' respectively. */
7493 #define F_LJUST (1<<0)
7494 #define F_SIGN (1<<1)
7495 #define F_BLANK (1<<2)
7496 #define F_ALT (1<<3)
7497 #define F_ZERO (1<<4)
7499 static Py_ssize_t
7500 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
7502 register Py_ssize_t i;
7503 Py_ssize_t len = strlen(charbuffer);
7504 for (i = len - 1; i >= 0; i--)
7505 buffer[i] = (Py_UNICODE) charbuffer[i];
7507 return len;
7510 static int
7511 doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
7513 Py_ssize_t result;
7515 PyOS_ascii_formatd((char *)buffer, len, format, x);
7516 result = strtounicode(buffer, (char *)buffer);
7517 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7520 static int
7521 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
7523 Py_ssize_t result;
7525 PyOS_snprintf((char *)buffer, len, format, x);
7526 result = strtounicode(buffer, (char *)buffer);
7527 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
7530 /* XXX To save some code duplication, formatfloat/long/int could have been
7531 shared with stringobject.c, converting from 8-bit to Unicode after the
7532 formatting is done. */
7534 static int
7535 formatfloat(Py_UNICODE *buf,
7536 size_t buflen,
7537 int flags,
7538 int prec,
7539 int type,
7540 PyObject *v)
7542 /* fmt = '%#.' + `prec` + `type`
7543 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
7544 char fmt[20];
7545 double x;
7547 x = PyFloat_AsDouble(v);
7548 if (x == -1.0 && PyErr_Occurred())
7549 return -1;
7550 if (prec < 0)
7551 prec = 6;
7552 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
7553 type = 'g';
7554 /* Worst case length calc to ensure no buffer overrun:
7556 'g' formats:
7557 fmt = %#.<prec>g
7558 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7559 for any double rep.)
7560 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7562 'f' formats:
7563 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7564 len = 1 + 50 + 1 + prec = 52 + prec
7566 If prec=0 the effective precision is 1 (the leading digit is
7567 always given), therefore increase the length by one.
7570 if (((type == 'g' || type == 'G') &&
7571 buflen <= (size_t)10 + (size_t)prec) ||
7572 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
7573 PyErr_SetString(PyExc_OverflowError,
7574 "formatted float is too long (precision too large?)");
7575 return -1;
7577 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
7578 (flags&F_ALT) ? "#" : "",
7579 prec, type);
7580 return doubletounicode(buf, buflen, fmt, x);
7583 static PyObject*
7584 formatlong(PyObject *val, int flags, int prec, int type)
7586 char *buf;
7587 int i, len;
7588 PyObject *str; /* temporary string object. */
7589 PyUnicodeObject *result;
7591 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
7592 if (!str)
7593 return NULL;
7594 result = _PyUnicode_New(len);
7595 if (!result) {
7596 Py_DECREF(str);
7597 return NULL;
7599 for (i = 0; i < len; i++)
7600 result->str[i] = buf[i];
7601 result->str[len] = 0;
7602 Py_DECREF(str);
7603 return (PyObject*)result;
7606 static int
7607 formatint(Py_UNICODE *buf,
7608 size_t buflen,
7609 int flags,
7610 int prec,
7611 int type,
7612 PyObject *v)
7614 /* fmt = '%#.' + `prec` + 'l' + `type`
7615 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7616 * + 1 + 1
7617 * = 24
7619 char fmt[64]; /* plenty big enough! */
7620 char *sign;
7621 long x;
7623 x = PyInt_AsLong(v);
7624 if (x == -1 && PyErr_Occurred())
7625 return -1;
7626 if (x < 0 && type == 'u') {
7627 type = 'd';
7629 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
7630 sign = "-";
7631 else
7632 sign = "";
7633 if (prec < 0)
7634 prec = 1;
7636 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7637 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
7639 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
7640 PyErr_SetString(PyExc_OverflowError,
7641 "formatted integer is too long (precision too large?)");
7642 return -1;
7645 if ((flags & F_ALT) &&
7646 (type == 'x' || type == 'X')) {
7647 /* When converting under %#x or %#X, there are a number
7648 * of issues that cause pain:
7649 * - when 0 is being converted, the C standard leaves off
7650 * the '0x' or '0X', which is inconsistent with other
7651 * %#x/%#X conversions and inconsistent with Python's
7652 * hex() function
7653 * - there are platforms that violate the standard and
7654 * convert 0 with the '0x' or '0X'
7655 * (Metrowerks, Compaq Tru64)
7656 * - there are platforms that give '0x' when converting
7657 * under %#X, but convert 0 in accordance with the
7658 * standard (OS/2 EMX)
7660 * We can achieve the desired consistency by inserting our
7661 * own '0x' or '0X' prefix, and substituting %x/%X in place
7662 * of %#x/%#X.
7664 * Note that this is the same approach as used in
7665 * formatint() in stringobject.c
7667 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
7668 sign, type, prec, type);
7670 else {
7671 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
7672 sign, (flags&F_ALT) ? "#" : "",
7673 prec, type);
7675 if (sign[0])
7676 return longtounicode(buf, buflen, fmt, -x);
7677 else
7678 return longtounicode(buf, buflen, fmt, x);
7681 static int
7682 formatchar(Py_UNICODE *buf,
7683 size_t buflen,
7684 PyObject *v)
7686 /* presume that the buffer is at least 2 characters long */
7687 if (PyUnicode_Check(v)) {
7688 if (PyUnicode_GET_SIZE(v) != 1)
7689 goto onError;
7690 buf[0] = PyUnicode_AS_UNICODE(v)[0];
7693 else if (PyString_Check(v)) {
7694 if (PyString_GET_SIZE(v) != 1)
7695 goto onError;
7696 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
7699 else {
7700 /* Integer input truncated to a character */
7701 long x;
7702 x = PyInt_AsLong(v);
7703 if (x == -1 && PyErr_Occurred())
7704 goto onError;
7705 #ifdef Py_UNICODE_WIDE
7706 if (x < 0 || x > 0x10ffff) {
7707 PyErr_SetString(PyExc_OverflowError,
7708 "%c arg not in range(0x110000) "
7709 "(wide Python build)");
7710 return -1;
7712 #else
7713 if (x < 0 || x > 0xffff) {
7714 PyErr_SetString(PyExc_OverflowError,
7715 "%c arg not in range(0x10000) "
7716 "(narrow Python build)");
7717 return -1;
7719 #endif
7720 buf[0] = (Py_UNICODE) x;
7722 buf[1] = '\0';
7723 return 1;
7725 onError:
7726 PyErr_SetString(PyExc_TypeError,
7727 "%c requires int or char");
7728 return -1;
7731 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7733 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7734 chars are formatted. XXX This is a magic number. Each formatting
7735 routine does bounds checking to ensure no overflow, but a better
7736 solution may be to malloc a buffer of appropriate size for each
7737 format. For now, the current solution is sufficient.
7739 #define FORMATBUFLEN (size_t)120
/* PyUnicode_Format(format, args): build a new unicode object by applying
   printf-style '%' formatting of format to args.

   format is coerced with PyUnicode_FromObject().  args is a tuple of
   positional values, a single non-tuple value, or -- when its type has
   tp_as_mapping and it is neither a tuple nor a basestring -- a mapping
   consulted for %(key) specifiers.

   Returns a new reference, or NULL with an exception set: a bad internal
   call error if either argument is NULL, TypeError / ValueError /
   OverflowError / MemoryError for the failures reported below, or
   whatever a callee raised. */
7741 PyObject *PyUnicode_Format(PyObject *format,
7742 PyObject *args)
7744 Py_UNICODE *fmt, *res;
7745 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
7746 int args_owned = 0;
7747 PyUnicodeObject *result = NULL;
7748 PyObject *dict = NULL;
7749 PyObject *uformat;
7751 if (format == NULL || args == NULL) {
7752 PyErr_BadInternalCall();
7753 return NULL;
7755 uformat = PyUnicode_FromObject(format);
7756 if (uformat == NULL)
7757 return NULL;
7758 fmt = PyUnicode_AS_UNICODE(uformat);
7759 fmtcnt = PyUnicode_GET_SIZE(uformat);
/* Start with room for the format text plus 100 characters.  reslen is
   the allocated length of result, rescnt the space still unused. */
7761 reslen = rescnt = fmtcnt + 100;
7762 result = _PyUnicode_New(reslen);
7763 if (result == NULL)
7764 goto onError;
7765 res = PyUnicode_AS_UNICODE(result);
/* arglen == -1 marks "args is a single value, not a tuple"; argidx then
   starts at -2 so that getnextarg() hands args out exactly once. */
7767 if (PyTuple_Check(args)) {
7768 arglen = PyTuple_Size(args);
7769 argidx = 0;
7771 else {
7772 arglen = -1;
7773 argidx = -2;
7775 if (Py_Type(args)->tp_as_mapping && !PyTuple_Check(args) &&
7776 !PyObject_TypeCheck(args, &PyBaseString_Type))
7777 dict = args;
7779 while (--fmtcnt >= 0) {
/* Literal character: copy it through, growing result when it is full. */
7780 if (*fmt != '%') {
7781 if (--rescnt < 0) {
7782 rescnt = fmtcnt + 100;
7783 reslen += rescnt;
7784 if (_PyUnicode_Resize(&result, reslen) < 0)
7785 goto onError;
7786 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
7787 --rescnt;
7789 *res++ = *fmt++;
7791 else {
7792 /* Got a format specifier */
7793 int flags = 0;
7794 Py_ssize_t width = -1;
7795 int prec = -1;
7796 Py_UNICODE c = '\0';
7797 Py_UNICODE fill;
7798 PyObject *v = NULL;
7799 PyObject *temp = NULL;
7800 Py_UNICODE *pbuf;
7801 Py_UNICODE sign;
7802 Py_ssize_t len;
7803 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
7805 fmt++;
/* %(key): look the key up in the mapping; the value found becomes the
   sole argument for this conversion. */
7806 if (*fmt == '(') {
7807 Py_UNICODE *keystart;
7808 Py_ssize_t keylen;
7809 PyObject *key;
7810 int pcount = 1;
7812 if (dict == NULL) {
7813 PyErr_SetString(PyExc_TypeError,
7814 "format requires a mapping");
7815 goto onError;
7817 ++fmt;
7818 --fmtcnt;
7819 keystart = fmt;
7820 /* Skip over balanced parentheses */
7821 while (pcount > 0 && --fmtcnt >= 0) {
7822 if (*fmt == ')')
7823 --pcount;
7824 else if (*fmt == '(')
7825 ++pcount;
7826 fmt++;
7828 keylen = fmt - keystart - 1;
7829 if (fmtcnt < 0 || pcount > 0) {
7830 PyErr_SetString(PyExc_ValueError,
7831 "incomplete format key");
7832 goto onError;
7834 #if 0
7835 /* keys are converted to strings using UTF-8 and
7836 then looked up since Python uses strings to hold
7837 variables names etc. in its namespaces and we
7838 wouldn't want to break common idioms. */
7839 key = PyUnicode_EncodeUTF8(keystart,
7840 keylen,
7841 NULL);
7842 #else
7843 key = PyUnicode_FromUnicode(keystart, keylen);
7844 #endif
7845 if (key == NULL)
7846 goto onError;
/* Drop the value fetched for an earlier %(key) before replacing it. */
7847 if (args_owned) {
7848 Py_DECREF(args);
7849 args_owned = 0;
7851 args = PyObject_GetItem(dict, key);
7852 Py_DECREF(key);
7853 if (args == NULL) {
7854 goto onError;
7856 args_owned = 1;
7857 arglen = -1;
7858 argidx = -2;
/* Conversion flags; the loop stops at the first non-flag character,
   which is left in c. */
7860 while (--fmtcnt >= 0) {
7861 switch (c = *fmt++) {
7862 case '-': flags |= F_LJUST; continue;
7863 case '+': flags |= F_SIGN; continue;
7864 case ' ': flags |= F_BLANK; continue;
7865 case '#': flags |= F_ALT; continue;
7866 case '0': flags |= F_ZERO; continue;
7868 break;
/* Field width: '*' takes it from the next argument (a negative value
   means left-justify), otherwise it is read as decimal digits. */
7870 if (c == '*') {
7871 v = getnextarg(args, arglen, &argidx);
7872 if (v == NULL)
7873 goto onError;
7874 if (!PyInt_Check(v)) {
7875 PyErr_SetString(PyExc_TypeError,
7876 "* wants int");
7877 goto onError;
/* NOTE(review): the PyInt_AsLong() result is used without an error
   check, and -width is not guarded against the most negative value --
   confirm whether that can matter here. */
7879 width = PyInt_AsLong(v);
7880 if (width < 0) {
7881 flags |= F_LJUST;
7882 width = -width;
7884 if (--fmtcnt >= 0)
7885 c = *fmt++;
7887 else if (c >= '0' && c <= '9') {
7888 width = c - '0';
7889 while (--fmtcnt >= 0) {
7890 c = *fmt++;
7891 if (c < '0' || c > '9')
7892 break;
7893 if ((width*10) / 10 != width) {
7894 PyErr_SetString(PyExc_ValueError,
7895 "width too big");
7896 goto onError;
7898 width = width*10 + (c - '0');
/* Precision: introduced by '.', given as '*' or as decimal digits; a
   bare '.' means precision 0. */
7901 if (c == '.') {
7902 prec = 0;
7903 if (--fmtcnt >= 0)
7904 c = *fmt++;
7905 if (c == '*') {
7906 v = getnextarg(args, arglen, &argidx);
7907 if (v == NULL)
7908 goto onError;
7909 if (!PyInt_Check(v)) {
7910 PyErr_SetString(PyExc_TypeError,
7911 "* wants int");
7912 goto onError;
/* NOTE(review): prec is an int, so the PyInt_AsLong() result is narrowed
   here; presumably values above INT_MAX are silently truncated where
   long is wider than int -- confirm. */
7914 prec = PyInt_AsLong(v);
7915 if (prec < 0)
7916 prec = 0;
7917 if (--fmtcnt >= 0)
7918 c = *fmt++;
7920 else if (c >= '0' && c <= '9') {
7921 prec = c - '0';
7922 while (--fmtcnt >= 0) {
7923 c = Py_CHARMASK(*fmt++);
7924 if (c < '0' || c > '9')
7925 break;
7926 if ((prec*10) / 10 != prec) {
7927 PyErr_SetString(PyExc_ValueError,
7928 "prec too big");
7929 goto onError;
7931 prec = prec*10 + (c - '0');
7934 } /* prec */
/* The size modifiers h, l and L are accepted and skipped. */
7935 if (fmtcnt >= 0) {
7936 if (c == 'h' || c == 'l' || c == 'L') {
7937 if (--fmtcnt >= 0)
7938 c = *fmt++;
7941 if (fmtcnt < 0) {
7942 PyErr_SetString(PyExc_ValueError,
7943 "incomplete format");
7944 goto onError;
/* Every conversion except a literal %% consumes one argument. */
7946 if (c != '%') {
7947 v = getnextarg(args, arglen, &argidx);
7948 if (v == NULL)
7949 goto onError;
7951 sign = 0;
7952 fill = ' ';
/* Produce the converted text: pbuf/len describe it, and temp (when set)
   is the object that owns the memory pbuf points into. */
7953 switch (c) {
7955 case '%':
7956 pbuf = formatbuf;
7957 /* presume that buffer length is at least 1 */
7958 pbuf[0] = '%';
7959 len = 1;
7960 break;
7962 case 's':
7963 case 'r':
7964 if (PyUnicode_Check(v) && c == 's') {
7965 temp = v;
7966 Py_INCREF(temp);
7968 else {
7969 PyObject *unicode;
7970 if (c == 's')
7971 temp = PyObject_Unicode(v);
7972 else
7973 temp = PyObject_Repr(v);
7974 if (temp == NULL)
7975 goto onError;
7976 if (PyUnicode_Check(temp))
7977 /* nothing to do */;
7978 else if (PyString_Check(temp)) {
7979 /* convert to string to Unicode */
7980 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
7981 PyString_GET_SIZE(temp),
7982 NULL,
7983 "strict");
7984 Py_DECREF(temp);
7985 temp = unicode;
7986 if (temp == NULL)
7987 goto onError;
7989 else {
7990 Py_DECREF(temp);
7991 PyErr_SetString(PyExc_TypeError,
7992 "%s argument has non-string str()");
7993 goto onError;
7996 pbuf = PyUnicode_AS_UNICODE(temp);
7997 len = PyUnicode_GET_SIZE(temp);
/* For %s and %r the precision caps the number of characters used. */
7998 if (prec >= 0 && len > prec)
7999 len = prec;
8000 break;
8002 case 'i':
8003 case 'd':
8004 case 'u':
8005 case 'o':
8006 case 'x':
8007 case 'X':
8008 if (c == 'i')
8009 c = 'd';
8010 if (PyLong_Check(v)) {
8011 temp = formatlong(v, flags, prec, c);
8012 if (!temp)
8013 goto onError;
8014 pbuf = PyUnicode_AS_UNICODE(temp);
8015 len = PyUnicode_GET_SIZE(temp);
8016 sign = 1;
8018 else {
8019 pbuf = formatbuf;
8020 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8021 flags, prec, c, v);
8022 if (len < 0)
8023 goto onError;
8024 sign = 1;
8026 if (flags & F_ZERO)
8027 fill = '0';
8028 break;
8030 case 'e':
8031 case 'E':
8032 case 'f':
8033 case 'F':
8034 case 'g':
8035 case 'G':
8036 if (c == 'F')
8037 c = 'f';
8038 pbuf = formatbuf;
8039 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8040 flags, prec, c, v);
8041 if (len < 0)
8042 goto onError;
8043 sign = 1;
8044 if (flags & F_ZERO)
8045 fill = '0';
8046 break;
8048 case 'c':
8049 pbuf = formatbuf;
8050 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8051 if (len < 0)
8052 goto onError;
8053 break;
8055 default:
8056 PyErr_Format(PyExc_ValueError,
8057 "unsupported format character '%c' (0x%x) "
8058 "at index %zd",
8059 (31<=c && c<=126) ? (char)c : '?',
8060 (int)c,
8061 (Py_ssize_t)(fmt - 1 -
8062 PyUnicode_AS_UNICODE(uformat)));
8063 goto onError;
/* For numeric conversions, split off a leading sign produced by the
   formatter, or synthesize one from the '+' / ' ' flags. */
8065 if (sign) {
8066 if (*pbuf == '-' || *pbuf == '+') {
8067 sign = *pbuf++;
8068 len--;
8070 else if (flags & F_SIGN)
8071 sign = '+';
8072 else if (flags & F_BLANK)
8073 sign = ' ';
8074 else
8075 sign = 0;
8077 if (width < len)
8078 width = len;
/* Make sure result can hold the padded field plus an optional sign. */
8079 if (rescnt - (sign != 0) < width) {
8080 reslen -= rescnt;
8081 rescnt = width + fmtcnt + 100;
8082 reslen += rescnt;
8083 if (reslen < 0) {
8084 Py_XDECREF(temp);
8085 PyErr_NoMemory();
8086 goto onError;
8088 if (_PyUnicode_Resize(&result, reslen) < 0) {
8089 Py_XDECREF(temp);
8090 goto onError;
8092 res = PyUnicode_AS_UNICODE(result)
8093 + reslen - rescnt;
/* With a non-space fill the sign goes ahead of the padding; with space
   fill it is written after the padding, further down. */
8095 if (sign) {
8096 if (fill != ' ')
8097 *res++ = sign;
8098 rescnt--;
8099 if (width > len)
8100 width--;
/* Same treatment for the two-character 0x / 0X prefix of %#x / %#X. */
8102 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8103 assert(pbuf[0] == '0');
8104 assert(pbuf[1] == c);
8105 if (fill != ' ') {
8106 *res++ = *pbuf++;
8107 *res++ = *pbuf++;
8109 rescnt -= 2;
8110 width -= 2;
8111 if (width < 0)
8112 width = 0;
8113 len -= 2;
/* Left padding, unless left-justified. */
8115 if (width > len && !(flags & F_LJUST)) {
8116 do {
8117 --rescnt;
8118 *res++ = fill;
8119 } while (--width > len);
8121 if (fill == ' ') {
8122 if (sign)
8123 *res++ = sign;
8124 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8125 assert(pbuf[0] == '0');
8126 assert(pbuf[1] == c);
8127 *res++ = *pbuf++;
8128 *res++ = *pbuf++;
/* The converted text itself, followed by right padding with spaces. */
8131 Py_UNICODE_COPY(res, pbuf, len);
8132 res += len;
8133 rescnt -= len;
8134 while (--width >= len) {
8135 --rescnt;
8136 *res++ = ' ';
/* In mapping mode the current argument must have been used up by this
   conversion. */
8138 if (dict && (argidx < arglen) && c != '%') {
8139 PyErr_SetString(PyExc_TypeError,
8140 "not all arguments converted during string formatting");
8141 Py_XDECREF(temp);
8142 goto onError;
8144 Py_XDECREF(temp);
8145 } /* '%' */
8146 } /* until end */
8147 if (argidx < arglen && !dict) {
8148 PyErr_SetString(PyExc_TypeError,
8149 "not all arguments converted during string formatting");
8150 goto onError;
/* Trim result down to the characters actually written. */
8153 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8154 goto onError;
8155 if (args_owned) {
8156 Py_DECREF(args);
8158 Py_DECREF(uformat);
8159 return (PyObject *)result;
/* Common failure exit: release everything this function owns. */
8161 onError:
8162 Py_XDECREF(result);
8163 Py_DECREF(uformat);
8164 if (args_owned) {
8165 Py_DECREF(args);
8167 return NULL;
/* Buffer protocol slots for unicode objects, wiring up the four
   unicode_buffer_* functions defined earlier in this file. */
8170 static PyBufferProcs unicode_as_buffer = {
8171 (readbufferproc) unicode_buffer_getreadbuf,
8172 (writebufferproc) unicode_buffer_getwritebuf,
8173 (segcountproc) unicode_buffer_getsegcount,
8174 (charbufferproc) unicode_buffer_getcharbuf,
8177 static PyObject *
8178 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8180 static PyObject *
8181 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8183 PyObject *x = NULL;
8184 static char *kwlist[] = {"string", "encoding", "errors", 0};
8185 char *encoding = NULL;
8186 char *errors = NULL;
8188 if (type != &PyUnicode_Type)
8189 return unicode_subtype_new(type, args, kwds);
8190 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8191 kwlist, &x, &encoding, &errors))
8192 return NULL;
8193 if (x == NULL)
8194 return (PyObject *)_PyUnicode_New(0);
8195 if (encoding == NULL && errors == NULL)
8196 return PyObject_Unicode(x);
8197 else
8198 return PyUnicode_FromEncodedObject(x, encoding, errors);
8201 static PyObject *
8202 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8204 PyUnicodeObject *tmp, *pnew;
8205 Py_ssize_t n;
8207 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8208 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8209 if (tmp == NULL)
8210 return NULL;
8211 assert(PyUnicode_Check(tmp));
8212 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8213 if (pnew == NULL) {
8214 Py_DECREF(tmp);
8215 return NULL;
8217 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
8218 if (pnew->str == NULL) {
8219 _Py_ForgetReference((PyObject *)pnew);
8220 PyObject_Del(pnew);
8221 Py_DECREF(tmp);
8222 return PyErr_NoMemory();
8224 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8225 pnew->length = n;
8226 pnew->hash = tmp->hash;
8227 Py_DECREF(tmp);
8228 return (PyObject *)pnew;
/* Docstring of the unicode type; installed as tp_doc of PyUnicode_Type. */
8231 PyDoc_STRVAR(unicode_doc,
8232 "unicode(string [, encoding[, errors]]) -> object\n\
8234 Create a new Unicode object from the given encoded string.\n\
8235 encoding defaults to the current default string encoding.\n\
8236 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
/* Type object for the built-in "unicode" type.

   Each entry carries the name of the slot it fills in its trailing
   comment.  The type derives from PyBaseString_Type, may itself be
   subclassed (Py_TPFLAGS_BASETYPE), creates instances through
   unicode_new and frees them with PyObject_Del.  Slots left as 0 are not
   provided here. */
8238 PyTypeObject PyUnicode_Type = {
8239 PyVarObject_HEAD_INIT(&PyType_Type, 0)
8240 "unicode", /* tp_name */
8241 sizeof(PyUnicodeObject), /* tp_size */
8242 0, /* tp_itemsize */
8243 /* Slots */
8244 (destructor)unicode_dealloc, /* tp_dealloc */
8245 0, /* tp_print */
8246 0, /* tp_getattr */
8247 0, /* tp_setattr */
8248 0, /* tp_compare */
8249 unicode_repr, /* tp_repr */
8250 &unicode_as_number, /* tp_as_number */
8251 &unicode_as_sequence, /* tp_as_sequence */
8252 &unicode_as_mapping, /* tp_as_mapping */
8253 (hashfunc) unicode_hash, /* tp_hash*/
8254 0, /* tp_call*/
8255 (reprfunc) unicode_str, /* tp_str */
8256 PyObject_GenericGetAttr, /* tp_getattro */
8257 0, /* tp_setattro */
8258 &unicode_as_buffer, /* tp_as_buffer */
8259 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
8260 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
8261 unicode_doc, /* tp_doc */
8262 0, /* tp_traverse */
8263 0, /* tp_clear */
8264 PyUnicode_RichCompare, /* tp_richcompare */
8265 0, /* tp_weaklistoffset */
8266 0, /* tp_iter */
8267 0, /* tp_iternext */
8268 unicode_methods, /* tp_methods */
8269 0, /* tp_members */
8270 0, /* tp_getset */
8271 &PyBaseString_Type, /* tp_base */
8272 0, /* tp_dict */
8273 0, /* tp_descr_get */
8274 0, /* tp_descr_set */
8275 0, /* tp_dictoffset */
8276 0, /* tp_init */
8277 0, /* tp_alloc */
8278 unicode_new, /* tp_new */
8279 PyObject_Del, /* tp_free */
8282 /* Initialize the Unicode implementation */
8284 void _PyUnicode_Init(void)
8286 int i;
8288 /* XXX - move this array to unicodectype.c ? */
8289 Py_UNICODE linebreak[] = {
8290 0x000A, /* LINE FEED */
8291 0x000D, /* CARRIAGE RETURN */
8292 0x001C, /* FILE SEPARATOR */
8293 0x001D, /* GROUP SEPARATOR */
8294 0x001E, /* RECORD SEPARATOR */
8295 0x0085, /* NEXT LINE */
8296 0x2028, /* LINE SEPARATOR */
8297 0x2029, /* PARAGRAPH SEPARATOR */
8300 /* Init the implementation */
8301 unicode_freelist = NULL;
8302 unicode_freelist_size = 0;
8303 unicode_empty = _PyUnicode_New(0);
8304 if (!unicode_empty)
8305 return;
8307 strcpy(unicode_default_encoding, "ascii");
8308 for (i = 0; i < 256; i++)
8309 unicode_latin1[i] = NULL;
8310 if (PyType_Ready(&PyUnicode_Type) < 0)
8311 Py_FatalError("Can't initialize 'unicode'");
8313 /* initialize the linebreak bloom filter */
8314 bloom_linebreak = make_bloom_mask(
8315 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8318 PyType_Ready(&EncodingMapType);
8321 /* Finalize the Unicode implementation */
8323 void
8324 _PyUnicode_Fini(void)
8326 PyUnicodeObject *u;
8327 int i;
8329 Py_XDECREF(unicode_empty);
8330 unicode_empty = NULL;
8332 for (i = 0; i < 256; i++) {
8333 if (unicode_latin1[i]) {
8334 Py_DECREF(unicode_latin1[i]);
8335 unicode_latin1[i] = NULL;
8339 for (u = unicode_freelist; u != NULL;) {
8340 PyUnicodeObject *v = u;
8341 u = *(PyUnicodeObject **)u;
8342 if (v->str)
8343 PyMem_DEL(v->str);
8344 Py_XDECREF(v->defenc);
8345 PyObject_Del(v);
8347 unicode_freelist = NULL;
8348 unicode_freelist_size = 0;
8351 #ifdef __cplusplus
8353 #endif
8357 Local variables:
8358 c-basic-offset: 4
8359 indent-tabs-mode: nil
8360 End: