3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
45 #include "unicodeobject.h"
52 /* Limit for the Unicode object free list */
54 #define MAX_UNICODE_FREELIST_SIZE 1024
56 /* Limit for the Unicode object free list stay alive optimization.
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
62 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
66 Setting the limit to 0 effectively turns the feature off.
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
80 # define BYTEORDER_IS_LITTLE_ENDIAN
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
95 /* Free list for Unicode objects */
96 static PyUnicodeObject
*unicode_freelist
;
97 static int unicode_freelist_size
;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject
*unicode_empty
;
102 /* Single character Unicode strings in the Latin-1 range are being
104 static PyUnicodeObject
*unicode_latin1
[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding
[100];
116 PyUnicode_GetMax(void)
118 #ifdef Py_UNICODE_WIDE
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
127 /* --- Bloom Filters ----------------------------------------------------- */
129 /* Stuff to implement simple "bloom filters" for Unicode characters.
130 To keep things simple, we use a single bitmask, using the least 5
131 bits of each Unicode character as the bit index. */
133 /* the linebreak mask is set up by Unicode_Init below */
135 #define BLOOM_MASK unsigned long
137 static BLOOM_MASK bloom_linebreak
;
/* Test whether the bit selected by the low 5 bits of ch is set in mask.
   The shifted constant is 1UL (not int 1) so that a shift count of 31 is
   well-defined and does not sign-extend into the upper bits of an
   unsigned long mask; mask is parenthesized so that compound expressions
   can be passed safely.  Evaluates to nonzero on a (possible) hit. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))
141 #define BLOOM_LINEBREAK(ch)\
142 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
144 Py_LOCAL_INLINE(BLOOM_MASK
) make_bloom_mask(Py_UNICODE
* ptr
, Py_ssize_t len
)
146 /* calculate simple bloom-style bitmask for a given unicode string */
152 for (i
= 0; i
< len
; i
++)
153 mask
|= (1 << (ptr
[i
] & 0x1F));
158 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr
, Py_UNICODE
* set
, Py_ssize_t setlen
)
162 for (i
= 0; i
< setlen
; i
++)
/* Nonzero when chr passes the BLOOM() pre-check against mask and
   unicode_member(chr, set, setlen) also reports nonzero.  The whole
   expansion is parenthesized so the macro composes correctly with
   operators such as ! or ||.  Note that chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
172 /* --- Unicode Object ----------------------------------------------------- */
175 int unicode_resize(register PyUnicodeObject
*unicode
,
180 /* Shortcut if there's nothing much to do. */
181 if (unicode
->length
== length
)
184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
188 if (unicode
== unicode_empty
||
189 (unicode
->length
== 1 &&
190 unicode
->str
[0] < 256U &&
191 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
192 PyErr_SetString(PyExc_SystemError
,
193 "can't resize shared unicode objects");
197 /* We allocate one more Py_UNICODE element to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
199 safe to look at str[length] (without making any assumptions about what
202 oldstr
= unicode
->str
;
203 PyMem_RESIZE(unicode
->str
, Py_UNICODE
, length
+ 1);
205 unicode
->str
= (Py_UNICODE
*)oldstr
;
209 unicode
->str
[length
] = 0;
210 unicode
->length
= length
;
213 /* Reset the object caches */
214 if (unicode
->defenc
) {
215 Py_DECREF(unicode
->defenc
);
216 unicode
->defenc
= NULL
;
223 /* We allocate one more Py_UNICODE element to make sure the string is
224 Ux0000 terminated -- XXX is this needed ?
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
232 PyUnicodeObject
*_PyUnicode_New(Py_ssize_t length
)
234 register PyUnicodeObject
*unicode
;
236 /* Optimization for empty strings */
237 if (length
== 0 && unicode_empty
!= NULL
) {
238 Py_INCREF(unicode_empty
);
239 return unicode_empty
;
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist
) {
244 unicode
= unicode_freelist
;
245 unicode_freelist
= *(PyUnicodeObject
**)unicode
;
246 unicode_freelist_size
--;
248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
250 if ((unicode
->length
< length
) &&
251 unicode_resize(unicode
, length
) < 0) {
252 PyMem_DEL(unicode
->str
);
257 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
259 PyObject_INIT(unicode
, &PyUnicode_Type
);
262 unicode
= PyObject_New(PyUnicodeObject
, &PyUnicode_Type
);
265 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
272 /* Initialize the first element to guard against cases where
273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
280 unicode
->str
[length
] = 0;
281 unicode
->length
= length
;
283 unicode
->defenc
= NULL
;
287 _Py_ForgetReference((PyObject
*)unicode
);
288 PyObject_Del(unicode
);
293 void unicode_dealloc(register PyUnicodeObject
*unicode
)
295 if (PyUnicode_CheckExact(unicode
) &&
296 unicode_freelist_size
< MAX_UNICODE_FREELIST_SIZE
) {
297 /* Keep-Alive optimization */
298 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
299 PyMem_DEL(unicode
->str
);
303 if (unicode
->defenc
) {
304 Py_DECREF(unicode
->defenc
);
305 unicode
->defenc
= NULL
;
307 /* Add to free list */
308 *(PyUnicodeObject
**)unicode
= unicode_freelist
;
309 unicode_freelist
= unicode
;
310 unicode_freelist_size
++;
313 PyMem_DEL(unicode
->str
);
314 Py_XDECREF(unicode
->defenc
);
315 unicode
->ob_type
->tp_free((PyObject
*)unicode
);
319 int PyUnicode_Resize(PyObject
**unicode
, Py_ssize_t length
)
321 register PyUnicodeObject
*v
;
323 /* Argument checks */
324 if (unicode
== NULL
) {
325 PyErr_BadInternalCall();
328 v
= (PyUnicodeObject
*)*unicode
;
329 if (v
== NULL
|| !PyUnicode_Check(v
) || v
->ob_refcnt
!= 1 || length
< 0) {
330 PyErr_BadInternalCall();
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
337 if (v
->length
!= length
&&
338 (v
== unicode_empty
|| v
->length
== 1)) {
339 PyUnicodeObject
*w
= _PyUnicode_New(length
);
342 Py_UNICODE_COPY(w
->str
, v
->str
,
343 length
< v
->length
? length
: v
->length
);
345 *unicode
= (PyObject
*)w
;
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v
, length
);
354 /* Internal API for use in unicodeobject.c only ! */
355 #define _PyUnicode_Resize(unicodevar, length) \
356 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
358 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
361 PyUnicodeObject
*unicode
;
363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
367 /* Optimization for empty strings */
368 if (size
== 0 && unicode_empty
!= NULL
) {
369 Py_INCREF(unicode_empty
);
370 return (PyObject
*)unicode_empty
;
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size
== 1 && *u
< 256) {
376 unicode
= unicode_latin1
[*u
];
378 unicode
= _PyUnicode_New(1);
381 unicode
->str
[0] = *u
;
382 unicode_latin1
[*u
] = unicode
;
385 return (PyObject
*)unicode
;
389 unicode
= _PyUnicode_New(size
);
393 /* Copy the Unicode data into the new object */
395 Py_UNICODE_COPY(unicode
->str
, u
, size
);
397 return (PyObject
*)unicode
;
402 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
405 PyUnicodeObject
*unicode
;
408 PyErr_BadInternalCall();
412 unicode
= _PyUnicode_New(size
);
416 /* Copy the wchar_t data into the new object */
417 #ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
421 register Py_UNICODE
*u
;
422 register Py_ssize_t i
;
423 u
= PyUnicode_AS_UNICODE(unicode
);
424 for (i
= size
; i
> 0; i
--)
429 return (PyObject
*)unicode
;
432 Py_ssize_t
PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
436 if (unicode
== NULL
) {
437 PyErr_BadInternalCall();
441 /* If possible, try to copy the 0-termination as well */
442 if (size
> PyUnicode_GET_SIZE(unicode
))
443 size
= PyUnicode_GET_SIZE(unicode
) + 1;
445 #ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
449 register Py_UNICODE
*u
;
450 register Py_ssize_t i
;
451 u
= PyUnicode_AS_UNICODE(unicode
);
452 for (i
= size
; i
> 0; i
--)
457 if (size
> PyUnicode_GET_SIZE(unicode
))
458 return PyUnicode_GET_SIZE(unicode
);
465 PyObject
*PyUnicode_FromOrdinal(int ordinal
)
469 #ifdef Py_UNICODE_WIDE
470 if (ordinal
< 0 || ordinal
> 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError
,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
477 if (ordinal
< 0 || ordinal
> 0xffff) {
478 PyErr_SetString(PyExc_ValueError
,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
485 s
[0] = (Py_UNICODE
)ordinal
;
486 return PyUnicode_FromUnicode(s
, 1);
489 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj
)) {
497 if (PyUnicode_Check(obj
)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
501 PyUnicode_GET_SIZE(obj
));
503 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
506 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
507 const char *encoding
,
510 const char *s
= NULL
;
515 PyErr_BadInternalCall();
520 /* For b/w compatibility we also accept Unicode objects provided
521 that no encoding is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
525 NOTE: This API should really only be used for objects which
526 represent *encoded* Unicode !
529 if (PyUnicode_Check(obj
)) {
531 PyErr_SetString(PyExc_TypeError
,
532 "decoding Unicode is not supported");
535 return PyObject_Unicode(obj
);
538 if (PyUnicode_Check(obj
)) {
539 PyErr_SetString(PyExc_TypeError
,
540 "decoding Unicode is not supported");
546 if (PyString_Check(obj
)) {
547 s
= PyString_AS_STRING(obj
);
548 len
= PyString_GET_SIZE(obj
);
550 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError
))
554 PyErr_Format(PyExc_TypeError
,
555 "coercing to Unicode: need string or buffer, "
557 obj
->ob_type
->tp_name
);
561 /* Convert to Unicode */
563 Py_INCREF(unicode_empty
);
564 v
= (PyObject
*)unicode_empty
;
567 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
575 PyObject
*PyUnicode_Decode(const char *s
,
577 const char *encoding
,
580 PyObject
*buffer
= NULL
, *unicode
;
582 if (encoding
== NULL
)
583 encoding
= PyUnicode_GetDefaultEncoding();
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding
, "utf-8") == 0)
587 return PyUnicode_DecodeUTF8(s
, size
, errors
);
588 else if (strcmp(encoding
, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s
, size
, errors
);
590 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding
, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s
, size
, errors
);
594 else if (strcmp(encoding
, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s
, size
, errors
);
597 /* Decode via the codec registry */
598 buffer
= PyBuffer_FromMemory((void *)s
, size
);
601 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
604 if (!PyUnicode_Check(unicode
)) {
605 PyErr_Format(PyExc_TypeError
,
606 "decoder did not return an unicode object (type=%.400s)",
607 unicode
->ob_type
->tp_name
);
619 PyObject
*PyUnicode_AsDecodedObject(PyObject
*unicode
,
620 const char *encoding
,
625 if (!PyUnicode_Check(unicode
)) {
630 if (encoding
== NULL
)
631 encoding
= PyUnicode_GetDefaultEncoding();
633 /* Decode via the codec registry */
634 v
= PyCodec_Decode(unicode
, encoding
, errors
);
643 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
645 const char *encoding
,
648 PyObject
*v
, *unicode
;
650 unicode
= PyUnicode_FromUnicode(s
, size
);
653 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
658 PyObject
*PyUnicode_AsEncodedObject(PyObject
*unicode
,
659 const char *encoding
,
664 if (!PyUnicode_Check(unicode
)) {
669 if (encoding
== NULL
)
670 encoding
= PyUnicode_GetDefaultEncoding();
672 /* Encode via the codec registry */
673 v
= PyCodec_Encode(unicode
, encoding
, errors
);
682 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
683 const char *encoding
,
688 if (!PyUnicode_Check(unicode
)) {
693 if (encoding
== NULL
)
694 encoding
= PyUnicode_GetDefaultEncoding();
696 /* Shortcuts for common default encodings */
697 if (errors
== NULL
) {
698 if (strcmp(encoding
, "utf-8") == 0)
699 return PyUnicode_AsUTF8String(unicode
);
700 else if (strcmp(encoding
, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode
);
702 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding
, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode
);
706 else if (strcmp(encoding
, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode
);
710 /* Encode via the codec registry */
711 v
= PyCodec_Encode(unicode
, encoding
, errors
);
714 if (!PyString_Check(v
)) {
715 PyErr_Format(PyExc_TypeError
,
716 "encoder did not return a string object (type=%.400s)",
717 v
->ob_type
->tp_name
);
727 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
730 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
734 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
735 if (v
&& errors
== NULL
)
736 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
740 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
742 if (!PyUnicode_Check(unicode
)) {
746 return PyUnicode_AS_UNICODE(unicode
);
752 Py_ssize_t
PyUnicode_GetSize(PyObject
*unicode
)
754 if (!PyUnicode_Check(unicode
)) {
758 return PyUnicode_GET_SIZE(unicode
);
764 const char *PyUnicode_GetDefaultEncoding(void)
766 return unicode_default_encoding
;
769 int PyUnicode_SetDefaultEncoding(const char *encoding
)
773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v
= _PyCodec_Lookup(encoding
);
779 strncpy(unicode_default_encoding
,
781 sizeof(unicode_default_encoding
));
788 /* error handling callback helper:
789 build arguments, call the callback and check the arguments,
790 if no exception occurred, copy the replacement to the output
791 and adjust various state variables.
792 return 0 on success, -1 on error
796 int unicode_decode_call_errorhandler(const char *errors
, PyObject
**errorHandler
,
797 const char *encoding
, const char *reason
,
798 const char *input
, Py_ssize_t insize
, Py_ssize_t
*startinpos
, Py_ssize_t
*endinpos
, PyObject
**exceptionObject
, const char **inptr
,
799 PyObject
**output
, Py_ssize_t
*outpos
, Py_UNICODE
**outptr
)
801 static char *argparse
= "O!n;decoding error handler must return (unicode, int) tuple";
803 PyObject
*restuple
= NULL
;
804 PyObject
*repunicode
= NULL
;
805 Py_ssize_t outsize
= PyUnicode_GET_SIZE(*output
);
806 Py_ssize_t requiredsize
;
812 if (*errorHandler
== NULL
) {
813 *errorHandler
= PyCodec_LookupError(errors
);
814 if (*errorHandler
== NULL
)
818 if (*exceptionObject
== NULL
) {
819 *exceptionObject
= PyUnicodeDecodeError_Create(
820 encoding
, input
, insize
, *startinpos
, *endinpos
, reason
);
821 if (*exceptionObject
== NULL
)
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject
, *startinpos
))
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject
, *endinpos
))
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject
, reason
))
833 restuple
= PyObject_CallFunctionObjArgs(*errorHandler
, *exceptionObject
, NULL
);
834 if (restuple
== NULL
)
836 if (!PyTuple_Check(restuple
)) {
837 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
840 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
, &repunicode
, &newpos
))
843 newpos
= insize
+newpos
;
844 if (newpos
<0 || newpos
>insize
) {
845 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", newpos
);
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr
= PyUnicode_AS_UNICODE(repunicode
);
854 repsize
= PyUnicode_GET_SIZE(repunicode
);
855 requiredsize
= *outpos
+ repsize
+ insize
-newpos
;
856 if (requiredsize
> outsize
) {
857 if (requiredsize
<2*outsize
)
858 requiredsize
= 2*outsize
;
859 if (PyUnicode_Resize(output
, requiredsize
) < 0)
861 *outptr
= PyUnicode_AS_UNICODE(*output
) + *outpos
;
864 *inptr
= input
+ newpos
;
865 Py_UNICODE_COPY(*outptr
, repptr
, repsize
);
872 Py_XDECREF(restuple
);
876 /* --- UTF-7 Codec -------------------------------------------------------- */
878 /* see RFC2152 for details */
881 char utf7_special
[128] = {
882 /* indicate whether a UTF-7 character is special i.e. cannot be directly
886 2 - whitespace (optional)
887 3 - RFC2152 Set O (optional) */
888 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
889 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
890 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
891 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
892 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
893 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
894 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
895 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
899 /* Note: The comparison (c) <= 0 is a trick to work-around gcc
900 warnings about the comparison always being false; since
901 utf7_special[0] is 1, we can safely make that one comparison
/* Nonzero when c cannot be emitted directly in UTF-7: c lies outside
   1..127, or utf7_special[c] is 1, or it is 2 (whitespace) while encodeWS
   is true, or it is 3 (RFC2152 Set O) while encodeO is true.
   encodeO and encodeWS are parenthesized so that arbitrary expressions
   may be passed.  Note that c is evaluated more than once. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))
910 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
912 (isalnum(c) || (c) == '+' || (c) == '/')
914 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
915 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
917 #define ENCODE(out, ch, bits) \
918 while (bits >= 6) { \
919 *out++ = B64(ch >> (bits-6)); \
923 #define DECODE(out, ch, bits, surrogate) \
924 while (bits >= 16) { \
925 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
928 /* We have already generated an error for the high surrogate \
929 so let's not bother seeing if the low surrogate is correct or not */ \
931 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
932 /* This is a surrogate pair. Unfortunately we can't represent \
933 it in a 16-bit character */ \
935 errmsg = "code pairs are not supported"; \
942 PyObject
*PyUnicode_DecodeUTF7(const char *s
,
946 const char *starts
= s
;
947 Py_ssize_t startinpos
;
951 PyUnicodeObject
*unicode
;
953 const char *errmsg
= "";
955 unsigned int bitsleft
= 0;
956 unsigned long charsleft
= 0;
958 PyObject
*errorHandler
= NULL
;
959 PyObject
*exc
= NULL
;
961 unicode
= _PyUnicode_New(size
);
965 return (PyObject
*)unicode
;
976 if ((ch
== '-') || !B64CHAR(ch
)) {
980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
986 errmsg
= "partial character in shift sequence";
989 /* According to RFC2152 the remaining bits should be zero. We
990 choose to signal an error/insert a replacement character
991 here to indicate the potential of a misencoded character. */
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft
&& charsleft
<< (sizeof(charsleft
) * 8 - bitsleft
)) {
995 errmsg
= "non-zero padding bits in shift sequence";
1000 if ((s
< e
) && (*(s
) == '-')) {
1004 } else if (SPECIAL(ch
,0,0)) {
1005 errmsg
= "unexpected special character";
1011 charsleft
= (charsleft
<< 6) | UB64(ch
);
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
1017 else if ( ch
== '+' ) {
1018 startinpos
= s
-starts
;
1020 if (s
< e
&& *s
== '-') {
1029 else if (SPECIAL(ch
,0,0)) {
1030 errmsg
= "unexpected special character";
1040 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1041 endinpos
= s
-starts
;
1042 if (unicode_decode_call_errorhandler(
1043 errors
, &errorHandler
,
1045 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1046 (PyObject
**)&unicode
, &outpos
, &p
))
1051 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1053 if (unicode_decode_call_errorhandler(
1054 errors
, &errorHandler
,
1055 "utf7", "unterminated shift sequence",
1056 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1057 (PyObject
**)&unicode
, &outpos
, &p
))
1063 if (_PyUnicode_Resize(&unicode
, p
- PyUnicode_AS_UNICODE(unicode
)) < 0)
1066 Py_XDECREF(errorHandler
);
1068 return (PyObject
*)unicode
;
1071 Py_XDECREF(errorHandler
);
1078 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
1081 int encodeWhiteSpace
,
1085 /* It might be possible to tighten this worst case */
1086 Py_ssize_t cbAllocated
= 5 * size
;
1089 unsigned int bitsleft
= 0;
1090 unsigned long charsleft
= 0;
1095 return PyString_FromStringAndSize(NULL
, 0);
1097 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
1101 start
= out
= PyString_AS_STRING(v
);
1102 for (;i
< size
; ++i
) {
1103 Py_UNICODE ch
= s
[i
];
1109 } else if (SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
1113 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
1114 inShift
= bitsleft
> 0;
1119 if (!SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
1120 *out
++ = B64(charsleft
<< (6-bitsleft
));
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch
) || ch
== '-') {
1132 charsleft
= (charsleft
<< 16) | ch
;
1133 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
1135 /* If the next character is special then we don't need to terminate
1136 the shift sequence. If the next character is not a BASE64 character
1137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1140 if (bitsleft
== 0) {
1142 Py_UNICODE ch2
= s
[i
+1];
1144 if (SPECIAL(ch2
, encodeSetO
, encodeWhiteSpace
)) {
1146 } else if (B64CHAR(ch2
) || ch2
== '-') {
1163 *out
++= B64(charsleft
<< (6-bitsleft
) );
1167 _PyString_Resize(&v
, out
- start
);
1178 /* --- UTF-8 Codec -------------------------------------------------------- */
1181 char utf8_code_length
[256] = {
1182 /* Map UTF-8 encoded prefix byte to sequence length. zero means
1183 illegal prefix. see RFC 2279 for details */
1184 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1185 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1186 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1187 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1188 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1189 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1190 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1191 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1192 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1195 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1196 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1197 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1198 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1199 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
1202 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1206 return PyUnicode_DecodeUTF8Stateful(s
, size
, errors
, NULL
);
1209 PyObject
*PyUnicode_DecodeUTF8Stateful(const char *s
,
1212 Py_ssize_t
*consumed
)
1214 const char *starts
= s
;
1216 Py_ssize_t startinpos
;
1217 Py_ssize_t endinpos
;
1220 PyUnicodeObject
*unicode
;
1222 const char *errmsg
= "";
1223 PyObject
*errorHandler
= NULL
;
1224 PyObject
*exc
= NULL
;
1226 /* Note: size will always be longer than the resulting Unicode
1228 unicode
= _PyUnicode_New(size
);
1234 return (PyObject
*)unicode
;
1237 /* Unpack UTF-8 encoded data */
1242 Py_UCS4 ch
= (unsigned char)*s
;
1245 *p
++ = (Py_UNICODE
)ch
;
1250 n
= utf8_code_length
[ch
];
1256 errmsg
= "unexpected end of data";
1257 startinpos
= s
-starts
;
1266 errmsg
= "unexpected code byte";
1267 startinpos
= s
-starts
;
1268 endinpos
= startinpos
+1;
1272 errmsg
= "internal error";
1273 startinpos
= s
-starts
;
1274 endinpos
= startinpos
+1;
1278 if ((s
[1] & 0xc0) != 0x80) {
1279 errmsg
= "invalid data";
1280 startinpos
= s
-starts
;
1281 endinpos
= startinpos
+2;
1284 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
1286 startinpos
= s
-starts
;
1287 endinpos
= startinpos
+2;
1288 errmsg
= "illegal encoding";
1292 *p
++ = (Py_UNICODE
)ch
;
1296 if ((s
[1] & 0xc0) != 0x80 ||
1297 (s
[2] & 0xc0) != 0x80) {
1298 errmsg
= "invalid data";
1299 startinpos
= s
-starts
;
1300 endinpos
= startinpos
+3;
1303 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
1305 /* Note: UTF-8 encodings of surrogates are considered
1306 legal UTF-8 sequences;
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1312 errmsg
= "illegal encoding";
1313 startinpos
= s
-starts
;
1314 endinpos
= startinpos
+3;
1318 *p
++ = (Py_UNICODE
)ch
;
1322 if ((s
[1] & 0xc0) != 0x80 ||
1323 (s
[2] & 0xc0) != 0x80 ||
1324 (s
[3] & 0xc0) != 0x80) {
1325 errmsg
= "invalid data";
1326 startinpos
= s
-starts
;
1327 endinpos
= startinpos
+4;
1330 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
1331 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
1333 if ((ch
< 0x10000) /* minimum value allowed for 4
1335 || (ch
> 0x10ffff)) /* maximum value allowed for
1338 errmsg
= "illegal encoding";
1339 startinpos
= s
-starts
;
1340 endinpos
= startinpos
+4;
1343 #ifdef Py_UNICODE_WIDE
1344 *p
++ = (Py_UNICODE
)ch
;
1346 /* compute and append the two surrogates: */
1348 /* translate from 10000..10FFFF to 0..FFFF */
1351 /* high surrogate = top 10 bits added to D800 */
1352 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
1354 /* low surrogate = bottom 10 bits added to DC00 */
1355 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
1360 /* Other sizes are only needed for UCS-4 */
1361 errmsg
= "unsupported Unicode code range";
1362 startinpos
= s
-starts
;
1363 endinpos
= startinpos
+n
;
1370 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1371 if (unicode_decode_call_errorhandler(
1372 errors
, &errorHandler
,
1374 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1375 (PyObject
**)&unicode
, &outpos
, &p
))
1379 *consumed
= s
-starts
;
1382 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
1385 Py_XDECREF(errorHandler
);
1387 return (PyObject
*)unicode
;
1390 Py_XDECREF(errorHandler
);
1396 /* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
1402 PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
1406 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1408 Py_ssize_t i
; /* index into s of next input byte */
1409 PyObject
*v
; /* result string object */
1410 char *p
; /* next free byte in output buffer */
1411 Py_ssize_t nallocated
; /* number of result bytes allocated */
1412 Py_ssize_t nneeded
; /* number of result bytes needed */
1413 char stackbuf
[MAX_SHORT_UNICHARS
* 4];
1418 if (size
<= MAX_SHORT_UNICHARS
) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1423 nallocated
= Py_SAFE_DOWNCAST(sizeof(stackbuf
), size_t, int);
1424 v
= NULL
; /* will allocate after we're done */
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated
= size
* 4;
1430 if (nallocated
/ 4 != size
) /* overflow! */
1431 return PyErr_NoMemory();
1432 v
= PyString_FromStringAndSize(NULL
, nallocated
);
1435 p
= PyString_AS_STRING(v
);
1438 for (i
= 0; i
< size
;) {
1439 Py_UCS4 ch
= s
[i
++];
1445 else if (ch
< 0x0800) {
1446 /* Encode Latin-1 */
1447 *p
++ = (char)(0xc0 | (ch
>> 6));
1448 *p
++ = (char)(0x80 | (ch
& 0x3f));
1451 /* Encode UCS2 Unicode ordinals */
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch
&& ch
<= 0xDBFF && i
!= size
) {
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1459 ch
= ((ch
- 0xD800) << 10 | (ch2
- 0xDC00)) + 0x10000;
1463 /* Fall through: handles isolated high surrogates */
1465 *p
++ = (char)(0xe0 | (ch
>> 12));
1466 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
1467 *p
++ = (char)(0x80 | (ch
& 0x3f));
1471 /* Encode UCS4 Unicode ordinals */
1472 *p
++ = (char)(0xf0 | (ch
>> 18));
1473 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
1474 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
1475 *p
++ = (char)(0x80 | (ch
& 0x3f));
1480 /* This was stack allocated. */
1481 nneeded
= p
- stackbuf
;
1482 assert(nneeded
<= nallocated
);
1483 v
= PyString_FromStringAndSize(stackbuf
, nneeded
);
1486 /* Cut back to size actually needed. */
1487 nneeded
= p
- PyString_AS_STRING(v
);
1488 assert(nneeded
<= nallocated
);
1489 _PyString_Resize(&v
, nneeded
);
1493 #undef MAX_SHORT_UNICHARS
1496 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
1498 if (!PyUnicode_Check(unicode
)) {
1499 PyErr_BadArgument();
1502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
1503 PyUnicode_GET_SIZE(unicode
),
1507 /* --- UTF-16 Codec ------------------------------------------------------- */
1510 PyUnicode_DecodeUTF16(const char *s
,
1515 return PyUnicode_DecodeUTF16Stateful(s
, size
, errors
, byteorder
, NULL
);
1519 PyUnicode_DecodeUTF16Stateful(const char *s
,
1523 Py_ssize_t
*consumed
)
1525 const char *starts
= s
;
1526 Py_ssize_t startinpos
;
1527 Py_ssize_t endinpos
;
1529 PyUnicodeObject
*unicode
;
1531 const unsigned char *q
, *e
;
1532 int bo
= 0; /* assume native ordering by default */
1533 const char *errmsg
= "";
1534 /* Offsets from q for retrieving byte pairs in the right order. */
1535 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536 int ihi
= 1, ilo
= 0;
1538 int ihi
= 0, ilo
= 1;
1540 PyObject
*errorHandler
= NULL
;
1541 PyObject
*exc
= NULL
;
1543 /* Note: size will always be longer than the resulting Unicode
1545 unicode
= _PyUnicode_New(size
);
1549 return (PyObject
*)unicode
;
1551 /* Unpack UTF-16 encoded data */
1553 q
= (unsigned char *)s
;
1559 /* Check for BOM marks (U+FEFF) in the input and adjust current
1560 byte order setting accordingly. In native mode, the leading BOM
1561 mark is skipped, in all other modes, it is copied to the output
1562 stream as-is (giving a ZWNBSP character). */
1565 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
1566 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1567 if (bom
== 0xFEFF) {
1571 else if (bom
== 0xFFFE) {
1576 if (bom
== 0xFEFF) {
1580 else if (bom
== 0xFFFE) {
1601 /* remaining bytes at the end? (size should be even) */
1605 errmsg
= "truncated data";
1606 startinpos
= ((const char *)q
)-starts
;
1607 endinpos
= ((const char *)e
)-starts
;
1609 /* The remaining input chars are ignored if the callback
1610 chooses to skip the input */
1612 ch
= (q
[ihi
] << 8) | q
[ilo
];
1616 if (ch
< 0xD800 || ch
> 0xDFFF) {
1621 /* UTF-16 code pair: */
1623 errmsg
= "unexpected end of data";
1624 startinpos
= (((const char *)q
)-2)-starts
;
1625 endinpos
= ((const char *)e
)-starts
;
1628 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
1629 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
1631 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1632 #ifndef Py_UNICODE_WIDE
1636 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
1641 errmsg
= "illegal UTF-16 surrogate";
1642 startinpos
= (((const char *)q
)-4)-starts
;
1643 endinpos
= startinpos
+2;
1648 errmsg
= "illegal encoding";
1649 startinpos
= (((const char *)q
)-2)-starts
;
1650 endinpos
= startinpos
+2;
1651 /* Fall through to report the error */
1654 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1655 if (unicode_decode_call_errorhandler(
1656 errors
, &errorHandler
,
1658 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
1659 (PyObject
**)&unicode
, &outpos
, &p
))
1667 *consumed
= (const char *)q
-starts
;
1670 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
1673 Py_XDECREF(errorHandler
);
1675 return (PyObject
*)unicode
;
1679 Py_XDECREF(errorHandler
);
1685 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
1692 #ifdef Py_UNICODE_WIDE
1695 const int pairs
= 0;
1697 /* Offsets from p for storing byte pairs in the right order. */
1698 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699 int ihi
= 1, ilo
= 0;
1701 int ihi
= 0, ilo
= 1;
1704 #define STORECHAR(CH) \
1706 p[ihi] = ((CH) >> 8) & 0xff; \
1707 p[ilo] = (CH) & 0xff; \
1711 #ifdef Py_UNICODE_WIDE
1712 for (i
= pairs
= 0; i
< size
; i
++)
1713 if (s
[i
] >= 0x10000)
1716 v
= PyString_FromStringAndSize(NULL
,
1717 2 * (size
+ pairs
+ (byteorder
== 0)));
1721 p
= (unsigned char *)PyString_AS_STRING(v
);
1727 if (byteorder
== -1) {
1732 else if (byteorder
== 1) {
1738 while (size
-- > 0) {
1739 Py_UNICODE ch
= *s
++;
1741 #ifdef Py_UNICODE_WIDE
1742 if (ch
>= 0x10000) {
1743 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
1744 ch
= 0xD800 | ((ch
-0x10000) >> 10);
1755 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
1757 if (!PyUnicode_Check(unicode
)) {
1758 PyErr_BadArgument();
1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
1762 PyUnicode_GET_SIZE(unicode
),
1767 /* --- Unicode Escape Codec ----------------------------------------------- */
1769 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
1771 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
1775 const char *starts
= s
;
1776 Py_ssize_t startinpos
;
1777 Py_ssize_t endinpos
;
1784 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
1785 PyObject
*errorHandler
= NULL
;
1786 PyObject
*exc
= NULL
;
1788 /* Escaped strings will always be longer than the resulting
1789 Unicode string, so we start with size here and then reduce the
1790 length after conversion to the true value.
1791 (but if the error callback returns a long replacement string
1792 we'll have to allocate more space) */
1793 v
= _PyUnicode_New(size
);
1797 return (PyObject
*)v
;
1799 p
= PyUnicode_AS_UNICODE(v
);
1807 /* Non-escape characters are interpreted as Unicode ordinals */
1809 *p
++ = (unsigned char) *s
++;
1813 startinpos
= s
-starts
;
1820 case '\\': *p
++ = '\\'; break;
1821 case '\'': *p
++ = '\''; break;
1822 case '\"': *p
++ = '\"'; break;
1823 case 'b': *p
++ = '\b'; break;
1824 case 'f': *p
++ = '\014'; break; /* FF */
1825 case 't': *p
++ = '\t'; break;
1826 case 'n': *p
++ = '\n'; break;
1827 case 'r': *p
++ = '\r'; break;
1828 case 'v': *p
++ = '\013'; break; /* VT */
1829 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
1831 /* \OOO (octal) escapes */
1832 case '0': case '1': case '2': case '3':
1833 case '4': case '5': case '6': case '7':
1835 if ('0' <= *s
&& *s
<= '7') {
1836 x
= (x
<<3) + *s
++ - '0';
1837 if ('0' <= *s
&& *s
<= '7')
1838 x
= (x
<<3) + *s
++ - '0';
1847 message
= "truncated \\xXX escape";
1853 message
= "truncated \\uXXXX escape";
1859 message
= "truncated \\UXXXXXXXX escape";
1862 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1865 if (unicode_decode_call_errorhandler(
1866 errors
, &errorHandler
,
1867 "unicodeescape", "end of string in escape sequence",
1868 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1869 (PyObject
**)&v
, &outpos
, &p
))
1873 for (i
= 0; i
< digits
; ++i
) {
1874 c
= (unsigned char) s
[i
];
1876 endinpos
= (s
+i
+1)-starts
;
1877 if (unicode_decode_call_errorhandler(
1878 errors
, &errorHandler
,
1879 "unicodeescape", message
,
1880 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1881 (PyObject
**)&v
, &outpos
, &p
))
1885 chr
= (chr
<<4) & ~0xF;
1886 if (c
>= '0' && c
<= '9')
1888 else if (c
>= 'a' && c
<= 'f')
1889 chr
+= 10 + c
- 'a';
1891 chr
+= 10 + c
- 'A';
1894 if (chr
== 0xffffffff && PyErr_Occurred())
1895 /* _decoding_error will have already written into the
1899 /* when we get here, chr is a 32-bit unicode character */
1901 /* UCS-2 character */
1902 *p
++ = (Py_UNICODE
) chr
;
1903 else if (chr
<= 0x10ffff) {
1904 /* UCS-4 character. Either store directly, or as
1906 #ifdef Py_UNICODE_WIDE
1910 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
1911 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
1914 endinpos
= s
-starts
;
1915 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1916 if (unicode_decode_call_errorhandler(
1917 errors
, &errorHandler
,
1918 "unicodeescape", "illegal Unicode character",
1919 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1920 (PyObject
**)&v
, &outpos
, &p
))
1927 message
= "malformed \\N character escape";
1928 if (ucnhash_CAPI
== NULL
) {
1929 /* load the unicode data module */
1931 m
= PyImport_ImportModule("unicodedata");
1934 api
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
1938 ucnhash_CAPI
= (_PyUnicode_Name_CAPI
*)PyCObject_AsVoidPtr(api
);
1940 if (ucnhash_CAPI
== NULL
)
1944 const char *start
= s
+1;
1945 /* look for the closing brace */
1946 while (*s
!= '}' && s
< end
)
1948 if (s
> start
&& s
< end
&& *s
== '}') {
1949 /* found a name. look it up in the unicode database */
1950 message
= "unknown Unicode character name";
1952 if (ucnhash_CAPI
->getcode(NULL
, start
, (int)(s
-start
-1), &chr
))
1956 endinpos
= s
-starts
;
1957 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1958 if (unicode_decode_call_errorhandler(
1959 errors
, &errorHandler
,
1960 "unicodeescape", message
,
1961 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1962 (PyObject
**)&v
, &outpos
, &p
))
1968 message
= "\\ at end of string";
1970 endinpos
= s
-starts
;
1971 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1972 if (unicode_decode_call_errorhandler(
1973 errors
, &errorHandler
,
1974 "unicodeescape", message
,
1975 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1976 (PyObject
**)&v
, &outpos
, &p
))
1981 *p
++ = (unsigned char)s
[-1];
1988 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
1990 Py_XDECREF(errorHandler
);
1992 return (PyObject
*)v
;
1997 "\\N escapes not supported (can't load unicodedata module)"
2000 Py_XDECREF(errorHandler
);
2006 Py_XDECREF(errorHandler
);
2011 /* Return a Unicode-Escape string version of the Unicode object.
2013 If quotes is true, the string is enclosed in u"" or u'' quotes as
2018 Py_LOCAL_INLINE(const Py_UNICODE
*) findchar(const Py_UNICODE
*s
,
2022 /* like wcschr, but doesn't stop at NUL characters */
2024 while (size
-- > 0) {
2034 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
2041 static const char *hexdigit
= "0123456789abcdef";
2043 /* XXX(nnorwitz): rather than over-allocating, it would be
2044 better to choose a different scheme. Perhaps scan the
2045 first N-chars of the string and allocate based on that size.
2047 /* Initial allocation is based on the longest-possible unichr
2050 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2051 unichr, so in this case it's the longest unichr escape. In
2052 narrow (UTF-16) builds this is five chars per source unichr
2053 since there are two unichrs in the surrogate pair, so in narrow
2054 (UTF-16) builds it's not the longest unichr escape.
2056 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2057 so in the narrow (UTF-16) build case it's the longest unichr
2061 repr
= PyString_FromStringAndSize(NULL
,
2063 #ifdef Py_UNICODE_WIDE
2072 p
= PyString_AS_STRING(repr
);
2076 *p
++ = (findchar(s
, size
, '\'') &&
2077 !findchar(s
, size
, '"')) ? '"' : '\'';
2079 while (size
-- > 0) {
2080 Py_UNICODE ch
= *s
++;
2082 /* Escape quotes and backslashes */
2084 ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1]) || ch
== '\\') {
2090 #ifdef Py_UNICODE_WIDE
2091 /* Map 21-bit characters to '\U00xxxxxx' */
2092 else if (ch
>= 0x10000) {
2095 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
2096 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
2097 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
2098 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
2099 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
2100 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
2101 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
2102 *p
++ = hexdigit
[ch
& 0x0000000F];
2106 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2107 else if (ch
>= 0xD800 && ch
< 0xDC00) {
2113 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
2114 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
2117 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
2118 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
2119 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
2120 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
2121 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
2122 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
2123 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
2124 *p
++ = hexdigit
[ucs
& 0x0000000F];
2127 /* Fall through: isolated surrogates are copied as-is */
2133 /* Map 16-bit characters to '\uxxxx' */
2137 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
2138 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
2139 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
2140 *p
++ = hexdigit
[ch
& 0x000F];
2143 /* Map special whitespace to '\t', '\n', '\r' */
2144 else if (ch
== '\t') {
2148 else if (ch
== '\n') {
2152 else if (ch
== '\r') {
2157 /* Map non-printable US ASCII to '\xhh' */
2158 else if (ch
< ' ' || ch
>= 0x7F) {
2161 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
2162 *p
++ = hexdigit
[ch
& 0x000F];
2165 /* Copy everything else as-is */
2170 *p
++ = PyString_AS_STRING(repr
)[1];
2173 _PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
));
2177 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
2180 return unicodeescape_string(s
, size
, 0);
2183 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
2185 if (!PyUnicode_Check(unicode
)) {
2186 PyErr_BadArgument();
2189 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
2190 PyUnicode_GET_SIZE(unicode
));
2193 /* --- Raw Unicode Escape Codec ------------------------------------------- */
2195 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
2199 const char *starts
= s
;
2200 Py_ssize_t startinpos
;
2201 Py_ssize_t endinpos
;
2207 PyObject
*errorHandler
= NULL
;
2208 PyObject
*exc
= NULL
;
2210 /* Escaped strings will always be longer than the resulting
2211 Unicode string, so we start with size here and then reduce the
2212 length after conversion to the true value. (But decoding error
2213 handler might have to resize the string) */
2214 v
= _PyUnicode_New(size
);
2218 return (PyObject
*)v
;
2219 p
= PyUnicode_AS_UNICODE(v
);
2227 /* Non-escape characters are interpreted as Unicode ordinals */
2229 *p
++ = (unsigned char)*s
++;
2232 startinpos
= s
-starts
;
2234 /* \u-escapes are only interpreted iff the number of leading
2235 backslashes is odd */
2240 *p
++ = (unsigned char)*s
++;
2242 if (((s
- bs
) & 1) == 0 ||
2244 (*s
!= 'u' && *s
!= 'U')) {
2248 count
= *s
=='u' ? 4 : 8;
2251 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2252 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2253 for (x
= 0, i
= 0; i
< count
; ++i
, ++s
) {
2254 c
= (unsigned char)*s
;
2256 endinpos
= s
-starts
;
2257 if (unicode_decode_call_errorhandler(
2258 errors
, &errorHandler
,
2259 "rawunicodeescape", "truncated \\uXXXX",
2260 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2261 (PyObject
**)&v
, &outpos
, &p
))
2266 if (c
>= '0' && c
<= '9')
2268 else if (c
>= 'a' && c
<= 'f')
2273 #ifndef Py_UNICODE_WIDE
2275 if (unicode_decode_call_errorhandler(
2276 errors
, &errorHandler
,
2277 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2278 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2279 (PyObject
**)&v
, &outpos
, &p
))
2287 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2289 Py_XDECREF(errorHandler
);
2291 return (PyObject
*)v
;
2295 Py_XDECREF(errorHandler
);
2300 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
2307 static const char *hexdigit
= "0123456789abcdef";
2309 #ifdef Py_UNICODE_WIDE
2310 repr
= PyString_FromStringAndSize(NULL
, 10 * size
);
2312 repr
= PyString_FromStringAndSize(NULL
, 6 * size
);
2319 p
= q
= PyString_AS_STRING(repr
);
2320 while (size
-- > 0) {
2321 Py_UNICODE ch
= *s
++;
2322 #ifdef Py_UNICODE_WIDE
2323 /* Map 32-bit characters to '\Uxxxxxxxx' */
2324 if (ch
>= 0x10000) {
2327 *p
++ = hexdigit
[(ch
>> 28) & 0xf];
2328 *p
++ = hexdigit
[(ch
>> 24) & 0xf];
2329 *p
++ = hexdigit
[(ch
>> 20) & 0xf];
2330 *p
++ = hexdigit
[(ch
>> 16) & 0xf];
2331 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
2332 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
2333 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
2334 *p
++ = hexdigit
[ch
& 15];
2338 /* Map 16-bit characters to '\uxxxx' */
2342 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
2343 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
2344 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
2345 *p
++ = hexdigit
[ch
& 15];
2347 /* Copy everything else as-is */
2352 _PyString_Resize(&repr
, p
- q
);
2356 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
2358 if (!PyUnicode_Check(unicode
)) {
2359 PyErr_BadArgument();
2362 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
2363 PyUnicode_GET_SIZE(unicode
));
2366 /* --- Unicode Internal Codec ------------------------------------------- */
2368 PyObject
*_PyUnicode_DecodeUnicodeInternal(const char *s
,
2372 const char *starts
= s
;
2373 Py_ssize_t startinpos
;
2374 Py_ssize_t endinpos
;
2380 PyObject
*errorHandler
= NULL
;
2381 PyObject
*exc
= NULL
;
2383 #ifdef Py_UNICODE_WIDE
2384 Py_UNICODE unimax
= PyUnicode_GetMax();
2387 /* XXX overflow detection missing */
2388 v
= _PyUnicode_New((size
+Py_UNICODE_SIZE
-1)/ Py_UNICODE_SIZE
);
2391 if (PyUnicode_GetSize((PyObject
*)v
) == 0)
2392 return (PyObject
*)v
;
2393 p
= PyUnicode_AS_UNICODE(v
);
2397 memcpy(p
, s
, sizeof(Py_UNICODE
));
2398 /* We have to sanity check the raw data, otherwise doom looms for
2399 some malformed UCS-4 data. */
2401 #ifdef Py_UNICODE_WIDE
2402 *p
> unimax
|| *p
< 0 ||
2404 end
-s
< Py_UNICODE_SIZE
2407 startinpos
= s
- starts
;
2408 if (end
-s
< Py_UNICODE_SIZE
) {
2409 endinpos
= end
-starts
;
2410 reason
= "truncated input";
2413 endinpos
= s
- starts
+ Py_UNICODE_SIZE
;
2414 reason
= "illegal code point (> 0x10FFFF)";
2416 outpos
= p
- PyUnicode_AS_UNICODE(v
);
2417 if (unicode_decode_call_errorhandler(
2418 errors
, &errorHandler
,
2419 "unicode_internal", reason
,
2420 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2421 (PyObject
**)&v
, &outpos
, &p
)) {
2427 s
+= Py_UNICODE_SIZE
;
2431 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2433 Py_XDECREF(errorHandler
);
2435 return (PyObject
*)v
;
2439 Py_XDECREF(errorHandler
);
2444 /* --- Latin-1 Codec ------------------------------------------------------ */
2446 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
2453 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2455 Py_UNICODE r
= *(unsigned char*)s
;
2456 return PyUnicode_FromUnicode(&r
, 1);
2459 v
= _PyUnicode_New(size
);
2463 return (PyObject
*)v
;
2464 p
= PyUnicode_AS_UNICODE(v
);
2466 *p
++ = (unsigned char)*s
++;
2467 return (PyObject
*)v
;
2474 /* create or adjust a UnicodeEncodeError */
2475 static void make_encode_exception(PyObject
**exceptionObject
,
2476 const char *encoding
,
2477 const Py_UNICODE
*unicode
, Py_ssize_t size
,
2478 Py_ssize_t startpos
, Py_ssize_t endpos
,
2481 if (*exceptionObject
== NULL
) {
2482 *exceptionObject
= PyUnicodeEncodeError_Create(
2483 encoding
, unicode
, size
, startpos
, endpos
, reason
);
2486 if (PyUnicodeEncodeError_SetStart(*exceptionObject
, startpos
))
2488 if (PyUnicodeEncodeError_SetEnd(*exceptionObject
, endpos
))
2490 if (PyUnicodeEncodeError_SetReason(*exceptionObject
, reason
))
2494 Py_DECREF(*exceptionObject
);
2495 *exceptionObject
= NULL
;
2499 /* raises a UnicodeEncodeError */
2500 static void raise_encode_exception(PyObject
**exceptionObject
,
2501 const char *encoding
,
2502 const Py_UNICODE
*unicode
, Py_ssize_t size
,
2503 Py_ssize_t startpos
, Py_ssize_t endpos
,
2506 make_encode_exception(exceptionObject
,
2507 encoding
, unicode
, size
, startpos
, endpos
, reason
);
2508 if (*exceptionObject
!= NULL
)
2509 PyCodec_StrictErrors(*exceptionObject
);
2512 /* error handling callback helper:
2513 build arguments, call the callback and check the arguments,
2514 put the result into newpos and return the replacement string, which
2515 has to be freed by the caller */
2516 static PyObject
*unicode_encode_call_errorhandler(const char *errors
,
2517 PyObject
**errorHandler
,
2518 const char *encoding
, const char *reason
,
2519 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
2520 Py_ssize_t startpos
, Py_ssize_t endpos
,
2523 static char *argparse
= "O!n;encoding error handler must return (unicode, int) tuple";
2526 PyObject
*resunicode
;
2528 if (*errorHandler
== NULL
) {
2529 *errorHandler
= PyCodec_LookupError(errors
);
2530 if (*errorHandler
== NULL
)
2534 make_encode_exception(exceptionObject
,
2535 encoding
, unicode
, size
, startpos
, endpos
, reason
);
2536 if (*exceptionObject
== NULL
)
2539 restuple
= PyObject_CallFunctionObjArgs(
2540 *errorHandler
, *exceptionObject
, NULL
);
2541 if (restuple
== NULL
)
2543 if (!PyTuple_Check(restuple
)) {
2544 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
2545 Py_DECREF(restuple
);
2548 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
2549 &resunicode
, newpos
)) {
2550 Py_DECREF(restuple
);
2554 *newpos
= size
+*newpos
;
2555 if (*newpos
<0 || *newpos
>size
) {
2556 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
2557 Py_DECREF(restuple
);
2560 Py_INCREF(resunicode
);
2561 Py_DECREF(restuple
);
2565 static PyObject
*unicode_encode_ucs1(const Py_UNICODE
*p
,
2572 /* pointers to the beginning and end+1 of input */
2573 const Py_UNICODE
*startp
= p
;
2574 const Py_UNICODE
*endp
= p
+ size
;
2575 /* pointer to the beginning of the unencodable characters */
2576 /* const Py_UNICODE *badp = NULL; */
2577 /* pointer into the output */
2579 /* current output position */
2580 Py_ssize_t respos
= 0;
2582 const char *encoding
= (limit
== 256) ? "latin-1" : "ascii";
2583 const char *reason
= (limit
== 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2584 PyObject
*errorHandler
= NULL
;
2585 PyObject
*exc
= NULL
;
2586 /* the following variable is used for caching string comparisons
2587 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2588 int known_errorHandler
= -1;
2590 /* allocate enough for a simple encoding without
2591 replacements, if we need more, we'll resize */
2592 res
= PyString_FromStringAndSize(NULL
, size
);
2597 str
= PyString_AS_STRING(res
);
2603 /* can we encode this? */
2605 /* no overflow check, because we know that the space is enough */
2610 Py_ssize_t unicodepos
= p
-startp
;
2611 Py_ssize_t requiredsize
;
2612 PyObject
*repunicode
;
2617 /* startpos for collecting unencodable chars */
2618 const Py_UNICODE
*collstart
= p
;
2619 const Py_UNICODE
*collend
= p
;
2620 /* find all unencodable characters */
2621 while ((collend
< endp
) && ((*collend
)>=limit
))
2623 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2624 if (known_errorHandler
==-1) {
2625 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
2626 known_errorHandler
= 1;
2627 else if (!strcmp(errors
, "replace"))
2628 known_errorHandler
= 2;
2629 else if (!strcmp(errors
, "ignore"))
2630 known_errorHandler
= 3;
2631 else if (!strcmp(errors
, "xmlcharrefreplace"))
2632 known_errorHandler
= 4;
2634 known_errorHandler
= 0;
2636 switch (known_errorHandler
) {
2637 case 1: /* strict */
2638 raise_encode_exception(&exc
, encoding
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
2640 case 2: /* replace */
2641 while (collstart
++<collend
)
2642 *str
++ = '?'; /* fall through */
2643 case 3: /* ignore */
2646 case 4: /* xmlcharrefreplace */
2647 respos
= str
-PyString_AS_STRING(res
);
2648 /* determine replacement size (temporarily (mis)uses p) */
2649 for (p
= collstart
, repsize
= 0; p
< collend
; ++p
) {
2658 #ifndef Py_UNICODE_WIDE
2664 else if (*p
<1000000)
2670 requiredsize
= respos
+repsize
+(endp
-collend
);
2671 if (requiredsize
> ressize
) {
2672 if (requiredsize
<2*ressize
)
2673 requiredsize
= 2*ressize
;
2674 if (_PyString_Resize(&res
, requiredsize
))
2676 str
= PyString_AS_STRING(res
) + respos
;
2677 ressize
= requiredsize
;
2679 /* generate replacement (temporarily (mis)uses p) */
2680 for (p
= collstart
; p
< collend
; ++p
) {
2681 str
+= sprintf(str
, "&#%d;", (int)*p
);
2686 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
2687 encoding
, reason
, startp
, size
, &exc
,
2688 collstart
-startp
, collend
-startp
, &newpos
);
2689 if (repunicode
== NULL
)
2691 /* need more space? (at least enough for what we
2692 have+the replacement+the rest of the string, so
2693 we won't have to check space for encodable characters) */
2694 respos
= str
-PyString_AS_STRING(res
);
2695 repsize
= PyUnicode_GET_SIZE(repunicode
);
2696 requiredsize
= respos
+repsize
+(endp
-collend
);
2697 if (requiredsize
> ressize
) {
2698 if (requiredsize
<2*ressize
)
2699 requiredsize
= 2*ressize
;
2700 if (_PyString_Resize(&res
, requiredsize
)) {
2701 Py_DECREF(repunicode
);
2704 str
= PyString_AS_STRING(res
) + respos
;
2705 ressize
= requiredsize
;
2707 /* check if there is anything unencodable in the replacement
2708 and copy it to the output */
2709 for (uni2
= PyUnicode_AS_UNICODE(repunicode
);repsize
-->0; ++uni2
, ++str
) {
2712 raise_encode_exception(&exc
, encoding
, startp
, size
,
2713 unicodepos
, unicodepos
+1, reason
);
2714 Py_DECREF(repunicode
);
2719 p
= startp
+ newpos
;
2720 Py_DECREF(repunicode
);
2724 /* Resize if we allocated too much */
2725 respos
= str
-PyString_AS_STRING(res
);
2727 /* If this fails res will be NULL */
2728 _PyString_Resize(&res
, respos
);
2729 Py_XDECREF(errorHandler
);
2735 Py_XDECREF(errorHandler
);
2740 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
2744 return unicode_encode_ucs1(p
, size
, errors
, 256);
2747 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
2749 if (!PyUnicode_Check(unicode
)) {
2750 PyErr_BadArgument();
2753 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
2754 PyUnicode_GET_SIZE(unicode
),
2758 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2760 PyObject
*PyUnicode_DecodeASCII(const char *s
,
2764 const char *starts
= s
;
2767 Py_ssize_t startinpos
;
2768 Py_ssize_t endinpos
;
2771 PyObject
*errorHandler
= NULL
;
2772 PyObject
*exc
= NULL
;
2774 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2775 if (size
== 1 && *(unsigned char*)s
< 128) {
2776 Py_UNICODE r
= *(unsigned char*)s
;
2777 return PyUnicode_FromUnicode(&r
, 1);
2780 v
= _PyUnicode_New(size
);
2784 return (PyObject
*)v
;
2785 p
= PyUnicode_AS_UNICODE(v
);
2788 register unsigned char c
= (unsigned char)*s
;
2794 startinpos
= s
-starts
;
2795 endinpos
= startinpos
+ 1;
2796 outpos
= p
- (Py_UNICODE
*)PyUnicode_AS_UNICODE(v
);
2797 if (unicode_decode_call_errorhandler(
2798 errors
, &errorHandler
,
2799 "ascii", "ordinal not in range(128)",
2800 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2801 (PyObject
**)&v
, &outpos
, &p
))
2805 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
2806 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2808 Py_XDECREF(errorHandler
);
2810 return (PyObject
*)v
;
2814 Py_XDECREF(errorHandler
);
2819 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
2823 return unicode_encode_ucs1(p
, size
, errors
, 128);
2826 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
2828 if (!PyUnicode_Check(unicode
)) {
2829 PyErr_BadArgument();
2832 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
2833 PyUnicode_GET_SIZE(unicode
),
2837 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2839 /* --- MBCS codecs for Windows -------------------------------------------- */
2841 #if SIZEOF_INT < SIZEOF_SSIZE_T
2845 /* XXX This code is limited to "true" double-byte encodings, as
2846 a) it assumes an incomplete character consists of a single byte, and
2847 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2848 encodings, see IsDBCSLeadByteEx documentation. */
2850 static int is_dbcs_lead_byte(const char *s
, int offset
)
2852 const char *curr
= s
+ offset
;
2854 if (IsDBCSLeadByte(*curr
)) {
2855 const char *prev
= CharPrev(s
, curr
);
2856 return (prev
== curr
) || !IsDBCSLeadByte(*prev
) || (curr
- prev
== 2);
2862 * Decode MBCS string into unicode object. If 'final' is set, converts
2863 * trailing lead-byte too. Returns consumed size on success, -1 otherwise.
2865 static int decode_mbcs(PyUnicodeObject
**v
,
2866 const char *s
, /* MBCS string */
2867 int size
, /* sizeof MBCS string */
2876 /* Skip trailing lead-byte unless 'final' is set */
2877 if (!final
&& size
>= 1 && is_dbcs_lead_byte(s
, size
- 1))
2880 /* First get the size of the result */
2882 usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
2884 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2890 /* Create unicode object */
2891 *v
= _PyUnicode_New(usize
);
2896 /* Extend unicode object */
2897 n
= PyUnicode_GET_SIZE(*v
);
2898 if (_PyUnicode_Resize(v
, n
+ usize
) < 0)
2902 /* Do the conversion */
2904 p
= PyUnicode_AS_UNICODE(*v
) + n
;
2905 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
2906 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2914 PyObject
*PyUnicode_DecodeMBCSStateful(const char *s
,
2917 Py_ssize_t
*consumed
)
2919 PyUnicodeObject
*v
= NULL
;
2928 done
= decode_mbcs(&v
, s
, INT_MAX
, 0);
2931 done
= decode_mbcs(&v
, s
, (int)size
, !consumed
);
2942 if (size
> INT_MAX
) {
2949 return (PyObject
*)v
;
2952 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
2956 return PyUnicode_DecodeMBCSStateful(s
, size
, errors
, NULL
);
2960 * Convert unicode into string object (MBCS).
2961 * Returns 0 on success, -1 otherwise.
2963 static int encode_mbcs(PyObject
**repr
,
2964 const Py_UNICODE
*p
, /* unicode */
2965 int size
) /* size of unicode */
2972 /* First get the size of the result */
2974 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
2975 if (mbcssize
== 0) {
2976 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2981 if (*repr
== NULL
) {
2982 /* Create string object */
2983 *repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
2988 /* Extend string object */
2989 n
= PyString_Size(*repr
);
2990 if (_PyString_Resize(repr
, n
+ mbcssize
) < 0)
2994 /* Do the conversion */
2996 char *s
= PyString_AS_STRING(*repr
) + n
;
2997 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
2998 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3006 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
3010 PyObject
*repr
= NULL
;
3016 ret
= encode_mbcs(&repr
, p
, INT_MAX
);
3019 ret
= encode_mbcs(&repr
, p
, (int)size
);
3027 if (size
> INT_MAX
) {
3037 PyObject
*PyUnicode_AsMBCSString(PyObject
*unicode
)
3039 if (!PyUnicode_Check(unicode
)) {
3040 PyErr_BadArgument();
3043 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode
),
3044 PyUnicode_GET_SIZE(unicode
),
3050 #endif /* MS_WINDOWS */
3052 /* --- Character Mapping Codec -------------------------------------------- */
3054 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
3059 const char *starts
= s
;
3060 Py_ssize_t startinpos
;
3061 Py_ssize_t endinpos
;
3066 Py_ssize_t extrachars
= 0;
3067 PyObject
*errorHandler
= NULL
;
3068 PyObject
*exc
= NULL
;
3069 Py_UNICODE
*mapstring
= NULL
;
3070 Py_ssize_t maplen
= 0;
3072 /* Default to Latin-1 */
3073 if (mapping
== NULL
)
3074 return PyUnicode_DecodeLatin1(s
, size
, errors
);
3076 v
= _PyUnicode_New(size
);
3080 return (PyObject
*)v
;
3081 p
= PyUnicode_AS_UNICODE(v
);
3083 if (PyUnicode_CheckExact(mapping
)) {
3084 mapstring
= PyUnicode_AS_UNICODE(mapping
);
3085 maplen
= PyUnicode_GET_SIZE(mapping
);
3087 unsigned char ch
= *s
;
3088 Py_UNICODE x
= 0xfffe; /* illegal value */
3094 /* undefined mapping */
3095 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3096 startinpos
= s
-starts
;
3097 endinpos
= startinpos
+1;
3098 if (unicode_decode_call_errorhandler(
3099 errors
, &errorHandler
,
3100 "charmap", "character maps to <undefined>",
3101 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3102 (PyObject
**)&v
, &outpos
, &p
)) {
3113 unsigned char ch
= *s
;
3116 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3117 w
= PyInt_FromLong((long)ch
);
3120 x
= PyObject_GetItem(mapping
, w
);
3123 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
3124 /* No mapping found means: mapping is undefined. */
3133 if (PyInt_Check(x
)) {
3134 long value
= PyInt_AS_LONG(x
);
3135 if (value
< 0 || value
> 65535) {
3136 PyErr_SetString(PyExc_TypeError
,
3137 "character mapping must be in range(65536)");
3141 *p
++ = (Py_UNICODE
)value
;
3143 else if (x
== Py_None
) {
3144 /* undefined mapping */
3145 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3146 startinpos
= s
-starts
;
3147 endinpos
= startinpos
+1;
3148 if (unicode_decode_call_errorhandler(
3149 errors
, &errorHandler
,
3150 "charmap", "character maps to <undefined>",
3151 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3152 (PyObject
**)&v
, &outpos
, &p
)) {
3159 else if (PyUnicode_Check(x
)) {
3160 Py_ssize_t targetsize
= PyUnicode_GET_SIZE(x
);
3162 if (targetsize
== 1)
3164 *p
++ = *PyUnicode_AS_UNICODE(x
);
3166 else if (targetsize
> 1) {
3168 if (targetsize
> extrachars
) {
3170 Py_ssize_t oldpos
= p
- PyUnicode_AS_UNICODE(v
);
3171 Py_ssize_t needed
= (targetsize
- extrachars
) + \
3173 extrachars
+= needed
;
3174 /* XXX overflow detection missing */
3175 if (_PyUnicode_Resize(&v
,
3176 PyUnicode_GET_SIZE(v
) + needed
) < 0) {
3180 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
3183 PyUnicode_AS_UNICODE(x
),
3186 extrachars
-= targetsize
;
3188 /* 1-0 mapping: skip the character */
3191 /* wrong return value */
3192 PyErr_SetString(PyExc_TypeError
,
3193 "character mapping must return integer, None or unicode");
3201 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
3202 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3204 Py_XDECREF(errorHandler
);
3206 return (PyObject
*)v
;
3209 Py_XDECREF(errorHandler
);
3215 /* Charmap encoding: the lookup table */
3217 struct encoding_map
{
3219 unsigned char level1
[32];
3221 unsigned char level23
[1];
3225 encoding_map_size(PyObject
*obj
, PyObject
* args
)
3227 struct encoding_map
*map
= (struct encoding_map
*)obj
;
3228 return PyInt_FromLong(sizeof(*map
) - 1 + 16*map
->count2
+
3232 static PyMethodDef encoding_map_methods
[] = {
3233 {"size", encoding_map_size
, METH_NOARGS
,
3234 PyDoc_STR("Return the size (in bytes) of this object") },
3239 encoding_map_dealloc(PyObject
* o
)
3244 static PyTypeObject EncodingMapType
= {
3245 PyObject_HEAD_INIT(NULL
)
3247 "EncodingMap", /*tp_name*/
3248 sizeof(struct encoding_map
), /*tp_basicsize*/
3251 encoding_map_dealloc
, /*tp_dealloc*/
3258 0, /*tp_as_sequence*/
3259 0, /*tp_as_mapping*/
3266 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
3270 0, /*tp_richcompare*/
3271 0, /*tp_weaklistoffset*/
3274 encoding_map_methods
, /*tp_methods*/
3281 0, /*tp_dictoffset*/
3290 PyUnicode_BuildEncodingMap(PyObject
* string
)
3294 struct encoding_map
*mresult
;
3297 unsigned char level1
[32];
3298 unsigned char level2
[512];
3299 unsigned char *mlevel1
, *mlevel2
, *mlevel3
;
3300 int count2
= 0, count3
= 0;
3302 if (!PyUnicode_Check(string
) || PyUnicode_GetSize(string
) != 256) {
3303 PyErr_BadArgument();
3306 decode
= PyUnicode_AS_UNICODE(string
);
3307 memset(level1
, 0xFF, sizeof level1
);
3308 memset(level2
, 0xFF, sizeof level2
);
3310 /* If there isn't a one-to-one mapping of NULL to \0,
3311 or if there are non-BMP characters, we need to use
3312 a mapping dictionary. */
3315 for (i
= 1; i
< 256; i
++) {
3318 #ifdef Py_UNICODE_WIDE
3319 || decode
[i
] > 0xFFFF
3325 if (decode
[i
] == 0xFFFE)
3326 /* unmapped character */
3328 l1
= decode
[i
] >> 11;
3329 l2
= decode
[i
] >> 7;
3330 if (level1
[l1
] == 0xFF)
3331 level1
[l1
] = count2
++;
3332 if (level2
[l2
] == 0xFF)
3333 level2
[l2
] = count3
++;
3336 if (count2
>= 0xFF || count3
>= 0xFF)
3340 PyObject
*result
= PyDict_New();
3341 PyObject
*key
, *value
;
3344 for (i
= 0; i
< 256; i
++) {
3346 key
= PyInt_FromLong(decode
[i
]);
3347 value
= PyInt_FromLong(i
);
3350 if (PyDict_SetItem(result
, key
, value
) == -1)
3363 /* Create a three-level trie */
3364 result
= PyObject_MALLOC(sizeof(struct encoding_map
) +
3365 16*count2
+ 128*count3
- 1);
3367 return PyErr_NoMemory();
3368 PyObject_Init(result
, &EncodingMapType
);
3369 mresult
= (struct encoding_map
*)result
;
3370 mresult
->count2
= count2
;
3371 mresult
->count3
= count3
;
3372 mlevel1
= mresult
->level1
;
3373 mlevel2
= mresult
->level23
;
3374 mlevel3
= mresult
->level23
+ 16*count2
;
3375 memcpy(mlevel1
, level1
, 32);
3376 memset(mlevel2
, 0xFF, 16*count2
);
3377 memset(mlevel3
, 0, 128*count3
);
3379 for (i
= 1; i
< 256; i
++) {
3380 int o1
, o2
, o3
, i2
, i3
;
3381 if (decode
[i
] == 0xFFFE)
3382 /* unmapped character */
3385 o2
= (decode
[i
]>>7) & 0xF;
3386 i2
= 16*mlevel1
[o1
] + o2
;
3387 if (mlevel2
[i2
] == 0xFF)
3388 mlevel2
[i2
] = count3
++;
3389 o3
= decode
[i
] & 0x7F;
3390 i3
= 128*mlevel2
[i2
] + o3
;
3397 encoding_map_lookup(Py_UNICODE c
, PyObject
*mapping
)
3399 struct encoding_map
*map
= (struct encoding_map
*)mapping
;
3401 int l2
= (c
>>7) & 0xF;
3405 #ifdef Py_UNICODE_WIDE
3413 i
= map
->level1
[l1
];
3418 i
= map
->level23
[16*i
+l2
];
3423 i
= map
->level23
[16*map
->count2
+ 128*i
+ l3
];
3430 /* Look up the character c in the mapping. If the character
3431 can't be found, Py_None is returned (or NULL, if another
3433 static PyObject
*charmapencode_lookup(Py_UNICODE c
, PyObject
*mapping
)
3435 PyObject
*w
= PyInt_FromLong((long)c
);
3440 x
= PyObject_GetItem(mapping
, w
);
3443 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
3444 /* No mapping found means: mapping is undefined. */
3452 else if (x
== Py_None
)
3454 else if (PyInt_Check(x
)) {
3455 long value
= PyInt_AS_LONG(x
);
3456 if (value
< 0 || value
> 255) {
3457 PyErr_SetString(PyExc_TypeError
,
3458 "character mapping must be in range(256)");
3464 else if (PyString_Check(x
))
3467 /* wrong return value */
3468 PyErr_SetString(PyExc_TypeError
,
3469 "character mapping must return integer, None or str");
3476 charmapencode_resize(PyObject
**outobj
, Py_ssize_t
*outpos
, Py_ssize_t requiredsize
)
3478 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
3479 /* exponentially overallocate to minimize reallocations */
3480 if (requiredsize
< 2*outsize
)
3481 requiredsize
= 2*outsize
;
3482 if (_PyString_Resize(outobj
, requiredsize
)) {
/* Status codes reported by charmapencode_output() to its callers. */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* no failure reported */
    enc_FAILED = 1,     /* unencodable character */
    enc_EXCEPTION = 2   /* an error occurred */
} charmapencode_result;
3491 /* Look up the character, put the result in the output string and adjust
3492 various state variables. Reallocate the output string if not enough
3493 space is available. Return enc_SUCCESS on success, enc_FAILED if the
3494 character is unencodable (the mapping was undefined, in which case no
3495 character was written), or enc_EXCEPTION if an error occurred
3496 (for example, a reallocation error). */
3498 charmapencode_result
charmapencode_output(Py_UNICODE c
, PyObject
*mapping
,
3499 PyObject
**outobj
, Py_ssize_t
*outpos
)
3503 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
3505 if (mapping
->ob_type
== &EncodingMapType
) {
3506 int res
= encoding_map_lookup(c
, mapping
);
3507 Py_ssize_t requiredsize
= *outpos
+1;
3510 if (outsize
<requiredsize
)
3511 if (!charmapencode_resize(outobj
, outpos
, requiredsize
))
3512 return enc_EXCEPTION
;
3513 outstart
= PyString_AS_STRING(*outobj
);
3514 outstart
[(*outpos
)++] = (char)res
;
3518 rep
= charmapencode_lookup(c
, mapping
);
3520 return enc_EXCEPTION
;
3521 else if (rep
==Py_None
) {
3525 if (PyInt_Check(rep
)) {
3526 Py_ssize_t requiredsize
= *outpos
+1;
3527 if (outsize
<requiredsize
)
3528 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
3530 return enc_EXCEPTION
;
3532 outstart
= PyString_AS_STRING(*outobj
);
3533 outstart
[(*outpos
)++] = (char)PyInt_AS_LONG(rep
);
3536 const char *repchars
= PyString_AS_STRING(rep
);
3537 Py_ssize_t repsize
= PyString_GET_SIZE(rep
);
3538 Py_ssize_t requiredsize
= *outpos
+repsize
;
3539 if (outsize
<requiredsize
)
3540 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
3542 return enc_EXCEPTION
;
3544 outstart
= PyString_AS_STRING(*outobj
);
3545 memcpy(outstart
+ *outpos
, repchars
, repsize
);
3553 /* handle an error in PyUnicode_EncodeCharmap
3554 Return 0 on success, -1 on error */
3556 int charmap_encoding_error(
3557 const Py_UNICODE
*p
, Py_ssize_t size
, Py_ssize_t
*inpos
, PyObject
*mapping
,
3558 PyObject
**exceptionObject
,
3559 int *known_errorHandler
, PyObject
**errorHandler
, const char *errors
,
3560 PyObject
**res
, Py_ssize_t
*respos
)
3562 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
3566 /* startpos for collecting unencodable chars */
3567 Py_ssize_t collstartpos
= *inpos
;
3568 Py_ssize_t collendpos
= *inpos
+1;
3570 char *encoding
= "charmap";
3571 char *reason
= "character maps to <undefined>";
3572 charmapencode_result x
;
3574 /* find all unencodable characters */
3575 while (collendpos
< size
) {
3577 if (mapping
->ob_type
== &EncodingMapType
) {
3578 int res
= encoding_map_lookup(p
[collendpos
], mapping
);
3585 rep
= charmapencode_lookup(p
[collendpos
], mapping
);
3588 else if (rep
!=Py_None
) {
3595 /* cache callback name lookup
3596 * (if not done yet, i.e. it's the first error) */
3597 if (*known_errorHandler
==-1) {
3598 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3599 *known_errorHandler
= 1;
3600 else if (!strcmp(errors
, "replace"))
3601 *known_errorHandler
= 2;
3602 else if (!strcmp(errors
, "ignore"))
3603 *known_errorHandler
= 3;
3604 else if (!strcmp(errors
, "xmlcharrefreplace"))
3605 *known_errorHandler
= 4;
3607 *known_errorHandler
= 0;
3609 switch (*known_errorHandler
) {
3610 case 1: /* strict */
3611 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
3613 case 2: /* replace */
3614 for (collpos
= collstartpos
; collpos
<collendpos
; ++collpos
) {
3615 x
= charmapencode_output('?', mapping
, res
, respos
);
3616 if (x
==enc_EXCEPTION
) {
3619 else if (x
==enc_FAILED
) {
3620 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
3625 case 3: /* ignore */
3626 *inpos
= collendpos
;
3628 case 4: /* xmlcharrefreplace */
3629 /* generate replacement */
3630 for (collpos
= collstartpos
; collpos
< collendpos
; ++collpos
) {
3631 char buffer
[2+29+1+1];
3633 sprintf(buffer
, "&#%d;", (int)p
[collpos
]);
3634 for (cp
= buffer
; *cp
; ++cp
) {
3635 x
= charmapencode_output(*cp
, mapping
, res
, respos
);
3636 if (x
==enc_EXCEPTION
)
3638 else if (x
==enc_FAILED
) {
3639 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
3644 *inpos
= collendpos
;
3647 repunicode
= unicode_encode_call_errorhandler(errors
, errorHandler
,
3648 encoding
, reason
, p
, size
, exceptionObject
,
3649 collstartpos
, collendpos
, &newpos
);
3650 if (repunicode
== NULL
)
3652 /* generate replacement */
3653 repsize
= PyUnicode_GET_SIZE(repunicode
);
3654 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
3655 x
= charmapencode_output(*uni2
, mapping
, res
, respos
);
3656 if (x
==enc_EXCEPTION
) {
3659 else if (x
==enc_FAILED
) {
3660 Py_DECREF(repunicode
);
3661 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
3666 Py_DECREF(repunicode
);
3671 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
3677 PyObject
*res
= NULL
;
3678 /* current input position */
3679 Py_ssize_t inpos
= 0;
3680 /* current output position */
3681 Py_ssize_t respos
= 0;
3682 PyObject
*errorHandler
= NULL
;
3683 PyObject
*exc
= NULL
;
3684 /* the following variable is used for caching string comparisons
3685 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3686 * 3=ignore, 4=xmlcharrefreplace */
3687 int known_errorHandler
= -1;
3689 /* Default to Latin-1 */
3690 if (mapping
== NULL
)
3691 return PyUnicode_EncodeLatin1(p
, size
, errors
);
3693 /* allocate enough for a simple encoding without
3694 replacements, if we need more, we'll resize */
3695 res
= PyString_FromStringAndSize(NULL
, size
);
3701 while (inpos
<size
) {
3702 /* try to encode it */
3703 charmapencode_result x
= charmapencode_output(p
[inpos
], mapping
, &res
, &respos
);
3704 if (x
==enc_EXCEPTION
) /* error */
3706 if (x
==enc_FAILED
) { /* unencodable character */
3707 if (charmap_encoding_error(p
, size
, &inpos
, mapping
,
3709 &known_errorHandler
, &errorHandler
, errors
,
3715 /* done with this character => adjust input position */
3719 /* Resize if we allocated too much */
3720 if (respos
<PyString_GET_SIZE(res
)) {
3721 if (_PyString_Resize(&res
, respos
))
3725 Py_XDECREF(errorHandler
);
3731 Py_XDECREF(errorHandler
);
3735 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
3738 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
3739 PyErr_BadArgument();
3742 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
3743 PyUnicode_GET_SIZE(unicode
),
3748 /* create or adjust a UnicodeTranslateError */
3749 static void make_translate_exception(PyObject
**exceptionObject
,
3750 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3751 Py_ssize_t startpos
, Py_ssize_t endpos
,
3754 if (*exceptionObject
== NULL
) {
3755 *exceptionObject
= PyUnicodeTranslateError_Create(
3756 unicode
, size
, startpos
, endpos
, reason
);
3759 if (PyUnicodeTranslateError_SetStart(*exceptionObject
, startpos
))
3761 if (PyUnicodeTranslateError_SetEnd(*exceptionObject
, endpos
))
3763 if (PyUnicodeTranslateError_SetReason(*exceptionObject
, reason
))
3767 Py_DECREF(*exceptionObject
);
3768 *exceptionObject
= NULL
;
3772 /* raises a UnicodeTranslateError */
3773 static void raise_translate_exception(PyObject
**exceptionObject
,
3774 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3775 Py_ssize_t startpos
, Py_ssize_t endpos
,
3778 make_translate_exception(exceptionObject
,
3779 unicode
, size
, startpos
, endpos
, reason
);
3780 if (*exceptionObject
!= NULL
)
3781 PyCodec_StrictErrors(*exceptionObject
);
3784 /* error handling callback helper:
3785 build arguments, call the callback and check the arguments,
3786 put the result into newpos and return the replacement string, which
3787 has to be freed by the caller */
3788 static PyObject
*unicode_translate_call_errorhandler(const char *errors
,
3789 PyObject
**errorHandler
,
3791 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
3792 Py_ssize_t startpos
, Py_ssize_t endpos
,
3795 static char *argparse
= "O!n;translating error handler must return (unicode, int) tuple";
3797 Py_ssize_t i_newpos
;
3799 PyObject
*resunicode
;
3801 if (*errorHandler
== NULL
) {
3802 *errorHandler
= PyCodec_LookupError(errors
);
3803 if (*errorHandler
== NULL
)
3807 make_translate_exception(exceptionObject
,
3808 unicode
, size
, startpos
, endpos
, reason
);
3809 if (*exceptionObject
== NULL
)
3812 restuple
= PyObject_CallFunctionObjArgs(
3813 *errorHandler
, *exceptionObject
, NULL
);
3814 if (restuple
== NULL
)
3816 if (!PyTuple_Check(restuple
)) {
3817 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
3818 Py_DECREF(restuple
);
3821 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
3822 &resunicode
, &i_newpos
)) {
3823 Py_DECREF(restuple
);
3827 *newpos
= size
+i_newpos
;
3830 if (*newpos
<0 || *newpos
>size
) {
3831 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
3832 Py_DECREF(restuple
);
3835 Py_INCREF(resunicode
);
3836 Py_DECREF(restuple
);
3840 /* Look up the character c in the mapping and put the result in *result,
3841 which must be decrefed by the caller.
3842 Return 0 on success, -1 on error */
3844 int charmaptranslate_lookup(Py_UNICODE c
, PyObject
*mapping
, PyObject
**result
)
3846 PyObject
*w
= PyInt_FromLong((long)c
);
3851 x
= PyObject_GetItem(mapping
, w
);
3854 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
3855 /* No mapping found means: use 1:1 mapping. */
3862 else if (x
== Py_None
) {
3866 else if (PyInt_Check(x
)) {
3867 long value
= PyInt_AS_LONG(x
);
3868 long max
= PyUnicode_GetMax();
3869 if (value
< 0 || value
> max
) {
3870 PyErr_Format(PyExc_TypeError
,
3871 "character mapping must be in range(0x%lx)", max
+1);
3878 else if (PyUnicode_Check(x
)) {
3883 /* wrong return value */
3884 PyErr_SetString(PyExc_TypeError
,
3885 "character mapping must return integer, None or unicode");
3890 /* ensure that *outobj is at least requiredsize characters long,
3891 if not reallocate and adjust various state variables.
3892 Return 0 on success, -1 on error */
3894 int charmaptranslate_makespace(PyObject
**outobj
, Py_UNICODE
**outp
,
3895 Py_ssize_t requiredsize
)
3897 Py_ssize_t oldsize
= PyUnicode_GET_SIZE(*outobj
);
3898 if (requiredsize
> oldsize
) {
3899 /* remember old output position */
3900 Py_ssize_t outpos
= *outp
-PyUnicode_AS_UNICODE(*outobj
);
3901 /* exponentially overallocate to minimize reallocations */
3902 if (requiredsize
< 2 * oldsize
)
3903 requiredsize
= 2 * oldsize
;
3904 if (_PyUnicode_Resize(outobj
, requiredsize
) < 0)
3906 *outp
= PyUnicode_AS_UNICODE(*outobj
) + outpos
;
3910 /* Look up the character, put the result in the output string and adjust
3911 various state variables. Return a new reference to the object that
3912 was put in the output buffer in *res, or Py_None, if the mapping was
3913 undefined (in which case no character was written).
3914 The caller must decref *res.
3915 Return 0 on success, -1 on error. */
3917 int charmaptranslate_output(const Py_UNICODE
*startinp
, const Py_UNICODE
*curinp
,
3918 Py_ssize_t insize
, PyObject
*mapping
, PyObject
**outobj
, Py_UNICODE
**outp
,
3921 if (charmaptranslate_lookup(*curinp
, mapping
, res
))
3924 /* not found => default to 1:1 mapping */
3925 *(*outp
)++ = *curinp
;
3927 else if (*res
==Py_None
)
3929 else if (PyInt_Check(*res
)) {
3930 /* no overflow check, because we know that the space is enough */
3931 *(*outp
)++ = (Py_UNICODE
)PyInt_AS_LONG(*res
);
3933 else if (PyUnicode_Check(*res
)) {
3934 Py_ssize_t repsize
= PyUnicode_GET_SIZE(*res
);
3936 /* no overflow check, because we know that the space is enough */
3937 *(*outp
)++ = *PyUnicode_AS_UNICODE(*res
);
3939 else if (repsize
!=0) {
3940 /* more than one character */
3941 Py_ssize_t requiredsize
= (*outp
-PyUnicode_AS_UNICODE(*outobj
)) +
3942 (insize
- (curinp
-startinp
)) +
3944 if (charmaptranslate_makespace(outobj
, outp
, requiredsize
))
3946 memcpy(*outp
, PyUnicode_AS_UNICODE(*res
), sizeof(Py_UNICODE
)*repsize
);
3955 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*p
,
3961 PyObject
*res
= NULL
;
3962 /* pointers to the beginning and end+1 of input */
3963 const Py_UNICODE
*startp
= p
;
3964 const Py_UNICODE
*endp
= p
+ size
;
3965 /* pointer into the output */
3967 /* current output position */
3968 Py_ssize_t respos
= 0;
3969 char *reason
= "character maps to <undefined>";
3970 PyObject
*errorHandler
= NULL
;
3971 PyObject
*exc
= NULL
;
3972 /* the following variable is used for caching string comparisons
3973 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3974 * 3=ignore, 4=xmlcharrefreplace */
3975 int known_errorHandler
= -1;
3977 if (mapping
== NULL
) {
3978 PyErr_BadArgument();
3982 /* allocate enough for a simple 1:1 translation without
3983 replacements, if we need more, we'll resize */
3984 res
= PyUnicode_FromUnicode(NULL
, size
);
3989 str
= PyUnicode_AS_UNICODE(res
);
3992 /* try to translate it */
3994 if (charmaptranslate_output(startp
, p
, size
, mapping
, &res
, &str
, &x
)) {
3999 if (x
!=Py_None
) /* it worked => adjust input pointer */
4001 else { /* untranslatable character */
4002 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
4006 /* startpos for collecting untranslatable chars */
4007 const Py_UNICODE
*collstart
= p
;
4008 const Py_UNICODE
*collend
= p
+1;
4009 const Py_UNICODE
*coll
;
4011 /* find all untranslatable characters */
4012 while (collend
< endp
) {
4013 if (charmaptranslate_lookup(*collend
, mapping
, &x
))
4020 /* cache callback name lookup
4021 * (if not done yet, i.e. it's the first error) */
4022 if (known_errorHandler
==-1) {
4023 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4024 known_errorHandler
= 1;
4025 else if (!strcmp(errors
, "replace"))
4026 known_errorHandler
= 2;
4027 else if (!strcmp(errors
, "ignore"))
4028 known_errorHandler
= 3;
4029 else if (!strcmp(errors
, "xmlcharrefreplace"))
4030 known_errorHandler
= 4;
4032 known_errorHandler
= 0;
4034 switch (known_errorHandler
) {
4035 case 1: /* strict */
4036 raise_translate_exception(&exc
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
4038 case 2: /* replace */
4039 /* No need to check for space, this is a 1:1 replacement */
4040 for (coll
= collstart
; coll
<collend
; ++coll
)
4043 case 3: /* ignore */
4046 case 4: /* xmlcharrefreplace */
4047 /* generate replacement (temporarily (mis)uses p) */
4048 for (p
= collstart
; p
< collend
; ++p
) {
4049 char buffer
[2+29+1+1];
4051 sprintf(buffer
, "&#%d;", (int)*p
);
4052 if (charmaptranslate_makespace(&res
, &str
,
4053 (str
-PyUnicode_AS_UNICODE(res
))+strlen(buffer
)+(endp
-collend
)))
4055 for (cp
= buffer
; *cp
; ++cp
)
4061 repunicode
= unicode_translate_call_errorhandler(errors
, &errorHandler
,
4062 reason
, startp
, size
, &exc
,
4063 collstart
-startp
, collend
-startp
, &newpos
);
4064 if (repunicode
== NULL
)
4066 /* generate replacement */
4067 repsize
= PyUnicode_GET_SIZE(repunicode
);
4068 if (charmaptranslate_makespace(&res
, &str
,
4069 (str
-PyUnicode_AS_UNICODE(res
))+repsize
+(endp
-collend
))) {
4070 Py_DECREF(repunicode
);
4073 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
)
4075 p
= startp
+ newpos
;
4076 Py_DECREF(repunicode
);
4080 /* Resize if we allocated too much */
4081 respos
= str
-PyUnicode_AS_UNICODE(res
);
4082 if (respos
<PyUnicode_GET_SIZE(res
)) {
4083 if (_PyUnicode_Resize(&res
, respos
) < 0)
4087 Py_XDECREF(errorHandler
);
4093 Py_XDECREF(errorHandler
);
4097 PyObject
*PyUnicode_Translate(PyObject
*str
,
4103 str
= PyUnicode_FromObject(str
);
4106 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
4107 PyUnicode_GET_SIZE(str
),
4118 /* --- Decimal Encoder ---------------------------------------------------- */
4120 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
4125 Py_UNICODE
*p
, *end
;
4126 PyObject
*errorHandler
= NULL
;
4127 PyObject
*exc
= NULL
;
4128 const char *encoding
= "decimal";
4129 const char *reason
= "invalid decimal Unicode string";
4130 /* the following variable is used for caching string comparisons
4131 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4132 int known_errorHandler
= -1;
4134 if (output
== NULL
) {
4135 PyErr_BadArgument();
4142 register Py_UNICODE ch
= *p
;
4144 PyObject
*repunicode
;
4148 Py_UNICODE
*collstart
;
4149 Py_UNICODE
*collend
;
4151 if (Py_UNICODE_ISSPACE(ch
)) {
4156 decimal
= Py_UNICODE_TODECIMAL(ch
);
4158 *output
++ = '0' + decimal
;
4162 if (0 < ch
&& ch
< 256) {
4163 *output
++ = (char)ch
;
4167 /* All other characters are considered unencodable */
4170 while (collend
< end
) {
4171 if ((0 < *collend
&& *collend
< 256) ||
4172 !Py_UNICODE_ISSPACE(*collend
) ||
4173 Py_UNICODE_TODECIMAL(*collend
))
4176 /* cache callback name lookup
4177 * (if not done yet, i.e. it's the first error) */
4178 if (known_errorHandler
==-1) {
4179 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4180 known_errorHandler
= 1;
4181 else if (!strcmp(errors
, "replace"))
4182 known_errorHandler
= 2;
4183 else if (!strcmp(errors
, "ignore"))
4184 known_errorHandler
= 3;
4185 else if (!strcmp(errors
, "xmlcharrefreplace"))
4186 known_errorHandler
= 4;
4188 known_errorHandler
= 0;
4190 switch (known_errorHandler
) {
4191 case 1: /* strict */
4192 raise_encode_exception(&exc
, encoding
, s
, length
, collstart
-s
, collend
-s
, reason
);
4194 case 2: /* replace */
4195 for (p
= collstart
; p
< collend
; ++p
)
4198 case 3: /* ignore */
4201 case 4: /* xmlcharrefreplace */
4202 /* generate replacement (temporarily (mis)uses p) */
4203 for (p
= collstart
; p
< collend
; ++p
)
4204 output
+= sprintf(output
, "&#%d;", (int)*p
);
4208 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
4209 encoding
, reason
, s
, length
, &exc
,
4210 collstart
-s
, collend
-s
, &newpos
);
4211 if (repunicode
== NULL
)
4213 /* generate replacement */
4214 repsize
= PyUnicode_GET_SIZE(repunicode
);
4215 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
4216 Py_UNICODE ch
= *uni2
;
4217 if (Py_UNICODE_ISSPACE(ch
))
4220 decimal
= Py_UNICODE_TODECIMAL(ch
);
4222 *output
++ = '0' + decimal
;
4223 else if (0 < ch
&& ch
< 256)
4224 *output
++ = (char)ch
;
4226 Py_DECREF(repunicode
);
4227 raise_encode_exception(&exc
, encoding
,
4228 s
, length
, collstart
-s
, collend
-s
, reason
);
4234 Py_DECREF(repunicode
);
4237 /* 0-terminate the output string */
4240 Py_XDECREF(errorHandler
);
4245 Py_XDECREF(errorHandler
);
4249 /* --- Helpers ------------------------------------------------------------ */
4251 #define STRINGLIB_CHAR Py_UNICODE
4253 #define STRINGLIB_LEN PyUnicode_GET_SIZE
4254 #define STRINGLIB_NEW PyUnicode_FromUnicode
4255 #define STRINGLIB_STR PyUnicode_AS_UNICODE
4257 Py_LOCAL_INLINE(int)
4258 STRINGLIB_CMP(const Py_UNICODE
* str
, const Py_UNICODE
* other
, Py_ssize_t len
)
4260 if (str
[0] != other
[0])
4262 return memcmp((void*) str
, (void*) other
, len
* sizeof(Py_UNICODE
));
4265 #define STRINGLIB_EMPTY unicode_empty
4267 #include "stringlib/fastsearch.h"
4269 #include "stringlib/count.h"
4270 #include "stringlib/find.h"
4271 #include "stringlib/partition.h"
4273 /* helper macro to fixup start/end slice values */
4274 #define FIX_START_END(obj) \
4276 start += (obj)->length; \
4279 if (end > (obj)->length) \
4280 end = (obj)->length; \
4282 end += (obj)->length; \
4286 Py_ssize_t
PyUnicode_Count(PyObject
*str
,
4292 PyUnicodeObject
* str_obj
;
4293 PyUnicodeObject
* sub_obj
;
4295 str_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(str
);
4298 sub_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(substr
);
4304 FIX_START_END(str_obj
);
4306 result
= stringlib_count(
4307 str_obj
->str
+ start
, end
- start
, sub_obj
->str
, sub_obj
->length
4316 Py_ssize_t
PyUnicode_Find(PyObject
*str
,
4324 str
= PyUnicode_FromObject(str
);
4327 sub
= PyUnicode_FromObject(sub
);
4334 result
= stringlib_find_slice(
4335 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
4336 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
4340 result
= stringlib_rfind_slice(
4341 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
4342 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
4353 int tailmatch(PyUnicodeObject
*self
,
4354 PyUnicodeObject
*substring
,
4359 if (substring
->length
== 0)
4362 FIX_START_END(self
);
4364 end
-= substring
->length
;
4368 if (direction
> 0) {
4369 if (Py_UNICODE_MATCH(self
, end
, substring
))
4372 if (Py_UNICODE_MATCH(self
, start
, substring
))
4379 Py_ssize_t
PyUnicode_Tailmatch(PyObject
*str
,
4387 str
= PyUnicode_FromObject(str
);
4390 substr
= PyUnicode_FromObject(substr
);
4391 if (substr
== NULL
) {
4396 result
= tailmatch((PyUnicodeObject
*)str
,
4397 (PyUnicodeObject
*)substr
,
4398 start
, end
, direction
);
4404 /* Apply fixfct filter to the Unicode object self and return a
4405 reference to the modified object */
4408 PyObject
*fixup(PyUnicodeObject
*self
,
4409 int (*fixfct
)(PyUnicodeObject
*s
))
4414 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
4418 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
4420 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
4421 /* fixfct should return TRUE if it modified the buffer. If
4422 FALSE, return a reference to the original buffer instead
4423 (to save space, not time) */
4426 return (PyObject
*) self
;
4428 return (PyObject
*) u
;
4432 int fixupper(PyUnicodeObject
*self
)
4434 Py_ssize_t len
= self
->length
;
4435 Py_UNICODE
*s
= self
->str
;
4439 register Py_UNICODE ch
;
4441 ch
= Py_UNICODE_TOUPPER(*s
);
4453 int fixlower(PyUnicodeObject
*self
)
4455 Py_ssize_t len
= self
->length
;
4456 Py_UNICODE
*s
= self
->str
;
4460 register Py_UNICODE ch
;
4462 ch
= Py_UNICODE_TOLOWER(*s
);
4474 int fixswapcase(PyUnicodeObject
*self
)
4476 Py_ssize_t len
= self
->length
;
4477 Py_UNICODE
*s
= self
->str
;
4481 if (Py_UNICODE_ISUPPER(*s
)) {
4482 *s
= Py_UNICODE_TOLOWER(*s
);
4484 } else if (Py_UNICODE_ISLOWER(*s
)) {
4485 *s
= Py_UNICODE_TOUPPER(*s
);
4495 int fixcapitalize(PyUnicodeObject
*self
)
4497 Py_ssize_t len
= self
->length
;
4498 Py_UNICODE
*s
= self
->str
;
4503 if (Py_UNICODE_ISLOWER(*s
)) {
4504 *s
= Py_UNICODE_TOUPPER(*s
);
4509 if (Py_UNICODE_ISUPPER(*s
)) {
4510 *s
= Py_UNICODE_TOLOWER(*s
);
4519 int fixtitle(PyUnicodeObject
*self
)
4521 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4522 register Py_UNICODE
*e
;
4523 int previous_is_cased
;
4525 /* Shortcut for single character strings */
4526 if (PyUnicode_GET_SIZE(self
) == 1) {
4527 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
4536 e
= p
+ PyUnicode_GET_SIZE(self
);
4537 previous_is_cased
= 0;
4538 for (; p
< e
; p
++) {
4539 register const Py_UNICODE ch
= *p
;
4541 if (previous_is_cased
)
4542 *p
= Py_UNICODE_TOLOWER(ch
);
4544 *p
= Py_UNICODE_TOTITLE(ch
);
4546 if (Py_UNICODE_ISLOWER(ch
) ||
4547 Py_UNICODE_ISUPPER(ch
) ||
4548 Py_UNICODE_ISTITLE(ch
))
4549 previous_is_cased
= 1;
4551 previous_is_cased
= 0;
4557 PyUnicode_Join(PyObject
*separator
, PyObject
*seq
)
4559 PyObject
*internal_separator
= NULL
;
4560 const Py_UNICODE blank
= ' ';
4561 const Py_UNICODE
*sep
= &blank
;
4562 Py_ssize_t seplen
= 1;
4563 PyUnicodeObject
*res
= NULL
; /* the result */
4564 Py_ssize_t res_alloc
= 100; /* # allocated bytes for string in res */
4565 Py_ssize_t res_used
; /* # used bytes */
4566 Py_UNICODE
*res_p
; /* pointer to free byte in res's string area */
4567 PyObject
*fseq
; /* PySequence_Fast(seq) */
4568 Py_ssize_t seqlen
; /* len(fseq) -- number of items in sequence */
4572 fseq
= PySequence_Fast(seq
, "");
4577 /* Grrrr. A codec may be invoked to convert str objects to
4578 * Unicode, and so it's possible to call back into Python code
4579 * during PyUnicode_FromObject(), and so it's possible for a sick
4580 * codec to change the size of fseq (if seq is a list). Therefore
4581 * we have to keep refetching the size -- can't assume seqlen
4584 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
4585 /* If empty sequence, return u"". */
4587 res
= _PyUnicode_New(0); /* empty sequence; return u"" */
4590 /* If singleton sequence with an exact Unicode, return that. */
4592 item
= PySequence_Fast_GET_ITEM(fseq
, 0);
4593 if (PyUnicode_CheckExact(item
)) {
4595 res
= (PyUnicodeObject
*)item
;
4600 /* At least two items to join, or one that isn't exact Unicode. */
4602 /* Set up sep and seplen -- they're needed. */
4603 if (separator
== NULL
) {
4608 internal_separator
= PyUnicode_FromObject(separator
);
4609 if (internal_separator
== NULL
)
4611 sep
= PyUnicode_AS_UNICODE(internal_separator
);
4612 seplen
= PyUnicode_GET_SIZE(internal_separator
);
4613 /* In case PyUnicode_FromObject() mutated seq. */
4614 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
4619 res
= _PyUnicode_New(res_alloc
);
4622 res_p
= PyUnicode_AS_UNICODE(res
);
4625 for (i
= 0; i
< seqlen
; ++i
) {
4627 Py_ssize_t new_res_used
;
4629 item
= PySequence_Fast_GET_ITEM(fseq
, i
);
4630 /* Convert item to Unicode. */
4631 if (! PyUnicode_Check(item
) && ! PyString_Check(item
)) {
4632 PyErr_Format(PyExc_TypeError
,
4633 "sequence item %zd: expected string or Unicode,"
4635 i
, item
->ob_type
->tp_name
);
4638 item
= PyUnicode_FromObject(item
);
4641 /* We own a reference to item from here on. */
4643 /* In case PyUnicode_FromObject() mutated seq. */
4644 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
4646 /* Make sure we have enough space for the separator and the item. */
4647 itemlen
= PyUnicode_GET_SIZE(item
);
4648 new_res_used
= res_used
+ itemlen
;
4649 if (new_res_used
< 0)
4651 if (i
< seqlen
- 1) {
4652 new_res_used
+= seplen
;
4653 if (new_res_used
< 0)
4656 if (new_res_used
> res_alloc
) {
4657 /* double allocated size until it's big enough */
4659 res_alloc
+= res_alloc
;
4662 } while (new_res_used
> res_alloc
);
4663 if (_PyUnicode_Resize(&res
, res_alloc
) < 0) {
4667 res_p
= PyUnicode_AS_UNICODE(res
) + res_used
;
4670 /* Copy item, and maybe the separator. */
4671 Py_UNICODE_COPY(res_p
, PyUnicode_AS_UNICODE(item
), itemlen
);
4673 if (i
< seqlen
- 1) {
4674 Py_UNICODE_COPY(res_p
, sep
, seplen
);
4678 res_used
= new_res_used
;
4681 /* Shrink res to match the used area; this probably can't fail,
4682 * but it's cheap to check.
4684 if (_PyUnicode_Resize(&res
, res_used
) < 0)
4688 Py_XDECREF(internal_separator
);
4690 return (PyObject
*)res
;
4693 PyErr_SetString(PyExc_OverflowError
,
4694 "join() result is too long for a Python string");
4699 Py_XDECREF(internal_separator
);
4706 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
4718 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
4723 u
= _PyUnicode_New(left
+ self
->length
+ right
);
4726 Py_UNICODE_FILL(u
->str
, fill
, left
);
4727 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
4729 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
4735 #define SPLIT_APPEND(data, left, right) \
4736 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
4739 if (PyList_Append(list, str)) { \
4747 PyObject
*split_whitespace(PyUnicodeObject
*self
,
4749 Py_ssize_t maxcount
)
4751 register Py_ssize_t i
;
4752 register Py_ssize_t j
;
4753 Py_ssize_t len
= self
->length
;
4756 for (i
= j
= 0; i
< len
; ) {
4758 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
4761 while (i
< len
&& !Py_UNICODE_ISSPACE(self
->str
[i
]))
4764 if (maxcount
-- <= 0)
4766 SPLIT_APPEND(self
->str
, j
, i
);
4767 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
4773 SPLIT_APPEND(self
->str
, j
, len
);
4782 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
4785 register Py_ssize_t i
;
4786 register Py_ssize_t j
;
4792 string
= PyUnicode_FromObject(string
);
4795 data
= PyUnicode_AS_UNICODE(string
);
4796 len
= PyUnicode_GET_SIZE(string
);
4798 list
= PyList_New(0);
4802 for (i
= j
= 0; i
< len
; ) {
4805 /* Find a line and append it */
4806 while (i
< len
&& !BLOOM_LINEBREAK(data
[i
]))
4809 /* Skip the line break reading CRLF as one line break */
4812 if (data
[i
] == '\r' && i
+ 1 < len
&&
4820 SPLIT_APPEND(data
, j
, eol
);
4824 SPLIT_APPEND(data
, j
, len
);
4837 PyObject
*split_char(PyUnicodeObject
*self
,
4840 Py_ssize_t maxcount
)
4842 register Py_ssize_t i
;
4843 register Py_ssize_t j
;
4844 Py_ssize_t len
= self
->length
;
4847 for (i
= j
= 0; i
< len
; ) {
4848 if (self
->str
[i
] == ch
) {
4849 if (maxcount
-- <= 0)
4851 SPLIT_APPEND(self
->str
, j
, i
);
4857 SPLIT_APPEND(self
->str
, j
, len
);
4867 PyObject
*split_substring(PyUnicodeObject
*self
,
4869 PyUnicodeObject
*substring
,
4870 Py_ssize_t maxcount
)
4872 register Py_ssize_t i
;
4873 register Py_ssize_t j
;
4874 Py_ssize_t len
= self
->length
;
4875 Py_ssize_t sublen
= substring
->length
;
4878 for (i
= j
= 0; i
<= len
- sublen
; ) {
4879 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
4880 if (maxcount
-- <= 0)
4882 SPLIT_APPEND(self
->str
, j
, i
);
4888 SPLIT_APPEND(self
->str
, j
, len
);
4898 PyObject
*rsplit_whitespace(PyUnicodeObject
*self
,
4900 Py_ssize_t maxcount
)
4902 register Py_ssize_t i
;
4903 register Py_ssize_t j
;
4904 Py_ssize_t len
= self
->length
;
4907 for (i
= j
= len
- 1; i
>= 0; ) {
4909 while (i
>= 0 && Py_UNICODE_ISSPACE(self
->str
[i
]))
4912 while (i
>= 0 && !Py_UNICODE_ISSPACE(self
->str
[i
]))
4915 if (maxcount
-- <= 0)
4917 SPLIT_APPEND(self
->str
, i
+ 1, j
+ 1);
4918 while (i
>= 0 && Py_UNICODE_ISSPACE(self
->str
[i
]))
4924 SPLIT_APPEND(self
->str
, 0, j
+ 1);
4926 if (PyList_Reverse(list
) < 0)
4936 PyObject
*rsplit_char(PyUnicodeObject
*self
,
4939 Py_ssize_t maxcount
)
4941 register Py_ssize_t i
;
4942 register Py_ssize_t j
;
4943 Py_ssize_t len
= self
->length
;
4946 for (i
= j
= len
- 1; i
>= 0; ) {
4947 if (self
->str
[i
] == ch
) {
4948 if (maxcount
-- <= 0)
4950 SPLIT_APPEND(self
->str
, i
+ 1, j
+ 1);
4956 SPLIT_APPEND(self
->str
, 0, j
+ 1);
4958 if (PyList_Reverse(list
) < 0)
4968 PyObject
*rsplit_substring(PyUnicodeObject
*self
,
4970 PyUnicodeObject
*substring
,
4971 Py_ssize_t maxcount
)
4973 register Py_ssize_t i
;
4974 register Py_ssize_t j
;
4975 Py_ssize_t len
= self
->length
;
4976 Py_ssize_t sublen
= substring
->length
;
4979 for (i
= len
- sublen
, j
= len
; i
>= 0; ) {
4980 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
4981 if (maxcount
-- <= 0)
4983 SPLIT_APPEND(self
->str
, i
+ sublen
, j
);
4990 SPLIT_APPEND(self
->str
, 0, j
);
4992 if (PyList_Reverse(list
) < 0)
5004 PyObject
*split(PyUnicodeObject
*self
,
5005 PyUnicodeObject
*substring
,
5006 Py_ssize_t maxcount
)
5011 maxcount
= PY_SSIZE_T_MAX
;
5013 list
= PyList_New(0);
5017 if (substring
== NULL
)
5018 return split_whitespace(self
,list
,maxcount
);
5020 else if (substring
->length
== 1)
5021 return split_char(self
,list
,substring
->str
[0],maxcount
);
5023 else if (substring
->length
== 0) {
5025 PyErr_SetString(PyExc_ValueError
, "empty separator");
5029 return split_substring(self
,list
,substring
,maxcount
);
5033 PyObject
*rsplit(PyUnicodeObject
*self
,
5034 PyUnicodeObject
*substring
,
5035 Py_ssize_t maxcount
)
5040 maxcount
= PY_SSIZE_T_MAX
;
5042 list
= PyList_New(0);
5046 if (substring
== NULL
)
5047 return rsplit_whitespace(self
,list
,maxcount
);
5049 else if (substring
->length
== 1)
5050 return rsplit_char(self
,list
,substring
->str
[0],maxcount
);
5052 else if (substring
->length
== 0) {
5054 PyErr_SetString(PyExc_ValueError
, "empty separator");
5058 return rsplit_substring(self
,list
,substring
,maxcount
);
5062 PyObject
*replace(PyUnicodeObject
*self
,
5063 PyUnicodeObject
*str1
,
5064 PyUnicodeObject
*str2
,
5065 Py_ssize_t maxcount
)
5070 maxcount
= PY_SSIZE_T_MAX
;
5072 if (str1
->length
== str2
->length
) {
5075 if (str1
->length
== 1) {
5076 /* replace characters */
5078 if (!findchar(self
->str
, self
->length
, str1
->str
[0]))
5080 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5083 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5086 for (i
= 0; i
< u
->length
; i
++)
5087 if (u
->str
[i
] == u1
) {
5094 self
->str
, self
->length
, str1
->str
, str1
->length
, FAST_SEARCH
5098 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5101 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5102 while (i
<= self
->length
- str1
->length
)
5103 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
5106 Py_UNICODE_COPY(u
->str
+i
, str2
->str
, str2
->length
);
5113 Py_ssize_t n
, i
, j
, e
;
5114 Py_ssize_t product
, new_size
, delta
;
5117 /* replace strings */
5118 n
= stringlib_count(self
->str
, self
->length
, str1
->str
, str1
->length
);
5123 /* new_size = self->length + n * (str2->length - str1->length)); */
5124 delta
= (str2
->length
- str1
->length
);
5126 new_size
= self
->length
;
5128 product
= n
* (str2
->length
- str1
->length
);
5129 if ((product
/ (str2
->length
- str1
->length
)) != n
) {
5130 PyErr_SetString(PyExc_OverflowError
,
5131 "replace string is too long");
5134 new_size
= self
->length
+ product
;
5136 PyErr_SetString(PyExc_OverflowError
,
5137 "replace string is too long");
5141 u
= _PyUnicode_New(new_size
);
5146 e
= self
->length
- str1
->length
;
5147 if (str1
->length
> 0) {
5149 /* look for next match */
5152 if (Py_UNICODE_MATCH(self
, j
, str1
))
5159 /* copy unchanged part [i:j] */
5160 Py_UNICODE_COPY(p
, self
->str
+i
, j
-i
);
5163 /* copy substitution string */
5164 if (str2
->length
> 0) {
5165 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
5168 i
= j
+ str1
->length
;
5170 if (i
< self
->length
)
5171 /* copy tail [i:] */
5172 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
5176 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
5180 *p
++ = self
->str
[i
++];
5182 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
5185 return (PyObject
*) u
;
5188 /* nothing to replace; return original string (when possible) */
5189 if (PyUnicode_CheckExact(self
)) {
5191 return (PyObject
*) self
;
5193 return PyUnicode_FromUnicode(self
->str
, self
->length
);
5196 /* --- Unicode Object Methods --------------------------------------------- */
5198 PyDoc_STRVAR(title__doc__
,
5199 "S.title() -> unicode\n\
5201 Return a titlecased version of S, i.e. words start with title case\n\
5202 characters, all remaining cased characters have lower case.");
5205 unicode_title(PyUnicodeObject
*self
)
5207 return fixup(self
, fixtitle
);
5210 PyDoc_STRVAR(capitalize__doc__
,
5211 "S.capitalize() -> unicode\n\
5213 Return a capitalized version of S, i.e. make the first character\n\
5217 unicode_capitalize(PyUnicodeObject
*self
)
5219 return fixup(self
, fixcapitalize
);
5223 PyDoc_STRVAR(capwords__doc__
,
5224 "S.capwords() -> unicode\n\
5226 Apply .capitalize() to all words in S and return the result with\n\
5227 normalized whitespace (all whitespace strings are replaced by ' ').");
5230 unicode_capwords(PyUnicodeObject
*self
)
5236 /* Split into words */
5237 list
= split(self
, NULL
, -1);
5241 /* Capitalize each word */
5242 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
5243 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
5247 Py_DECREF(PyList_GET_ITEM(list
, i
));
5248 PyList_SET_ITEM(list
, i
, item
);
5251 /* Join the words to form a new string */
5252 item
= PyUnicode_Join(NULL
, list
);
5256 return (PyObject
*)item
;
5260 /* Argument converter. Coerces to a single unicode character */
5263 convert_uc(PyObject
*obj
, void *addr
)
5265 Py_UNICODE
*fillcharloc
= (Py_UNICODE
*)addr
;
5269 uniobj
= PyUnicode_FromObject(obj
);
5270 if (uniobj
== NULL
) {
5271 PyErr_SetString(PyExc_TypeError
,
5272 "The fill character cannot be converted to Unicode");
5275 if (PyUnicode_GET_SIZE(uniobj
) != 1) {
5276 PyErr_SetString(PyExc_TypeError
,
5277 "The fill character must be exactly one character long");
5281 unistr
= PyUnicode_AS_UNICODE(uniobj
);
5282 *fillcharloc
= unistr
[0];
5287 PyDoc_STRVAR(center__doc__
,
5288 "S.center(width[, fillchar]) -> unicode\n\
5290 Return S centered in a Unicode string of length width. Padding is\n\
5291 done using the specified fill character (default is a space)");
5294 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
5296 Py_ssize_t marg
, left
;
5298 Py_UNICODE fillchar
= ' ';
5300 if (!PyArg_ParseTuple(args
, "n|O&:center", &width
, convert_uc
, &fillchar
))
5303 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
5305 return (PyObject
*) self
;
5308 marg
= width
- self
->length
;
5309 left
= marg
/ 2 + (marg
& width
& 1);
5311 return (PyObject
*) pad(self
, left
, marg
- left
, fillchar
);
5316 /* This code should go into some future Unicode collation support
5317 module. The basic comparison should compare ordinals on a naive
5318 basis (this is what Java does and thus JPython too). */
5320 /* speedy UTF-16 code point order comparison */
5322 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5324 static short utf16Fixup
[32] =
5326 0, 0, 0, 0, 0, 0, 0, 0,
5327 0, 0, 0, 0, 0, 0, 0, 0,
5328 0, 0, 0, 0, 0, 0, 0, 0,
5329 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
5333 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
5335 Py_ssize_t len1
, len2
;
5337 Py_UNICODE
*s1
= str1
->str
;
5338 Py_UNICODE
*s2
= str2
->str
;
5340 len1
= str1
->length
;
5341 len2
= str2
->length
;
5343 while (len1
> 0 && len2
> 0) {
5349 if (c1
> (1<<11) * 26)
5350 c1
+= utf16Fixup
[c1
>>11];
5351 if (c2
> (1<<11) * 26)
5352 c2
+= utf16Fixup
[c2
>>11];
5353 /* now c1 and c2 are in UTF-32-compatible order */
5356 return (c1
< c2
) ? -1 : 1;
5361 return (len1
< len2
) ? -1 : (len1
!= len2
);
5367 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
5369 register Py_ssize_t len1
, len2
;
5371 Py_UNICODE
*s1
= str1
->str
;
5372 Py_UNICODE
*s2
= str2
->str
;
5374 len1
= str1
->length
;
5375 len2
= str2
->length
;
5377 while (len1
> 0 && len2
> 0) {
5384 return (c1
< c2
) ? -1 : 1;
5389 return (len1
< len2
) ? -1 : (len1
!= len2
);
5394 int PyUnicode_Compare(PyObject
*left
,
5397 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
5400 /* Coerce the two arguments */
5401 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
5404 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
5408 /* Shortcut for empty or interned objects */
5415 result
= unicode_compare(u
, v
);
5427 PyObject
*PyUnicode_RichCompare(PyObject
*left
,
5433 result
= PyUnicode_Compare(left
, right
);
5434 if (result
== -1 && PyErr_Occurred())
5437 /* Convert the return value to a Boolean */
5440 result
= (result
== 0);
5443 result
= (result
!= 0);
5446 result
= (result
<= 0);
5449 result
= (result
>= 0);
5452 result
= (result
== -1);
5455 result
= (result
== 1);
5458 return PyBool_FromLong(result
);
5464 Type errors mean that PyUnicode_FromObject() could not convert
5465 one of the arguments (usually the right hand side) to Unicode,
5466 ie. we can't handle the comparison request. However, it is
5467 possible that the other object knows a comparison method, which
5468 is why we return Py_NotImplemented to give the other object a
5472 if (PyErr_ExceptionMatches(PyExc_TypeError
)) {
5474 Py_INCREF(Py_NotImplemented
);
5475 return Py_NotImplemented
;
5477 if (op
!= Py_EQ
&& op
!= Py_NE
)
5480 /* Equality comparison.
5482 This is a special case: we silence any PyExc_UnicodeDecodeError
5483 and instead turn it into a PyErr_UnicodeWarning.
5486 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError
))
5489 if (PyErr_Warn(PyExc_UnicodeWarning
,
5491 "Unicode equal comparison "
5492 "failed to convert both arguments to Unicode - "
5493 "interpreting them as being unequal" :
5494 "Unicode unequal comparison "
5495 "failed to convert both arguments to Unicode - "
5496 "interpreting them as being unequal"
5499 result
= (op
== Py_NE
);
5500 return PyBool_FromLong(result
);
5503 int PyUnicode_Contains(PyObject
*container
,
5506 PyObject
*str
, *sub
;
5509 /* Coerce the two arguments */
5510 sub
= PyUnicode_FromObject(element
);
5512 PyErr_SetString(PyExc_TypeError
,
5513 "'in <string>' requires string as left operand");
5517 str
= PyUnicode_FromObject(container
);
5523 result
= stringlib_contains_obj(str
, sub
);
5531 /* Concat to string or Unicode object giving a new Unicode object. */
5533 PyObject
*PyUnicode_Concat(PyObject
*left
,
5536 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
5538 /* Coerce the two arguments */
5539 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
5542 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
5547 if (v
== unicode_empty
) {
5549 return (PyObject
*)u
;
5551 if (u
== unicode_empty
) {
5553 return (PyObject
*)v
;
5556 /* Concat the two Unicode strings */
5557 w
= _PyUnicode_New(u
->length
+ v
->length
);
5560 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
5561 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
5565 return (PyObject
*)w
;
5573 PyDoc_STRVAR(count__doc__
,
5574 "S.count(sub[, start[, end]]) -> int\n\
5576 Return the number of non-overlapping occurrences of substring sub in\n\
5577 Unicode string S[start:end]. Optional arguments start and end are\n\
5578 interpreted as in slice notation.");
5581 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
5583 PyUnicodeObject
*substring
;
5584 Py_ssize_t start
= 0;
5585 Py_ssize_t end
= PY_SSIZE_T_MAX
;
5588 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
5589 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
5592 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
5593 (PyObject
*)substring
);
5594 if (substring
== NULL
)
5597 FIX_START_END(self
);
5599 result
= PyInt_FromSsize_t(
5600 stringlib_count(self
->str
+ start
, end
- start
,
5601 substring
->str
, substring
->length
)
5604 Py_DECREF(substring
);
5609 PyDoc_STRVAR(encode__doc__
,
5610 "S.encode([encoding[,errors]]) -> string or unicode\n\
5612 Encodes S using the codec registered for encoding. encoding defaults\n\
5613 to the default encoding. errors may be given to set a different error\n\
5614 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5615 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5616 'xmlcharrefreplace' as well as any other name registered with\n\
5617 codecs.register_error that can handle UnicodeEncodeErrors.");
5620 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
5622 char *encoding
= NULL
;
5623 char *errors
= NULL
;
5626 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
5628 v
= PyUnicode_AsEncodedObject((PyObject
*)self
, encoding
, errors
);
5631 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
5632 PyErr_Format(PyExc_TypeError
,
5633 "encoder did not return a string/unicode object "
5635 v
->ob_type
->tp_name
);
5645 PyDoc_STRVAR(decode__doc__
,
5646 "S.decode([encoding[,errors]]) -> string or unicode\n\
5648 Decodes S using the codec registered for encoding. encoding defaults\n\
5649 to the default encoding. errors may be given to set a different error\n\
5650 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5651 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5652 as well as any other name registerd with codecs.register_error that is\n\
5653 able to handle UnicodeDecodeErrors.");
5656 unicode_decode(PyUnicodeObject
*self
, PyObject
*args
)
5658 char *encoding
= NULL
;
5659 char *errors
= NULL
;
5662 if (!PyArg_ParseTuple(args
, "|ss:decode", &encoding
, &errors
))
5664 v
= PyUnicode_AsDecodedObject((PyObject
*)self
, encoding
, errors
);
5667 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
5668 PyErr_Format(PyExc_TypeError
,
5669 "decoder did not return a string/unicode object "
5671 v
->ob_type
->tp_name
);
5681 PyDoc_STRVAR(expandtabs__doc__
,
5682 "S.expandtabs([tabsize]) -> unicode\n\
5684 Return a copy of S where all tab characters are expanded using spaces.\n\
5685 If tabsize is not given, a tab size of 8 characters is assumed.");
5688 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
5697 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
5700 /* First pass: determine size of output string */
5702 e
= self
->str
+ self
->length
;
5703 for (p
= self
->str
; p
< e
; p
++)
5706 j
+= tabsize
- (j
% tabsize
);
5710 if (*p
== '\n' || *p
== '\r') {
5716 /* Second pass: create output string and fill it */
5717 u
= _PyUnicode_New(i
+ j
);
5724 for (p
= self
->str
; p
< e
; p
++)
5727 i
= tabsize
- (j
% tabsize
);
5736 if (*p
== '\n' || *p
== '\r')
5740 return (PyObject
*) u
;
5743 PyDoc_STRVAR(find__doc__
,
5744 "S.find(sub [,start [,end]]) -> int\n\
5746 Return the lowest index in S where substring sub is found,\n\
5747 such that sub is contained within s[start,end]. Optional\n\
5748 arguments start and end are interpreted as in slice notation.\n\
5750 Return -1 on failure.");
5753 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
5755 PyObject
*substring
;
5756 Py_ssize_t start
= 0;
5757 Py_ssize_t end
= PY_SSIZE_T_MAX
;
5760 if (!PyArg_ParseTuple(args
, "O|O&O&:find", &substring
,
5761 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
5763 substring
= PyUnicode_FromObject(substring
);
5767 result
= stringlib_find_slice(
5768 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
5769 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
5773 Py_DECREF(substring
);
5775 return PyInt_FromSsize_t(result
);
5779 unicode_getitem(PyUnicodeObject
*self
, Py_ssize_t index
)
5781 if (index
< 0 || index
>= self
->length
) {
5782 PyErr_SetString(PyExc_IndexError
, "string index out of range");
5786 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
5790 unicode_hash(PyUnicodeObject
*self
)
5792 /* Since Unicode objects compare equal to their ASCII string
5793 counterparts, they should use the individual character values
5794 as basis for their hash value. This is needed to assure that
5795 strings and Unicode objects behave in the same way as
5798 register Py_ssize_t len
;
5799 register Py_UNICODE
*p
;
5802 if (self
->hash
!= -1)
5804 len
= PyUnicode_GET_SIZE(self
);
5805 p
= PyUnicode_AS_UNICODE(self
);
5808 x
= (1000003*x
) ^ *p
++;
5809 x
^= PyUnicode_GET_SIZE(self
);
5816 PyDoc_STRVAR(index__doc__
,
5817 "S.index(sub [,start [,end]]) -> int\n\
5819 Like S.find() but raise ValueError when the substring is not found.");
5822 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
5825 PyObject
*substring
;
5826 Py_ssize_t start
= 0;
5827 Py_ssize_t end
= PY_SSIZE_T_MAX
;
5829 if (!PyArg_ParseTuple(args
, "O|O&O&:index", &substring
,
5830 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
5832 substring
= PyUnicode_FromObject(substring
);
5836 result
= stringlib_find_slice(
5837 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
5838 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
5842 Py_DECREF(substring
);
5845 PyErr_SetString(PyExc_ValueError
, "substring not found");
5849 return PyInt_FromSsize_t(result
);
5852 PyDoc_STRVAR(islower__doc__
,
5853 "S.islower() -> bool\n\
5855 Return True if all cased characters in S are lowercase and there is\n\
5856 at least one cased character in S, False otherwise.");
5859 unicode_islower(PyUnicodeObject
*self
)
5861 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5862 register const Py_UNICODE
*e
;
5865 /* Shortcut for single character strings */
5866 if (PyUnicode_GET_SIZE(self
) == 1)
5867 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p
));
5869 /* Special case for empty strings */
5870 if (PyUnicode_GET_SIZE(self
) == 0)
5871 return PyBool_FromLong(0);
5873 e
= p
+ PyUnicode_GET_SIZE(self
);
5875 for (; p
< e
; p
++) {
5876 register const Py_UNICODE ch
= *p
;
5878 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
5879 return PyBool_FromLong(0);
5880 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
5883 return PyBool_FromLong(cased
);
5886 PyDoc_STRVAR(isupper__doc__
,
5887 "S.isupper() -> bool\n\
5889 Return True if all cased characters in S are uppercase and there is\n\
5890 at least one cased character in S, False otherwise.");
5893 unicode_isupper(PyUnicodeObject
*self
)
5895 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5896 register const Py_UNICODE
*e
;
5899 /* Shortcut for single character strings */
5900 if (PyUnicode_GET_SIZE(self
) == 1)
5901 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
5903 /* Special case for empty strings */
5904 if (PyUnicode_GET_SIZE(self
) == 0)
5905 return PyBool_FromLong(0);
5907 e
= p
+ PyUnicode_GET_SIZE(self
);
5909 for (; p
< e
; p
++) {
5910 register const Py_UNICODE ch
= *p
;
5912 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
5913 return PyBool_FromLong(0);
5914 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
5917 return PyBool_FromLong(cased
);
5920 PyDoc_STRVAR(istitle__doc__
,
5921 "S.istitle() -> bool\n\
5923 Return True if S is a titlecased string and there is at least one\n\
5924 character in S, i.e. upper- and titlecase characters may only\n\
5925 follow uncased characters and lowercase characters only cased ones.\n\
5926 Return False otherwise.");
5929 unicode_istitle(PyUnicodeObject
*self
)
5931 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5932 register const Py_UNICODE
*e
;
5933 int cased
, previous_is_cased
;
5935 /* Shortcut for single character strings */
5936 if (PyUnicode_GET_SIZE(self
) == 1)
5937 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
5938 (Py_UNICODE_ISUPPER(*p
) != 0));
5940 /* Special case for empty strings */
5941 if (PyUnicode_GET_SIZE(self
) == 0)
5942 return PyBool_FromLong(0);
5944 e
= p
+ PyUnicode_GET_SIZE(self
);
5946 previous_is_cased
= 0;
5947 for (; p
< e
; p
++) {
5948 register const Py_UNICODE ch
= *p
;
5950 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
5951 if (previous_is_cased
)
5952 return PyBool_FromLong(0);
5953 previous_is_cased
= 1;
5956 else if (Py_UNICODE_ISLOWER(ch
)) {
5957 if (!previous_is_cased
)
5958 return PyBool_FromLong(0);
5959 previous_is_cased
= 1;
5963 previous_is_cased
= 0;
5965 return PyBool_FromLong(cased
);
5968 PyDoc_STRVAR(isspace__doc__
,
5969 "S.isspace() -> bool\n\
5971 Return True if all characters in S are whitespace\n\
5972 and there is at least one character in S, False otherwise.");
5975 unicode_isspace(PyUnicodeObject
*self
)
5977 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5978 register const Py_UNICODE
*e
;
5980 /* Shortcut for single character strings */
5981 if (PyUnicode_GET_SIZE(self
) == 1 &&
5982 Py_UNICODE_ISSPACE(*p
))
5983 return PyBool_FromLong(1);
5985 /* Special case for empty strings */
5986 if (PyUnicode_GET_SIZE(self
) == 0)
5987 return PyBool_FromLong(0);
5989 e
= p
+ PyUnicode_GET_SIZE(self
);
5990 for (; p
< e
; p
++) {
5991 if (!Py_UNICODE_ISSPACE(*p
))
5992 return PyBool_FromLong(0);
5994 return PyBool_FromLong(1);
5997 PyDoc_STRVAR(isalpha__doc__
,
5998 "S.isalpha() -> bool\n\
6000 Return True if all characters in S are alphabetic\n\
6001 and there is at least one character in S, False otherwise.");
6004 unicode_isalpha(PyUnicodeObject
*self
)
6006 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6007 register const Py_UNICODE
*e
;
6009 /* Shortcut for single character strings */
6010 if (PyUnicode_GET_SIZE(self
) == 1 &&
6011 Py_UNICODE_ISALPHA(*p
))
6012 return PyBool_FromLong(1);
6014 /* Special case for empty strings */
6015 if (PyUnicode_GET_SIZE(self
) == 0)
6016 return PyBool_FromLong(0);
6018 e
= p
+ PyUnicode_GET_SIZE(self
);
6019 for (; p
< e
; p
++) {
6020 if (!Py_UNICODE_ISALPHA(*p
))
6021 return PyBool_FromLong(0);
6023 return PyBool_FromLong(1);
6026 PyDoc_STRVAR(isalnum__doc__
,
6027 "S.isalnum() -> bool\n\
6029 Return True if all characters in S are alphanumeric\n\
6030 and there is at least one character in S, False otherwise.");
6033 unicode_isalnum(PyUnicodeObject
*self
)
6035 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6036 register const Py_UNICODE
*e
;
6038 /* Shortcut for single character strings */
6039 if (PyUnicode_GET_SIZE(self
) == 1 &&
6040 Py_UNICODE_ISALNUM(*p
))
6041 return PyBool_FromLong(1);
6043 /* Special case for empty strings */
6044 if (PyUnicode_GET_SIZE(self
) == 0)
6045 return PyBool_FromLong(0);
6047 e
= p
+ PyUnicode_GET_SIZE(self
);
6048 for (; p
< e
; p
++) {
6049 if (!Py_UNICODE_ISALNUM(*p
))
6050 return PyBool_FromLong(0);
6052 return PyBool_FromLong(1);
6055 PyDoc_STRVAR(isdecimal__doc__
,
6056 "S.isdecimal() -> bool\n\
6058 Return True if there are only decimal characters in S,\n\
6062 unicode_isdecimal(PyUnicodeObject
*self
)
6064 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6065 register const Py_UNICODE
*e
;
6067 /* Shortcut for single character strings */
6068 if (PyUnicode_GET_SIZE(self
) == 1 &&
6069 Py_UNICODE_ISDECIMAL(*p
))
6070 return PyBool_FromLong(1);
6072 /* Special case for empty strings */
6073 if (PyUnicode_GET_SIZE(self
) == 0)
6074 return PyBool_FromLong(0);
6076 e
= p
+ PyUnicode_GET_SIZE(self
);
6077 for (; p
< e
; p
++) {
6078 if (!Py_UNICODE_ISDECIMAL(*p
))
6079 return PyBool_FromLong(0);
6081 return PyBool_FromLong(1);
6084 PyDoc_STRVAR(isdigit__doc__
,
6085 "S.isdigit() -> bool\n\
6087 Return True if all characters in S are digits\n\
6088 and there is at least one character in S, False otherwise.");
6091 unicode_isdigit(PyUnicodeObject
*self
)
6093 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6094 register const Py_UNICODE
*e
;
6096 /* Shortcut for single character strings */
6097 if (PyUnicode_GET_SIZE(self
) == 1 &&
6098 Py_UNICODE_ISDIGIT(*p
))
6099 return PyBool_FromLong(1);
6101 /* Special case for empty strings */
6102 if (PyUnicode_GET_SIZE(self
) == 0)
6103 return PyBool_FromLong(0);
6105 e
= p
+ PyUnicode_GET_SIZE(self
);
6106 for (; p
< e
; p
++) {
6107 if (!Py_UNICODE_ISDIGIT(*p
))
6108 return PyBool_FromLong(0);
6110 return PyBool_FromLong(1);
6113 PyDoc_STRVAR(isnumeric__doc__
,
6114 "S.isnumeric() -> bool\n\
6116 Return True if there are only numeric characters in S,\n\
6120 unicode_isnumeric(PyUnicodeObject
*self
)
6122 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6123 register const Py_UNICODE
*e
;
6125 /* Shortcut for single character strings */
6126 if (PyUnicode_GET_SIZE(self
) == 1 &&
6127 Py_UNICODE_ISNUMERIC(*p
))
6128 return PyBool_FromLong(1);
6130 /* Special case for empty strings */
6131 if (PyUnicode_GET_SIZE(self
) == 0)
6132 return PyBool_FromLong(0);
6134 e
= p
+ PyUnicode_GET_SIZE(self
);
6135 for (; p
< e
; p
++) {
6136 if (!Py_UNICODE_ISNUMERIC(*p
))
6137 return PyBool_FromLong(0);
6139 return PyBool_FromLong(1);
6142 PyDoc_STRVAR(join__doc__
,
6143 "S.join(sequence) -> unicode\n\
6145 Return a string which is the concatenation of the strings in the\n\
6146 sequence. The separator between elements is S.");
6149 unicode_join(PyObject
*self
, PyObject
*data
)
6151 return PyUnicode_Join(self
, data
);
6155 unicode_length(PyUnicodeObject
*self
)
6157 return self
->length
;
6160 PyDoc_STRVAR(ljust__doc__
,
6161 "S.ljust(width[, fillchar]) -> int\n\
6163 Return S left justified in a Unicode string of length width. Padding is\n\
6164 done using the specified fill character (default is a space).");
6167 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
6170 Py_UNICODE fillchar
= ' ';
6172 if (!PyArg_ParseTuple(args
, "n|O&:ljust", &width
, convert_uc
, &fillchar
))
6175 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6177 return (PyObject
*) self
;
6180 return (PyObject
*) pad(self
, 0, width
- self
->length
, fillchar
);
6183 PyDoc_STRVAR(lower__doc__
,
6184 "S.lower() -> unicode\n\
6186 Return a copy of the string S converted to lowercase.");
6189 unicode_lower(PyUnicodeObject
*self
)
6191 return fixup(self
, fixlower
);
6195 #define RIGHTSTRIP 1
6198 /* Arrays indexed by above */
6199 static const char *stripformat
[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
6201 #define STRIPNAME(i) (stripformat[i]+3)
6203 /* externally visible for str.strip(unicode) */
6205 _PyUnicode_XStrip(PyUnicodeObject
*self
, int striptype
, PyObject
*sepobj
)
6207 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
6208 Py_ssize_t len
= PyUnicode_GET_SIZE(self
);
6209 Py_UNICODE
*sep
= PyUnicode_AS_UNICODE(sepobj
);
6210 Py_ssize_t seplen
= PyUnicode_GET_SIZE(sepobj
);
6213 BLOOM_MASK sepmask
= make_bloom_mask(sep
, seplen
);
6216 if (striptype
!= RIGHTSTRIP
) {
6217 while (i
< len
&& BLOOM_MEMBER(sepmask
, s
[i
], sep
, seplen
)) {
6223 if (striptype
!= LEFTSTRIP
) {
6226 } while (j
>= i
&& BLOOM_MEMBER(sepmask
, s
[j
], sep
, seplen
));
6230 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
6232 return (PyObject
*)self
;
6235 return PyUnicode_FromUnicode(s
+i
, j
-i
);
6240 do_strip(PyUnicodeObject
*self
, int striptype
)
6242 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
6243 Py_ssize_t len
= PyUnicode_GET_SIZE(self
), i
, j
;
6246 if (striptype
!= RIGHTSTRIP
) {
6247 while (i
< len
&& Py_UNICODE_ISSPACE(s
[i
])) {
6253 if (striptype
!= LEFTSTRIP
) {
6256 } while (j
>= i
&& Py_UNICODE_ISSPACE(s
[j
]));
6260 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
6262 return (PyObject
*)self
;
6265 return PyUnicode_FromUnicode(s
+i
, j
-i
);
6270 do_argstrip(PyUnicodeObject
*self
, int striptype
, PyObject
*args
)
6272 PyObject
*sep
= NULL
;
6274 if (!PyArg_ParseTuple(args
, (char *)stripformat
[striptype
], &sep
))
6277 if (sep
!= NULL
&& sep
!= Py_None
) {
6278 if (PyUnicode_Check(sep
))
6279 return _PyUnicode_XStrip(self
, striptype
, sep
);
6280 else if (PyString_Check(sep
)) {
6282 sep
= PyUnicode_FromObject(sep
);
6285 res
= _PyUnicode_XStrip(self
, striptype
, sep
);
6290 PyErr_Format(PyExc_TypeError
,
6291 "%s arg must be None, unicode or str",
6292 STRIPNAME(striptype
));
6297 return do_strip(self
, striptype
);
6301 PyDoc_STRVAR(strip__doc__
,
6302 "S.strip([chars]) -> unicode\n\
6304 Return a copy of the string S with leading and trailing\n\
6305 whitespace removed.\n\
6306 If chars is given and not None, remove characters in chars instead.\n\
6307 If chars is a str, it will be converted to unicode before stripping");
6310 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
6312 if (PyTuple_GET_SIZE(args
) == 0)
6313 return do_strip(self
, BOTHSTRIP
); /* Common case */
6315 return do_argstrip(self
, BOTHSTRIP
, args
);
6319 PyDoc_STRVAR(lstrip__doc__
,
6320 "S.lstrip([chars]) -> unicode\n\
6322 Return a copy of the string S with leading whitespace removed.\n\
6323 If chars is given and not None, remove characters in chars instead.\n\
6324 If chars is a str, it will be converted to unicode before stripping");
6327 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
6329 if (PyTuple_GET_SIZE(args
) == 0)
6330 return do_strip(self
, LEFTSTRIP
); /* Common case */
6332 return do_argstrip(self
, LEFTSTRIP
, args
);
6336 PyDoc_STRVAR(rstrip__doc__
,
6337 "S.rstrip([chars]) -> unicode\n\
6339 Return a copy of the string S with trailing whitespace removed.\n\
6340 If chars is given and not None, remove characters in chars instead.\n\
6341 If chars is a str, it will be converted to unicode before stripping");
6344 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
6346 if (PyTuple_GET_SIZE(args
) == 0)
6347 return do_strip(self
, RIGHTSTRIP
); /* Common case */
6349 return do_argstrip(self
, RIGHTSTRIP
, args
);
6354 unicode_repeat(PyUnicodeObject
*str
, Py_ssize_t len
)
6364 if (len
== 1 && PyUnicode_CheckExact(str
)) {
6365 /* no repeat, return original string */
6367 return (PyObject
*) str
;
6370 /* ensure # of chars needed doesn't overflow int and # of bytes
6371 * needed doesn't overflow size_t
6373 nchars
= len
* str
->length
;
6374 if (len
&& nchars
/ len
!= str
->length
) {
6375 PyErr_SetString(PyExc_OverflowError
,
6376 "repeated string is too long");
6379 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
6380 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
6381 PyErr_SetString(PyExc_OverflowError
,
6382 "repeated string is too long");
6385 u
= _PyUnicode_New(nchars
);
6391 if (str
->length
== 1 && len
> 0) {
6392 Py_UNICODE_FILL(p
, str
->str
[0], len
);
6394 Py_ssize_t done
= 0; /* number of characters copied this far */
6395 if (done
< nchars
) {
6396 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
6399 while (done
< nchars
) {
6400 int n
= (done
<= nchars
-done
) ? done
: nchars
-done
;
6401 Py_UNICODE_COPY(p
+done
, p
, n
);
6406 return (PyObject
*) u
;
6409 PyObject
*PyUnicode_Replace(PyObject
*obj
,
6412 Py_ssize_t maxcount
)
6419 self
= PyUnicode_FromObject(obj
);
6422 str1
= PyUnicode_FromObject(subobj
);
6427 str2
= PyUnicode_FromObject(replobj
);
6433 result
= replace((PyUnicodeObject
*)self
,
6434 (PyUnicodeObject
*)str1
,
6435 (PyUnicodeObject
*)str2
,
6443 PyDoc_STRVAR(replace__doc__
,
6444 "S.replace (old, new[, maxsplit]) -> unicode\n\
6446 Return a copy of S with all occurrences of substring\n\
6447 old replaced by new. If the optional argument maxsplit is\n\
6448 given, only the first maxsplit occurrences are replaced.");
6451 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
6453 PyUnicodeObject
*str1
;
6454 PyUnicodeObject
*str2
;
6455 Py_ssize_t maxcount
= -1;
6458 if (!PyArg_ParseTuple(args
, "OO|n:replace", &str1
, &str2
, &maxcount
))
6460 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
6463 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
6469 result
= replace(self
, str1
, str2
, maxcount
);
6477 PyObject
*unicode_repr(PyObject
*unicode
)
6479 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
6480 PyUnicode_GET_SIZE(unicode
),
6484 PyDoc_STRVAR(rfind__doc__
,
6485 "S.rfind(sub [,start [,end]]) -> int\n\
6487 Return the highest index in S where substring sub is found,\n\
6488 such that sub is contained within s[start,end]. Optional\n\
6489 arguments start and end are interpreted as in slice notation.\n\
6491 Return -1 on failure.");
6494 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
6496 PyObject
*substring
;
6497 Py_ssize_t start
= 0;
6498 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6501 if (!PyArg_ParseTuple(args
, "O|O&O&:rfind", &substring
,
6502 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6504 substring
= PyUnicode_FromObject(substring
);
6508 result
= stringlib_rfind_slice(
6509 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6510 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6514 Py_DECREF(substring
);
6516 return PyInt_FromSsize_t(result
);
6519 PyDoc_STRVAR(rindex__doc__
,
6520 "S.rindex(sub [,start [,end]]) -> int\n\
6522 Like S.rfind() but raise ValueError when the substring is not found.");
6525 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
6527 PyObject
*substring
;
6528 Py_ssize_t start
= 0;
6529 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6532 if (!PyArg_ParseTuple(args
, "O|O&O&:rindex", &substring
,
6533 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6535 substring
= PyUnicode_FromObject(substring
);
6539 result
= stringlib_rfind_slice(
6540 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6541 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6545 Py_DECREF(substring
);
6548 PyErr_SetString(PyExc_ValueError
, "substring not found");
6551 return PyInt_FromSsize_t(result
);
6554 PyDoc_STRVAR(rjust__doc__
,
6555 "S.rjust(width[, fillchar]) -> unicode\n\
6557 Return S right justified in a Unicode string of length width. Padding is\n\
6558 done using the specified fill character (default is a space).");
6561 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
6564 Py_UNICODE fillchar
= ' ';
6566 if (!PyArg_ParseTuple(args
, "n|O&:rjust", &width
, convert_uc
, &fillchar
))
6569 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6571 return (PyObject
*) self
;
6574 return (PyObject
*) pad(self
, width
- self
->length
, 0, fillchar
);
6578 unicode_slice(PyUnicodeObject
*self
, Py_ssize_t start
, Py_ssize_t end
)
6580 /* standard clamping */
6585 if (end
> self
->length
)
6587 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
6588 /* full slice, return original string */
6590 return (PyObject
*) self
;
6595 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
6599 PyObject
*PyUnicode_Split(PyObject
*s
,
6601 Py_ssize_t maxsplit
)
6605 s
= PyUnicode_FromObject(s
);
6609 sep
= PyUnicode_FromObject(sep
);
6616 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
6623 PyDoc_STRVAR(split__doc__
,
6624 "S.split([sep [,maxsplit]]) -> list of strings\n\
6626 Return a list of the words in S, using sep as the\n\
6627 delimiter string. If maxsplit is given, at most maxsplit\n\
6628 splits are done. If sep is not specified or is None,\n\
6629 any whitespace string is a separator.");
6632 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
6634 PyObject
*substring
= Py_None
;
6635 Py_ssize_t maxcount
= -1;
6637 if (!PyArg_ParseTuple(args
, "|On:split", &substring
, &maxcount
))
6640 if (substring
== Py_None
)
6641 return split(self
, NULL
, maxcount
);
6642 else if (PyUnicode_Check(substring
))
6643 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
6645 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
6649 PyUnicode_Partition(PyObject
*str_in
, PyObject
*sep_in
)
6655 str_obj
= PyUnicode_FromObject(str_in
);
6658 sep_obj
= PyUnicode_FromObject(sep_in
);
6664 out
= stringlib_partition(
6665 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
6666 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
6677 PyUnicode_RPartition(PyObject
*str_in
, PyObject
*sep_in
)
6683 str_obj
= PyUnicode_FromObject(str_in
);
6686 sep_obj
= PyUnicode_FromObject(sep_in
);
6692 out
= stringlib_rpartition(
6693 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
6694 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
6703 PyDoc_STRVAR(partition__doc__
,
6704 "S.partition(sep) -> (head, sep, tail)\n\
6706 Searches for the separator sep in S, and returns the part before it,\n\
6707 the separator itself, and the part after it. If the separator is not\n\
6708 found, returns S and two empty strings.");
6711 unicode_partition(PyUnicodeObject
*self
, PyObject
*separator
)
6713 return PyUnicode_Partition((PyObject
*)self
, separator
);
6716 PyDoc_STRVAR(rpartition__doc__
,
6717 "S.rpartition(sep) -> (tail, sep, head)\n\
6719 Searches for the separator sep in S, starting at the end of S, and returns\n\
6720 the part before it, the separator itself, and the part after it. If the\n\
6721 separator is not found, returns two empty strings and S.");
6724 unicode_rpartition(PyUnicodeObject
*self
, PyObject
*separator
)
6726 return PyUnicode_RPartition((PyObject
*)self
, separator
);
6729 PyObject
*PyUnicode_RSplit(PyObject
*s
,
6731 Py_ssize_t maxsplit
)
6735 s
= PyUnicode_FromObject(s
);
6739 sep
= PyUnicode_FromObject(sep
);
6746 result
= rsplit((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
6753 PyDoc_STRVAR(rsplit__doc__
,
6754 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6756 Return a list of the words in S, using sep as the\n\
6757 delimiter string, starting at the end of the string and\n\
6758 working to the front. If maxsplit is given, at most maxsplit\n\
6759 splits are done. If sep is not specified, any whitespace string\n\
6763 unicode_rsplit(PyUnicodeObject
*self
, PyObject
*args
)
6765 PyObject
*substring
= Py_None
;
6766 Py_ssize_t maxcount
= -1;
6768 if (!PyArg_ParseTuple(args
, "|On:rsplit", &substring
, &maxcount
))
6771 if (substring
== Py_None
)
6772 return rsplit(self
, NULL
, maxcount
);
6773 else if (PyUnicode_Check(substring
))
6774 return rsplit(self
, (PyUnicodeObject
*)substring
, maxcount
);
6776 return PyUnicode_RSplit((PyObject
*)self
, substring
, maxcount
);
6779 PyDoc_STRVAR(splitlines__doc__
,
6780 "S.splitlines([keepends]]) -> list of strings\n\
6782 Return a list of the lines in S, breaking at line boundaries.\n\
6783 Line breaks are not included in the resulting list unless keepends\n\
6784 is given and true.");
6787 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
6791 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
6794 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
6798 PyObject
*unicode_str(PyUnicodeObject
*self
)
6800 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
6803 PyDoc_STRVAR(swapcase__doc__
,
6804 "S.swapcase() -> unicode\n\
6806 Return a copy of S with uppercase characters converted to lowercase\n\
6810 unicode_swapcase(PyUnicodeObject
*self
)
6812 return fixup(self
, fixswapcase
);
6815 PyDoc_STRVAR(translate__doc__
,
6816 "S.translate(table) -> unicode\n\
6818 Return a copy of the string S, where all characters have been mapped\n\
6819 through the given translation table, which must be a mapping of\n\
6820 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6821 Unmapped characters are left untouched. Characters mapped to None\n\
6825 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
6827 return PyUnicode_TranslateCharmap(self
->str
,
6833 PyDoc_STRVAR(upper__doc__
,
6834 "S.upper() -> unicode\n\
6836 Return a copy of S converted to uppercase.");
6839 unicode_upper(PyUnicodeObject
*self
)
6841 return fixup(self
, fixupper
);
6844 PyDoc_STRVAR(zfill__doc__
,
6845 "S.zfill(width) -> unicode\n\
6847 Pad a numeric string x with zeros on the left, to fill a field\n\
6848 of the specified width. The string x is never truncated.");
6851 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
6857 if (!PyArg_ParseTuple(args
, "n:zfill", &width
))
6860 if (self
->length
>= width
) {
6861 if (PyUnicode_CheckExact(self
)) {
6863 return (PyObject
*) self
;
6866 return PyUnicode_FromUnicode(
6867 PyUnicode_AS_UNICODE(self
),
6868 PyUnicode_GET_SIZE(self
)
6872 fill
= width
- self
->length
;
6874 u
= pad(self
, fill
, 0, '0');
6879 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
6880 /* move sign to beginning of string */
6881 u
->str
[0] = u
->str
[fill
];
6885 return (PyObject
*) u
;
6890 unicode_freelistsize(PyUnicodeObject
*self
)
6892 return PyInt_FromLong(unicode_freelist_size
);
6896 PyDoc_STRVAR(startswith__doc__
,
6897 "S.startswith(prefix[, start[, end]]) -> bool\n\
6899 Return True if S starts with the specified prefix, False otherwise.\n\
6900 With optional start, test S beginning at that position.\n\
6901 With optional end, stop comparing S at that position.\n\
6902 prefix can also be a tuple of strings to try.");
6905 unicode_startswith(PyUnicodeObject
*self
,
6909 PyUnicodeObject
*substring
;
6910 Py_ssize_t start
= 0;
6911 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6914 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &subobj
,
6915 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6917 if (PyTuple_Check(subobj
)) {
6919 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
6920 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
6921 PyTuple_GET_ITEM(subobj
, i
));
6922 if (substring
== NULL
)
6924 result
= tailmatch(self
, substring
, start
, end
, -1);
6925 Py_DECREF(substring
);
6930 /* nothing matched */
6933 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
6934 if (substring
== NULL
)
6936 result
= tailmatch(self
, substring
, start
, end
, -1);
6937 Py_DECREF(substring
);
6938 return PyBool_FromLong(result
);
6942 PyDoc_STRVAR(endswith__doc__
,
6943 "S.endswith(suffix[, start[, end]]) -> bool\n\
6945 Return True if S ends with the specified suffix, False otherwise.\n\
6946 With optional start, test S beginning at that position.\n\
6947 With optional end, stop comparing S at that position.\n\
6948 suffix can also be a tuple of strings to try.");
6951 unicode_endswith(PyUnicodeObject
*self
,
6955 PyUnicodeObject
*substring
;
6956 Py_ssize_t start
= 0;
6957 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6960 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &subobj
,
6961 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6963 if (PyTuple_Check(subobj
)) {
6965 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
6966 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
6967 PyTuple_GET_ITEM(subobj
, i
));
6968 if (substring
== NULL
)
6970 result
= tailmatch(self
, substring
, start
, end
, +1);
6971 Py_DECREF(substring
);
6978 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
6979 if (substring
== NULL
)
6982 result
= tailmatch(self
, substring
, start
, end
, +1);
6983 Py_DECREF(substring
);
6984 return PyBool_FromLong(result
);
6990 unicode_getnewargs(PyUnicodeObject
*v
)
6992 return Py_BuildValue("(u#)", v
->str
, v
->length
);
6996 static PyMethodDef unicode_methods
[] = {
6998 /* Order is according to common usage: often used methods should
6999 appear first, since lookup is done sequentially. */
7001 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
, encode__doc__
},
7002 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
7003 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
7004 {"rsplit", (PyCFunction
) unicode_rsplit
, METH_VARARGS
, rsplit__doc__
},
7005 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
7006 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
7007 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
7008 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
7009 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
7010 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
7011 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
7012 {"partition", (PyCFunction
) unicode_partition
, METH_O
, partition__doc__
},
7013 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
7014 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
7015 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
7016 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_VARARGS
, lstrip__doc__
},
7017 {"decode", (PyCFunction
) unicode_decode
, METH_VARARGS
, decode__doc__
},
7018 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7019 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
7020 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
7021 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
7022 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_VARARGS
, rstrip__doc__
},
7023 {"rpartition", (PyCFunction
) unicode_rpartition
, METH_O
, rpartition__doc__
},
7024 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
7025 {"strip", (PyCFunction
) unicode_strip
, METH_VARARGS
, strip__doc__
},
7026 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
7027 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
7028 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
7029 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
7030 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
7031 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
7032 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
7033 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
7034 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
7035 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
7036 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
7037 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
7038 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
7039 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
7040 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
7042 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
7046 /* This one is just used for debugging the implementation. */
7047 {"freelistsize", (PyCFunction
) unicode_freelistsize
, METH_NOARGS
},
7050 {"__getnewargs__", (PyCFunction
)unicode_getnewargs
, METH_NOARGS
},
7055 unicode_mod(PyObject
*v
, PyObject
*w
)
7057 if (!PyUnicode_Check(v
)) {
7058 Py_INCREF(Py_NotImplemented
);
7059 return Py_NotImplemented
;
7061 return PyUnicode_Format(v
, w
);
7064 static PyNumberMethods unicode_as_number
= {
7069 unicode_mod
, /*nb_remainder*/
7072 static PySequenceMethods unicode_as_sequence
= {
7073 (lenfunc
) unicode_length
, /* sq_length */
7074 PyUnicode_Concat
, /* sq_concat */
7075 (ssizeargfunc
) unicode_repeat
, /* sq_repeat */
7076 (ssizeargfunc
) unicode_getitem
, /* sq_item */
7077 (ssizessizeargfunc
) unicode_slice
, /* sq_slice */
7078 0, /* sq_ass_item */
7079 0, /* sq_ass_slice */
7080 PyUnicode_Contains
, /* sq_contains */
7084 unicode_subscript(PyUnicodeObject
* self
, PyObject
* item
)
7086 if (PyIndex_Check(item
)) {
7087 Py_ssize_t i
= PyNumber_AsSsize_t(item
, PyExc_IndexError
);
7088 if (i
== -1 && PyErr_Occurred())
7091 i
+= PyUnicode_GET_SIZE(self
);
7092 return unicode_getitem(self
, i
);
7093 } else if (PySlice_Check(item
)) {
7094 Py_ssize_t start
, stop
, step
, slicelength
, cur
, i
;
7095 Py_UNICODE
* source_buf
;
7096 Py_UNICODE
* result_buf
;
7099 if (PySlice_GetIndicesEx((PySliceObject
*)item
, PyUnicode_GET_SIZE(self
),
7100 &start
, &stop
, &step
, &slicelength
) < 0) {
7104 if (slicelength
<= 0) {
7105 return PyUnicode_FromUnicode(NULL
, 0);
7107 source_buf
= PyUnicode_AS_UNICODE((PyObject
*)self
);
7108 result_buf
= (Py_UNICODE
*)PyMem_MALLOC(slicelength
*
7109 sizeof(Py_UNICODE
));
7111 if (result_buf
== NULL
)
7112 return PyErr_NoMemory();
7114 for (cur
= start
, i
= 0; i
< slicelength
; cur
+= step
, i
++) {
7115 result_buf
[i
] = source_buf
[cur
];
7118 result
= PyUnicode_FromUnicode(result_buf
, slicelength
);
7119 PyMem_FREE(result_buf
);
7123 PyErr_SetString(PyExc_TypeError
, "string indices must be integers");
7128 static PyMappingMethods unicode_as_mapping
= {
7129 (lenfunc
)unicode_length
, /* mp_length */
7130 (binaryfunc
)unicode_subscript
, /* mp_subscript */
7131 (objobjargproc
)0, /* mp_ass_subscript */
7135 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
7140 PyErr_SetString(PyExc_SystemError
,
7141 "accessing non-existent unicode segment");
7144 *ptr
= (void *) self
->str
;
7145 return PyUnicode_GET_DATA_SIZE(self
);
7149 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, Py_ssize_t index
,
7152 PyErr_SetString(PyExc_TypeError
,
7153 "cannot use unicode as modifiable buffer");
7158 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
7162 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
7167 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
7174 PyErr_SetString(PyExc_SystemError
,
7175 "accessing non-existent unicode segment");
7178 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
7181 *ptr
= (void *) PyString_AS_STRING(str
);
7182 return PyString_GET_SIZE(str
);
7185 /* Helpers for PyUnicode_Format() */
7188 getnextarg(PyObject
*args
, Py_ssize_t arglen
, Py_ssize_t
*p_argidx
)
7190 Py_ssize_t argidx
= *p_argidx
;
7191 if (argidx
< arglen
) {
7196 return PyTuple_GetItem(args
, argidx
);
7198 PyErr_SetString(PyExc_TypeError
,
7199 "not enough arguments for format string");
7203 #define F_LJUST (1<<0)
7204 #define F_SIGN (1<<1)
7205 #define F_BLANK (1<<2)
7206 #define F_ALT (1<<3)
7207 #define F_ZERO (1<<4)
7210 strtounicode(Py_UNICODE
*buffer
, const char *charbuffer
)
7212 register Py_ssize_t i
;
7213 Py_ssize_t len
= strlen(charbuffer
);
7214 for (i
= len
- 1; i
>= 0; i
--)
7215 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
7221 doubletounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, double x
)
7225 PyOS_ascii_formatd((char *)buffer
, len
, format
, x
);
7226 result
= strtounicode(buffer
, (char *)buffer
);
7227 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
7231 longtounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, long x
)
7235 PyOS_snprintf((char *)buffer
, len
, format
, x
);
7236 result
= strtounicode(buffer
, (char *)buffer
);
7237 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
7240 /* XXX To save some code duplication, formatfloat/long/int could have been
7241 shared with stringobject.c, converting from 8-bit to Unicode after the
7242 formatting is done. */
7245 formatfloat(Py_UNICODE
*buf
,
7252 /* fmt = '%#.' + `prec` + `type`
7253 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
7257 x
= PyFloat_AsDouble(v
);
7258 if (x
== -1.0 && PyErr_Occurred())
7262 if (type
== 'f' && (fabs(x
) / 1e25
) >= 1e25
)
7264 /* Worst case length calc to ensure no buffer overrun:
7268 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7269 for any double rep.)
7270 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7273 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7274 len = 1 + 50 + 1 + prec = 52 + prec
7276 If prec=0 the effective precision is 1 (the leading digit is
7277 always given), therefore increase the length by one.
7280 if ((type
== 'g' && buflen
<= (size_t)10 + (size_t)prec
) ||
7281 (type
== 'f' && buflen
<= (size_t)53 + (size_t)prec
)) {
7282 PyErr_SetString(PyExc_OverflowError
,
7283 "formatted float is too long (precision too large?)");
7286 PyOS_snprintf(fmt
, sizeof(fmt
), "%%%s.%d%c",
7287 (flags
&F_ALT
) ? "#" : "",
7289 return doubletounicode(buf
, buflen
, fmt
, x
);
7293 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
7297 PyObject
*str
; /* temporary string object. */
7298 PyUnicodeObject
*result
;
7300 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
7303 result
= _PyUnicode_New(len
);
7308 for (i
= 0; i
< len
; i
++)
7309 result
->str
[i
] = buf
[i
];
7310 result
->str
[len
] = 0;
7312 return (PyObject
*)result
;
7316 formatint(Py_UNICODE
*buf
,
7323 /* fmt = '%#.' + `prec` + 'l' + `type`
7324 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7328 char fmt
[64]; /* plenty big enough! */
7332 x
= PyInt_AsLong(v
);
7333 if (x
== -1 && PyErr_Occurred())
7335 if (x
< 0 && type
== 'u') {
7338 if (x
< 0 && (type
== 'x' || type
== 'X' || type
== 'o'))
7345 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7346 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
7348 if (buflen
<= 14 || buflen
<= (size_t)3 + (size_t)prec
) {
7349 PyErr_SetString(PyExc_OverflowError
,
7350 "formatted integer is too long (precision too large?)");
7354 if ((flags
& F_ALT
) &&
7355 (type
== 'x' || type
== 'X')) {
7356 /* When converting under %#x or %#X, there are a number
7357 * of issues that cause pain:
7358 * - when 0 is being converted, the C standard leaves off
7359 * the '0x' or '0X', which is inconsistent with other
7360 * %#x/%#X conversions and inconsistent with Python's
7362 * - there are platforms that violate the standard and
7363 * convert 0 with the '0x' or '0X'
7364 * (Metrowerks, Compaq Tru64)
7365 * - there are platforms that give '0x' when converting
7366 * under %#X, but convert 0 in accordance with the
7367 * standard (OS/2 EMX)
7369 * We can achieve the desired consistency by inserting our
7370 * own '0x' or '0X' prefix, and substituting %x/%X in place
7373 * Note that this is the same approach as used in
7374 * formatint() in stringobject.c
7376 PyOS_snprintf(fmt
, sizeof(fmt
), "%s0%c%%.%dl%c",
7377 sign
, type
, prec
, type
);
7380 PyOS_snprintf(fmt
, sizeof(fmt
), "%s%%%s.%dl%c",
7381 sign
, (flags
&F_ALT
) ? "#" : "",
7385 return longtounicode(buf
, buflen
, fmt
, -x
);
7387 return longtounicode(buf
, buflen
, fmt
, x
);
7391 formatchar(Py_UNICODE
*buf
,
7395 /* presume that the buffer is at least 2 characters long */
7396 if (PyUnicode_Check(v
)) {
7397 if (PyUnicode_GET_SIZE(v
) != 1)
7399 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
7402 else if (PyString_Check(v
)) {
7403 if (PyString_GET_SIZE(v
) != 1)
7405 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
7409 /* Integer input truncated to a character */
7411 x
= PyInt_AsLong(v
);
7412 if (x
== -1 && PyErr_Occurred())
7414 #ifdef Py_UNICODE_WIDE
7415 if (x
< 0 || x
> 0x10ffff) {
7416 PyErr_SetString(PyExc_OverflowError
,
7417 "%c arg not in range(0x110000) "
7418 "(wide Python build)");
7422 if (x
< 0 || x
> 0xffff) {
7423 PyErr_SetString(PyExc_OverflowError
,
7424 "%c arg not in range(0x10000) "
7425 "(narrow Python build)");
7429 buf
[0] = (Py_UNICODE
) x
;
7435 PyErr_SetString(PyExc_TypeError
,
7436 "%c requires int or char");
7440 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7442 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7443 chars are formatted. XXX This is a magic number. Each formatting
7444 routine does bounds checking to ensure no overflow, but a better
7445 solution may be to malloc a buffer of appropriate size for each
7446 format. For now, the current solution is sufficient.
7448 #define FORMATBUFLEN (size_t)120
7450 PyObject
*PyUnicode_Format(PyObject
*format
,
7453 Py_UNICODE
*fmt
, *res
;
7454 Py_ssize_t fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
7456 PyUnicodeObject
*result
= NULL
;
7457 PyObject
*dict
= NULL
;
7460 if (format
== NULL
|| args
== NULL
) {
7461 PyErr_BadInternalCall();
7464 uformat
= PyUnicode_FromObject(format
);
7465 if (uformat
== NULL
)
7467 fmt
= PyUnicode_AS_UNICODE(uformat
);
7468 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
7470 reslen
= rescnt
= fmtcnt
+ 100;
7471 result
= _PyUnicode_New(reslen
);
7474 res
= PyUnicode_AS_UNICODE(result
);
7476 if (PyTuple_Check(args
)) {
7477 arglen
= PyTuple_Size(args
);
7484 if (args
->ob_type
->tp_as_mapping
&& !PyTuple_Check(args
) &&
7485 !PyObject_TypeCheck(args
, &PyBaseString_Type
))
7488 while (--fmtcnt
>= 0) {
7491 rescnt
= fmtcnt
+ 100;
7493 if (_PyUnicode_Resize(&result
, reslen
) < 0)
7495 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
7501 /* Got a format specifier */
7503 Py_ssize_t width
= -1;
7505 Py_UNICODE c
= '\0';
7508 PyObject
*temp
= NULL
;
7512 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
7516 Py_UNICODE
*keystart
;
7522 PyErr_SetString(PyExc_TypeError
,
7523 "format requires a mapping");
7529 /* Skip over balanced parentheses */
7530 while (pcount
> 0 && --fmtcnt
>= 0) {
7533 else if (*fmt
== '(')
7537 keylen
= fmt
- keystart
- 1;
7538 if (fmtcnt
< 0 || pcount
> 0) {
7539 PyErr_SetString(PyExc_ValueError
,
7540 "incomplete format key");
7544 /* keys are converted to strings using UTF-8 and
7545 then looked up since Python uses strings to hold
7546 variables names etc. in its namespaces and we
7547 wouldn't want to break common idioms. */
7548 key
= PyUnicode_EncodeUTF8(keystart
,
7552 key
= PyUnicode_FromUnicode(keystart
, keylen
);
7560 args
= PyObject_GetItem(dict
, key
);
7569 while (--fmtcnt
>= 0) {
7570 switch (c
= *fmt
++) {
7571 case '-': flags
|= F_LJUST
; continue;
7572 case '+': flags
|= F_SIGN
; continue;
7573 case ' ': flags
|= F_BLANK
; continue;
7574 case '#': flags
|= F_ALT
; continue;
7575 case '0': flags
|= F_ZERO
; continue;
7580 v
= getnextarg(args
, arglen
, &argidx
);
7583 if (!PyInt_Check(v
)) {
7584 PyErr_SetString(PyExc_TypeError
,
7588 width
= PyInt_AsLong(v
);
7596 else if (c
>= '0' && c
<= '9') {
7598 while (--fmtcnt
>= 0) {
7600 if (c
< '0' || c
> '9')
7602 if ((width
*10) / 10 != width
) {
7603 PyErr_SetString(PyExc_ValueError
,
7607 width
= width
*10 + (c
- '0');
7615 v
= getnextarg(args
, arglen
, &argidx
);
7618 if (!PyInt_Check(v
)) {
7619 PyErr_SetString(PyExc_TypeError
,
7623 prec
= PyInt_AsLong(v
);
7629 else if (c
>= '0' && c
<= '9') {
7631 while (--fmtcnt
>= 0) {
7632 c
= Py_CHARMASK(*fmt
++);
7633 if (c
< '0' || c
> '9')
7635 if ((prec
*10) / 10 != prec
) {
7636 PyErr_SetString(PyExc_ValueError
,
7640 prec
= prec
*10 + (c
- '0');
7645 if (c
== 'h' || c
== 'l' || c
== 'L') {
7651 PyErr_SetString(PyExc_ValueError
,
7652 "incomplete format");
7656 v
= getnextarg(args
, arglen
, &argidx
);
7666 /* presume that buffer length is at least 1 */
7673 if (PyUnicode_Check(v
) && c
== 's') {
7680 temp
= PyObject_Unicode(v
);
7682 temp
= PyObject_Repr(v
);
7685 if (PyUnicode_Check(temp
))
7686 /* nothing to do */;
7687 else if (PyString_Check(temp
)) {
7688 /* convert to string to Unicode */
7689 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
7690 PyString_GET_SIZE(temp
),
7700 PyErr_SetString(PyExc_TypeError
,
7701 "%s argument has non-string str()");
7705 pbuf
= PyUnicode_AS_UNICODE(temp
);
7706 len
= PyUnicode_GET_SIZE(temp
);
7707 if (prec
>= 0 && len
> prec
)
7719 if (PyLong_Check(v
)) {
7720 temp
= formatlong(v
, flags
, prec
, c
);
7723 pbuf
= PyUnicode_AS_UNICODE(temp
);
7724 len
= PyUnicode_GET_SIZE(temp
);
7729 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
7748 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
7759 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
7765 PyErr_Format(PyExc_ValueError
,
7766 "unsupported format character '%c' (0x%x) "
7768 (31<=c
&& c
<=126) ? (char)c
: '?',
7770 (Py_ssize_t
)(fmt
- 1 -
7771 PyUnicode_AS_UNICODE(uformat
)));
7775 if (*pbuf
== '-' || *pbuf
== '+') {
7779 else if (flags
& F_SIGN
)
7781 else if (flags
& F_BLANK
)
7788 if (rescnt
- (sign
!= 0) < width
) {
7790 rescnt
= width
+ fmtcnt
+ 100;
7797 if (_PyUnicode_Resize(&result
, reslen
) < 0) {
7801 res
= PyUnicode_AS_UNICODE(result
)
7811 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
7812 assert(pbuf
[0] == '0');
7813 assert(pbuf
[1] == c
);
7824 if (width
> len
&& !(flags
& F_LJUST
)) {
7828 } while (--width
> len
);
7833 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
7834 assert(pbuf
[0] == '0');
7835 assert(pbuf
[1] == c
);
7840 Py_UNICODE_COPY(res
, pbuf
, len
);
7843 while (--width
>= len
) {
7847 if (dict
&& (argidx
< arglen
) && c
!= '%') {
7848 PyErr_SetString(PyExc_TypeError
,
7849 "not all arguments converted during string formatting");
7856 if (argidx
< arglen
&& !dict
) {
7857 PyErr_SetString(PyExc_TypeError
,
7858 "not all arguments converted during string formatting");
7862 if (_PyUnicode_Resize(&result
, reslen
- rescnt
) < 0)
7868 return (PyObject
*)result
;
/* Buffer-procedure table for the unicode type.  It lists four
   unicode_buffer_* functions, each cast to the procedure type of the entry
   it fills (readbufferproc, writebufferproc, segcountproc, charbufferproc).
   PyUnicode_Type refers to this table through its tp_as_buffer slot. */
7879 static PyBufferProcs unicode_as_buffer
= {
7880 (readbufferproc
) unicode_buffer_getreadbuf
,
7881 (writebufferproc
) unicode_buffer_getwritebuf
,
7882 (segcountproc
) unicode_buffer_getsegcount
,
7883 (charbufferproc
) unicode_buffer_getcharbuf
,
/* Forward declaration: unicode_new() calls unicode_subtype_new() before the
   latter's definition appears in the file. */
7887 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
/* Constructor for the unicode type (installed in the tp_new slot of
   PyUnicode_Type).  Takes up to three optional arguments, positionally or
   by keyword: "string", "encoding" and "errors".
   NOTE(review): the declaration of x and some control-flow lines are not
   visible in this extract (the embedded line numbers skip); the comments
   below describe only the lines that are shown. */
7890 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
/* Keyword names accepted by the parser, terminated by 0. */
7893 static char *kwlist
[] = {"string", "encoding", "errors", 0};
7894 char *encoding
= NULL
;
7895 char *errors
= NULL
;
/* A request for any type other than exactly PyUnicode_Type is handed to
   unicode_subtype_new(). */
7897 if (type
!= &PyUnicode_Type
)
7898 return unicode_subtype_new(type
, args
, kwds
);
/* "|Oss": everything is optional -- one object, then two C strings. */
7899 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
7900 kwlist
, &x
, &encoding
, &errors
))
/* NOTE(review): the test guarding this return is on a line not shown here;
   presumably it is the "no string argument given" case -- confirm. */
7903 return (PyObject
*)_PyUnicode_New(0);
/* With neither encoding nor errors supplied, convert x with
   PyObject_Unicode(); otherwise decode it with the requested
   encoding/errors via PyUnicode_FromEncodedObject(). */
7904 if (encoding
== NULL
&& errors
== NULL
)
7905 return PyObject_Unicode(x
);
7907 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
/* Build an instance of a subtype of unicode.  The arguments are first turned
   into a plain unicode object by calling unicode_new() with PyUnicode_Type;
   an object of the requested type is then allocated with type->tp_alloc and
   the character data and hash of the temporary are copied into it.
   NOTE(review): the declaration of n, the NULL checks after unicode_new()
   and tp_alloc(), and the release of tmp are on lines not visible in this
   extract -- confirm against the full file. */
7911 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
7913 PyUnicodeObject
*tmp
, *pnew
;
7916 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
/* Let the base constructor do the argument parsing and conversion. */
7917 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
7920 assert(PyUnicode_Check(tmp
));
/* n is set to the temporary's length and passed to tp_alloc. */
7921 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
/* Separate character buffer of n+1 Py_UNICODE elements; presumably the
   extra element holds a terminator -- TODO confirm. */
7926 pnew
->str
= PyMem_NEW(Py_UNICODE
, n
+1);
/* Buffer allocation failed: forget the reference to the half-built object
   and report out-of-memory. */
7927 if (pnew
->str
== NULL
) {
7928 _Py_ForgetReference((PyObject
*)pnew
);
7931 return PyErr_NoMemory();
/* Copy all n+1 elements from the temporary's buffer. */
7933 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
/* Reuse the hash value stored on the temporary. */
7935 pnew
->hash
= tmp
->hash
;
7937 return (PyObject
*)pnew
;
/* Docstring for the unicode type; PyUnicode_Type installs it as tp_doc. */
7940 PyDoc_STRVAR(unicode_doc
,
7941 "unicode(string [, encoding[, errors]]) -> object\n\
7943 Create a new Unicode object from the given encoded string.\n\
7944 encoding defaults to the current default string encoding.\n\
7945 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
/* Type object for "unicode".  Wires the type's slots to the unicode_*
   implementations in this file: deallocation, repr/str, hashing, rich
   comparison, the number/sequence/mapping/buffer protocol tables, the method
   table and the constructor.  The base type is PyBaseString_Type.
   NOTE(review): this is a positional initializer and the embedded line
   numbers skip (e.g. 7952 -> 7954), so some slot entries are not shown in
   this extract; do not infer slot positions from this listing alone. */
7947 PyTypeObject PyUnicode_Type
= {
7948 PyObject_HEAD_INIT(&PyType_Type
)
7950 "unicode", /* tp_name */
7951 sizeof(PyUnicodeObject
), /* tp_basicsize */
7952 0, /* tp_itemsize */
7954 (destructor
)unicode_dealloc
, /* tp_dealloc */
7959 unicode_repr
, /* tp_repr */
7960 &unicode_as_number
, /* tp_as_number */
7961 &unicode_as_sequence
, /* tp_as_sequence */
7962 &unicode_as_mapping
, /* tp_as_mapping */
7963 (hashfunc
) unicode_hash
, /* tp_hash */
7965 (reprfunc
) unicode_str
, /* tp_str */
7966 PyObject_GenericGetAttr
, /* tp_getattro */
7967 0, /* tp_setattro */
7968 &unicode_as_buffer
, /* tp_as_buffer */
/* Py_TPFLAGS_BASETYPE is set; unicode_new() has a dedicated path for
   requests made with a type other than PyUnicode_Type itself. */
7969 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_CHECKTYPES
|
7970 Py_TPFLAGS_BASETYPE
, /* tp_flags */
7971 unicode_doc
, /* tp_doc */
7972 0, /* tp_traverse */
7974 PyUnicode_RichCompare
, /* tp_richcompare */
7975 0, /* tp_weaklistoffset */
7977 0, /* tp_iternext */
7978 unicode_methods
, /* tp_methods */
7981 &PyBaseString_Type
, /* tp_base */
7983 0, /* tp_descr_get */
7984 0, /* tp_descr_set */
7985 0, /* tp_dictoffset */
7988 unicode_new
, /* tp_new */
7989 PyObject_Del
, /* tp_free */
7992 /* Initialize the Unicode implementation */
/* Resets the module-level state (free list, shared empty string, default
   encoding, the 256-entry unicode_latin1 table), readies PyUnicode_Type and
   builds the line-break bloom mask.
   NOTE(review): the declaration of i and a few other lines are not visible
   in this extract. */
7994 void _PyUnicode_Init(void)
7998 /* XXX - move this array to unicodectype.c ? */
/* Code points fed to make_bloom_mask() below to build bloom_linebreak. */
7999 Py_UNICODE linebreak
[] = {
8000 0x000A, /* LINE FEED */
8001 0x000D, /* CARRIAGE RETURN */
8002 0x001C, /* FILE SEPARATOR */
8003 0x001D, /* GROUP SEPARATOR */
8004 0x001E, /* RECORD SEPARATOR */
8005 0x0085, /* NEXT LINE */
8006 0x2028, /* LINE SEPARATOR */
8007 0x2029, /* PARAGRAPH SEPARATOR */
8010 /* Init the implementation */
/* Start with an empty free list. */
8011 unicode_freelist
= NULL
;
8012 unicode_freelist_size
= 0;
/* Create the shared zero-length unicode object. */
8013 unicode_empty
= _PyUnicode_New(0);
/* The default encoding starts out as "ascii". */
8017 strcpy(unicode_default_encoding
, "ascii");
/* Clear all 256 entries of the unicode_latin1 table. */
8018 for (i
= 0; i
< 256; i
++)
8019 unicode_latin1
[i
] = NULL
;
/* Failure to ready the unicode type is treated as fatal. */
8020 if (PyType_Ready(&PyUnicode_Type
) < 0)
8021 Py_FatalError("Can't initialize 'unicode'");
8023 /* initialize the linebreak bloom filter */
8024 bloom_linebreak
= make_bloom_mask(
8025 linebreak
, sizeof(linebreak
) / sizeof(linebreak
[0])
/* NOTE(review): unlike the PyType_Ready() call for PyUnicode_Type above, the
   result of this call is not checked in the visible code -- confirm whether
   a failure here should be reported. */
8028 PyType_Ready(&EncodingMapType
);
8031 /* Finalize the Unicode implementation */
/* Releases what _PyUnicode_Init() and later use set up: the shared empty
   string, every populated unicode_latin1 entry, and the objects parked on
   the free list; the free-list head and size are reset at the end.
   NOTE(review): the declarations of i and u, and the lines that free each
   free-list entry's buffer and the entry itself, are not all visible in
   this extract. */
8034 _PyUnicode_Fini(void)
/* Drop the reference to the shared empty string and clear the pointer. */
8039 Py_XDECREF(unicode_empty
);
8040 unicode_empty
= NULL
;
/* Release each populated unicode_latin1 entry and clear its slot. */
8042 for (i
= 0; i
< 256; i
++) {
8043 if (unicode_latin1
[i
]) {
8044 Py_DECREF(unicode_latin1
[i
]);
8045 unicode_latin1
[i
] = NULL
;
/* Walk the free list.  The link to the next entry is read from the first
   pointer-sized word of the current one, and it is fetched before the
   current entry (v) is cleaned up. */
8049 for (u
= unicode_freelist
; u
!= NULL
;) {
8050 PyUnicodeObject
*v
= u
;
8051 u
= *(PyUnicodeObject
**)u
;
/* defenc may be NULL, hence the X variant. */
8054 Py_XDECREF(v
->defenc
);
/* Leave the free list empty. */
8057 unicode_freelist
= NULL
;
8058 unicode_freelist_size
= 0;
8069 indent-tabs-mode: nil