3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
45 #include "unicodeobject.h"
52 /* Limit for the Unicode object free list */
54 #define MAX_UNICODE_FREELIST_SIZE 1024
56 /* Limit for the Unicode object free list stay alive optimization.
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
62 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
66 Setting the limit to 0 effectively turns the feature off.
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
80 # define BYTEORDER_IS_LITTLE_ENDIAN
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
95 /* Free list for Unicode objects */
96 static PyUnicodeObject
*unicode_freelist
;
97 static int unicode_freelist_size
;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject
*unicode_empty
;
102 /* Single character Unicode strings in the Latin-1 range are being
104 static PyUnicodeObject
*unicode_latin1
[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding
[100];
116 PyUnicode_GetMax(void)
118 #ifdef Py_UNICODE_WIDE
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
127 /* --- Bloom Filters ----------------------------------------------------- */
129 /* Stuff to implement simple "bloom filters" for Unicode characters.
130 To keep things simple, we use a single bitmask, using the lowest 5
131 bits of each Unicode character as the bit index. */
133 /* the linebreak mask is set up by _PyUnicode_Init() below */
135 #define BLOOM_MASK unsigned long
137 static BLOOM_MASK bloom_linebreak
;
/* Test whether the bit selected by the low five bits of ch is set in mask.
   A zero result means no character with those low bits contributed to the
   mask; a non-zero result only means one might have, so callers follow up
   with an exact check.  The shifted constant is unsigned long so that
   bit 31 can be selected without overflowing a signed int, and mask is
   parenthesized so that expression arguments keep their meaning. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))
141 #define BLOOM_LINEBREAK(ch)\
142 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
144 Py_LOCAL_INLINE(BLOOM_MASK
) make_bloom_mask(Py_UNICODE
* ptr
, Py_ssize_t len
)
146 /* calculate simple bloom-style bitmask for a given unicode string */
152 for (i
= 0; i
< len
; i
++)
153 mask
|= (1 << (ptr
[i
] & 0x1F));
158 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr
, Py_UNICODE
* set
, Py_ssize_t setlen
)
162 for (i
= 0; i
< setlen
; i
++)
/* Non-zero when BLOOM(mask, chr) passes and unicode_member(chr, set, setlen)
   also reports a match.  The cheap BLOOM() test runs first and
   short-circuits the unicode_member() call when it fails.  The whole
   expansion is parenthesized so the macro can be negated, compared or
   combined with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
172 /* --- Unicode Object ----------------------------------------------------- */
175 int unicode_resize(register PyUnicodeObject
*unicode
,
180 /* Shortcut if there's nothing much to do. */
181 if (unicode
->length
== length
)
184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
188 if (unicode
== unicode_empty
||
189 (unicode
->length
== 1 &&
190 unicode
->str
[0] < 256U &&
191 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
192 PyErr_SetString(PyExc_SystemError
,
193 "can't resize shared unicode objects");
197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
199 safe to look at str[length] (without making any assumptions about what
202 oldstr
= unicode
->str
;
203 PyMem_RESIZE(unicode
->str
, Py_UNICODE
, length
+ 1);
205 unicode
->str
= (Py_UNICODE
*)oldstr
;
209 unicode
->str
[length
] = 0;
210 unicode
->length
= length
;
213 /* Reset the object caches */
214 if (unicode
->defenc
) {
215 Py_DECREF(unicode
->defenc
);
216 unicode
->defenc
= NULL
;
223 /* We allocate one more byte to make sure the string is
224 Ux0000 terminated -- XXX is this needed ?
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
232 PyUnicodeObject
*_PyUnicode_New(Py_ssize_t length
)
234 register PyUnicodeObject
*unicode
;
236 /* Optimization for empty strings */
237 if (length
== 0 && unicode_empty
!= NULL
) {
238 Py_INCREF(unicode_empty
);
239 return unicode_empty
;
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist
) {
244 unicode
= unicode_freelist
;
245 unicode_freelist
= *(PyUnicodeObject
**)unicode
;
246 unicode_freelist_size
--;
248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
250 if ((unicode
->length
< length
) &&
251 unicode_resize(unicode
, length
) < 0) {
252 PyMem_DEL(unicode
->str
);
257 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
259 PyObject_INIT(unicode
, &PyUnicode_Type
);
262 unicode
= PyObject_New(PyUnicodeObject
, &PyUnicode_Type
);
265 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
272 /* Initialize the first element to guard against cases where
273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
280 unicode
->str
[length
] = 0;
281 unicode
->length
= length
;
283 unicode
->defenc
= NULL
;
287 _Py_ForgetReference((PyObject
*)unicode
);
288 PyObject_Del(unicode
);
293 void unicode_dealloc(register PyUnicodeObject
*unicode
)
295 if (PyUnicode_CheckExact(unicode
) &&
296 unicode_freelist_size
< MAX_UNICODE_FREELIST_SIZE
) {
297 /* Keep-Alive optimization */
298 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
299 PyMem_DEL(unicode
->str
);
303 if (unicode
->defenc
) {
304 Py_DECREF(unicode
->defenc
);
305 unicode
->defenc
= NULL
;
307 /* Add to free list */
308 *(PyUnicodeObject
**)unicode
= unicode_freelist
;
309 unicode_freelist
= unicode
;
310 unicode_freelist_size
++;
313 PyMem_DEL(unicode
->str
);
314 Py_XDECREF(unicode
->defenc
);
315 unicode
->ob_type
->tp_free((PyObject
*)unicode
);
319 int PyUnicode_Resize(PyObject
**unicode
, Py_ssize_t length
)
321 register PyUnicodeObject
*v
;
323 /* Argument checks */
324 if (unicode
== NULL
) {
325 PyErr_BadInternalCall();
328 v
= (PyUnicodeObject
*)*unicode
;
329 if (v
== NULL
|| !PyUnicode_Check(v
) || v
->ob_refcnt
!= 1 || length
< 0) {
330 PyErr_BadInternalCall();
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
337 if (v
->length
!= length
&&
338 (v
== unicode_empty
|| v
->length
== 1)) {
339 PyUnicodeObject
*w
= _PyUnicode_New(length
);
342 Py_UNICODE_COPY(w
->str
, v
->str
,
343 length
< v
->length
? length
: v
->length
);
345 *unicode
= (PyObject
*)w
;
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v
, length
);
354 /* Internal API for use in unicodeobject.c only ! */
355 #define _PyUnicode_Resize(unicodevar, length) \
356 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
358 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
361 PyUnicodeObject
*unicode
;
363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
367 /* Optimization for empty strings */
368 if (size
== 0 && unicode_empty
!= NULL
) {
369 Py_INCREF(unicode_empty
);
370 return (PyObject
*)unicode_empty
;
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size
== 1 && *u
< 256) {
376 unicode
= unicode_latin1
[*u
];
378 unicode
= _PyUnicode_New(1);
381 unicode
->str
[0] = *u
;
382 unicode_latin1
[*u
] = unicode
;
385 return (PyObject
*)unicode
;
389 unicode
= _PyUnicode_New(size
);
393 /* Copy the Unicode data into the new object */
395 Py_UNICODE_COPY(unicode
->str
, u
, size
);
397 return (PyObject
*)unicode
;
402 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
405 PyUnicodeObject
*unicode
;
408 PyErr_BadInternalCall();
412 unicode
= _PyUnicode_New(size
);
416 /* Copy the wchar_t data into the new object */
417 #ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
421 register Py_UNICODE
*u
;
422 register Py_ssize_t i
;
423 u
= PyUnicode_AS_UNICODE(unicode
);
424 for (i
= size
; i
> 0; i
--)
429 return (PyObject
*)unicode
;
432 Py_ssize_t
PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
436 if (unicode
== NULL
) {
437 PyErr_BadInternalCall();
441 /* If possible, try to copy the 0-termination as well */
442 if (size
> PyUnicode_GET_SIZE(unicode
))
443 size
= PyUnicode_GET_SIZE(unicode
) + 1;
445 #ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
449 register Py_UNICODE
*u
;
450 register Py_ssize_t i
;
451 u
= PyUnicode_AS_UNICODE(unicode
);
452 for (i
= size
; i
> 0; i
--)
457 if (size
> PyUnicode_GET_SIZE(unicode
))
458 return PyUnicode_GET_SIZE(unicode
);
465 PyObject
*PyUnicode_FromOrdinal(int ordinal
)
469 #ifdef Py_UNICODE_WIDE
470 if (ordinal
< 0 || ordinal
> 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError
,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
477 if (ordinal
< 0 || ordinal
> 0xffff) {
478 PyErr_SetString(PyExc_ValueError
,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
485 s
[0] = (Py_UNICODE
)ordinal
;
486 return PyUnicode_FromUnicode(s
, 1);
489 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj
)) {
497 if (PyUnicode_Check(obj
)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
501 PyUnicode_GET_SIZE(obj
));
503 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
506 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
507 const char *encoding
,
510 const char *s
= NULL
;
515 PyErr_BadInternalCall();
520 /* For b/w compatibility we also accept Unicode objects provided
521 that no encoding is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
525 NOTE: This API should really only be used for objects which
526 represent *encoded* Unicode !
529 if (PyUnicode_Check(obj
)) {
531 PyErr_SetString(PyExc_TypeError
,
532 "decoding Unicode is not supported");
535 return PyObject_Unicode(obj
);
538 if (PyUnicode_Check(obj
)) {
539 PyErr_SetString(PyExc_TypeError
,
540 "decoding Unicode is not supported");
546 if (PyString_Check(obj
)) {
547 s
= PyString_AS_STRING(obj
);
548 len
= PyString_GET_SIZE(obj
);
550 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError
))
554 PyErr_Format(PyExc_TypeError
,
555 "coercing to Unicode: need string or buffer, "
557 obj
->ob_type
->tp_name
);
561 /* Convert to Unicode */
563 Py_INCREF(unicode_empty
);
564 v
= (PyObject
*)unicode_empty
;
567 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
575 PyObject
*PyUnicode_Decode(const char *s
,
577 const char *encoding
,
580 PyObject
*buffer
= NULL
, *unicode
;
582 if (encoding
== NULL
)
583 encoding
= PyUnicode_GetDefaultEncoding();
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding
, "utf-8") == 0)
587 return PyUnicode_DecodeUTF8(s
, size
, errors
);
588 else if (strcmp(encoding
, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s
, size
, errors
);
590 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding
, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s
, size
, errors
);
594 else if (strcmp(encoding
, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s
, size
, errors
);
597 /* Decode via the codec registry */
598 buffer
= PyBuffer_FromMemory((void *)s
, size
);
601 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
604 if (!PyUnicode_Check(unicode
)) {
605 PyErr_Format(PyExc_TypeError
,
606 "decoder did not return an unicode object (type=%.400s)",
607 unicode
->ob_type
->tp_name
);
619 PyObject
*PyUnicode_AsDecodedObject(PyObject
*unicode
,
620 const char *encoding
,
625 if (!PyUnicode_Check(unicode
)) {
630 if (encoding
== NULL
)
631 encoding
= PyUnicode_GetDefaultEncoding();
633 /* Decode via the codec registry */
634 v
= PyCodec_Decode(unicode
, encoding
, errors
);
643 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
645 const char *encoding
,
648 PyObject
*v
, *unicode
;
650 unicode
= PyUnicode_FromUnicode(s
, size
);
653 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
658 PyObject
*PyUnicode_AsEncodedObject(PyObject
*unicode
,
659 const char *encoding
,
664 if (!PyUnicode_Check(unicode
)) {
669 if (encoding
== NULL
)
670 encoding
= PyUnicode_GetDefaultEncoding();
672 /* Encode via the codec registry */
673 v
= PyCodec_Encode(unicode
, encoding
, errors
);
682 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
683 const char *encoding
,
688 if (!PyUnicode_Check(unicode
)) {
693 if (encoding
== NULL
)
694 encoding
= PyUnicode_GetDefaultEncoding();
696 /* Shortcuts for common default encodings */
697 if (errors
== NULL
) {
698 if (strcmp(encoding
, "utf-8") == 0)
699 return PyUnicode_AsUTF8String(unicode
);
700 else if (strcmp(encoding
, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode
);
702 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding
, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode
);
706 else if (strcmp(encoding
, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode
);
710 /* Encode via the codec registry */
711 v
= PyCodec_Encode(unicode
, encoding
, errors
);
714 if (!PyString_Check(v
)) {
715 PyErr_Format(PyExc_TypeError
,
716 "encoder did not return a string object (type=%.400s)",
717 v
->ob_type
->tp_name
);
727 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
730 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
734 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
735 if (v
&& errors
== NULL
)
736 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
740 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
742 if (!PyUnicode_Check(unicode
)) {
746 return PyUnicode_AS_UNICODE(unicode
);
752 Py_ssize_t
PyUnicode_GetSize(PyObject
*unicode
)
754 if (!PyUnicode_Check(unicode
)) {
758 return PyUnicode_GET_SIZE(unicode
);
764 const char *PyUnicode_GetDefaultEncoding(void)
766 return unicode_default_encoding
;
769 int PyUnicode_SetDefaultEncoding(const char *encoding
)
773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v
= _PyCodec_Lookup(encoding
);
779 strncpy(unicode_default_encoding
,
781 sizeof(unicode_default_encoding
));
788 /* error handling callback helper:
789 build arguments, call the callback and check the arguments,
790 if no exception occurred, copy the replacement to the output
791 and adjust various state variables.
792 return 0 on success, -1 on error
796 int unicode_decode_call_errorhandler(const char *errors
, PyObject
**errorHandler
,
797 const char *encoding
, const char *reason
,
798 const char *input
, Py_ssize_t insize
, Py_ssize_t
*startinpos
, Py_ssize_t
*endinpos
, PyObject
**exceptionObject
, const char **inptr
,
799 PyObject
**output
, Py_ssize_t
*outpos
, Py_UNICODE
**outptr
)
801 static char *argparse
= "O!n;decoding error handler must return (unicode, int) tuple";
803 PyObject
*restuple
= NULL
;
804 PyObject
*repunicode
= NULL
;
805 Py_ssize_t outsize
= PyUnicode_GET_SIZE(*output
);
806 Py_ssize_t requiredsize
;
812 if (*errorHandler
== NULL
) {
813 *errorHandler
= PyCodec_LookupError(errors
);
814 if (*errorHandler
== NULL
)
818 if (*exceptionObject
== NULL
) {
819 *exceptionObject
= PyUnicodeDecodeError_Create(
820 encoding
, input
, insize
, *startinpos
, *endinpos
, reason
);
821 if (*exceptionObject
== NULL
)
825 if (PyUnicodeDecodeError_SetStart(*exceptionObject
, *startinpos
))
827 if (PyUnicodeDecodeError_SetEnd(*exceptionObject
, *endinpos
))
829 if (PyUnicodeDecodeError_SetReason(*exceptionObject
, reason
))
833 restuple
= PyObject_CallFunctionObjArgs(*errorHandler
, *exceptionObject
, NULL
);
834 if (restuple
== NULL
)
836 if (!PyTuple_Check(restuple
)) {
837 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
840 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
, &repunicode
, &newpos
))
843 newpos
= insize
+newpos
;
844 if (newpos
<0 || newpos
>insize
) {
845 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", newpos
);
849 /* need more space? (at least enough for what we
850 have+the replacement+the rest of the string (starting
851 at the new input position), so we won't have to check space
852 when there are no errors in the rest of the string) */
853 repptr
= PyUnicode_AS_UNICODE(repunicode
);
854 repsize
= PyUnicode_GET_SIZE(repunicode
);
855 requiredsize
= *outpos
+ repsize
+ insize
-newpos
;
856 if (requiredsize
> outsize
) {
857 if (requiredsize
<2*outsize
)
858 requiredsize
= 2*outsize
;
859 if (PyUnicode_Resize(output
, requiredsize
) < 0)
861 *outptr
= PyUnicode_AS_UNICODE(*output
) + *outpos
;
864 *inptr
= input
+ newpos
;
865 Py_UNICODE_COPY(*outptr
, repptr
, repsize
);
872 Py_XDECREF(restuple
);
876 /* --- UTF-7 Codec -------------------------------------------------------- */
878 /* see RFC2152 for details */
881 char utf7_special
[128] = {
882 /* indicate whether a UTF-7 character is special i.e. cannot be directly
886 2 - whitespace (optional)
887 3 - RFC2152 Set O (optional) */
888 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
889 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
890 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
891 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
892 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
893 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
894 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
895 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
899 /* Note: The comparison (c) <= 0 is a trick to work-around gcc
900 warnings about the comparison always being false; since
901 utf7_special[0] is 1, we can safely make that one comparison
/* Non-zero when character c is "special" for UTF-7 purposes: c is outside
   1..127, or utf7_special[c] is 1, or it is 2 (whitespace) while encodeWS
   is set, or it is 3 (RFC2152 Set O) while encodeO is set.
   The (c) <= 0 test also catches c == 0; that is harmless because
   utf7_special[0] is 1 anyway.  encodeO and encodeWS are parenthesized so
   that expression arguments such as (a || b) keep their meaning.
   c is evaluated several times, so it must be free of side effects. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))
910 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
912 (isalnum(c) || (c) == '+' || (c) == '/')
914 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
915 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
917 #define ENCODE(out, ch, bits) \
918 while (bits >= 6) { \
919 *out++ = B64(ch >> (bits-6)); \
923 #define DECODE(out, ch, bits, surrogate) \
924 while (bits >= 16) { \
925 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
928 /* We have already generated an error for the high surrogate \
929 so let's not bother seeing if the low surrogate is correct or not */ \
931 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
932 /* This is a surrogate pair. Unfortunately we can't represent \
933 it in a 16-bit character */ \
935 errmsg = "code pairs are not supported"; \
942 PyObject
*PyUnicode_DecodeUTF7(const char *s
,
946 const char *starts
= s
;
947 Py_ssize_t startinpos
;
951 PyUnicodeObject
*unicode
;
953 const char *errmsg
= "";
955 unsigned int bitsleft
= 0;
956 unsigned long charsleft
= 0;
958 PyObject
*errorHandler
= NULL
;
959 PyObject
*exc
= NULL
;
961 unicode
= _PyUnicode_New(size
);
965 return (PyObject
*)unicode
;
976 if ((ch
== '-') || !B64CHAR(ch
)) {
980 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
982 /* The shift sequence has a partial character in it. If
983 bitsleft < 6 then we could just classify it as padding
984 but that is not the case here */
986 errmsg
= "partial character in shift sequence";
989 /* According to RFC2152 the remaining bits should be zero. We
990 choose to signal an error/insert a replacement character
991 here to indicate the possibility of a misencoded character. */
993 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
994 if (bitsleft
&& charsleft
<< (sizeof(charsleft
) * 8 - bitsleft
)) {
995 errmsg
= "non-zero padding bits in shift sequence";
1000 if ((s
< e
) && (*(s
) == '-')) {
1004 } else if (SPECIAL(ch
,0,0)) {
1005 errmsg
= "unexpected special character";
1011 charsleft
= (charsleft
<< 6) | UB64(ch
);
1014 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
1017 else if ( ch
== '+' ) {
1018 startinpos
= s
-starts
;
1020 if (s
< e
&& *s
== '-') {
1029 else if (SPECIAL(ch
,0,0)) {
1030 errmsg
= "unexpected special character";
1040 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1041 endinpos
= s
-starts
;
1042 if (unicode_decode_call_errorhandler(
1043 errors
, &errorHandler
,
1045 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1046 (PyObject
**)&unicode
, &outpos
, &p
))
1051 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1053 if (unicode_decode_call_errorhandler(
1054 errors
, &errorHandler
,
1055 "utf7", "unterminated shift sequence",
1056 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1057 (PyObject
**)&unicode
, &outpos
, &p
))
1063 if (_PyUnicode_Resize(&unicode
, p
- PyUnicode_AS_UNICODE(unicode
)) < 0)
1066 Py_XDECREF(errorHandler
);
1068 return (PyObject
*)unicode
;
1071 Py_XDECREF(errorHandler
);
1078 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
1081 int encodeWhiteSpace
,
1085 /* It might be possible to tighten this worst case */
1086 Py_ssize_t cbAllocated
= 5 * size
;
1089 unsigned int bitsleft
= 0;
1090 unsigned long charsleft
= 0;
1095 return PyString_FromStringAndSize(NULL
, 0);
1097 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
1101 start
= out
= PyString_AS_STRING(v
);
1102 for (;i
< size
; ++i
) {
1103 Py_UNICODE ch
= s
[i
];
1109 } else if (SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
1113 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
1114 inShift
= bitsleft
> 0;
1119 if (!SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
1120 *out
++ = B64(charsleft
<< (6-bitsleft
));
1123 /* Characters not in the BASE64 set implicitly unshift the sequence
1124 so no '-' is required, except if the character is itself a '-' */
1125 if (B64CHAR(ch
) || ch
== '-') {
1132 charsleft
= (charsleft
<< 16) | ch
;
1133 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
1135 /* If the next character is special then we don't need to terminate
1136 the shift sequence. If the next character is not a BASE64 character
1137 or '-' then the shift sequence will be terminated implicitly and we
1138 don't have to insert a '-'. */
1140 if (bitsleft
== 0) {
1142 Py_UNICODE ch2
= s
[i
+1];
1144 if (SPECIAL(ch2
, encodeSetO
, encodeWhiteSpace
)) {
1146 } else if (B64CHAR(ch2
) || ch2
== '-') {
1163 *out
++= B64(charsleft
<< (6-bitsleft
) );
1167 _PyString_Resize(&v
, out
- start
);
1178 /* --- UTF-8 Codec -------------------------------------------------------- */
1181 char utf8_code_length
[256] = {
1182 /* Map UTF-8 encoded prefix byte to sequence length. zero means
1183 illegal prefix. see RFC 2279 for details */
1184 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1185 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1186 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1187 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1188 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1189 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1190 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1191 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1192 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1194 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1195 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1196 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1197 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1198 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1199 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
1202 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1206 return PyUnicode_DecodeUTF8Stateful(s
, size
, errors
, NULL
);
1209 PyObject
*PyUnicode_DecodeUTF8Stateful(const char *s
,
1212 Py_ssize_t
*consumed
)
1214 const char *starts
= s
;
1216 Py_ssize_t startinpos
;
1217 Py_ssize_t endinpos
;
1220 PyUnicodeObject
*unicode
;
1222 const char *errmsg
= "";
1223 PyObject
*errorHandler
= NULL
;
1224 PyObject
*exc
= NULL
;
1226 /* Note: size will always be longer than the resulting Unicode
1228 unicode
= _PyUnicode_New(size
);
1234 return (PyObject
*)unicode
;
1237 /* Unpack UTF-8 encoded data */
1242 Py_UCS4 ch
= (unsigned char)*s
;
1245 *p
++ = (Py_UNICODE
)ch
;
1250 n
= utf8_code_length
[ch
];
1256 errmsg
= "unexpected end of data";
1257 startinpos
= s
-starts
;
1266 errmsg
= "unexpected code byte";
1267 startinpos
= s
-starts
;
1268 endinpos
= startinpos
+1;
1272 errmsg
= "internal error";
1273 startinpos
= s
-starts
;
1274 endinpos
= startinpos
+1;
1278 if ((s
[1] & 0xc0) != 0x80) {
1279 errmsg
= "invalid data";
1280 startinpos
= s
-starts
;
1281 endinpos
= startinpos
+2;
1284 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
1286 startinpos
= s
-starts
;
1287 endinpos
= startinpos
+2;
1288 errmsg
= "illegal encoding";
1292 *p
++ = (Py_UNICODE
)ch
;
1296 if ((s
[1] & 0xc0) != 0x80 ||
1297 (s
[2] & 0xc0) != 0x80) {
1298 errmsg
= "invalid data";
1299 startinpos
= s
-starts
;
1300 endinpos
= startinpos
+3;
1303 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
1305 /* Note: UTF-8 encodings of surrogates are considered
1306 legal UTF-8 sequences;
1308 XXX For wide builds (UCS-4) we should probably try
1309 to recombine the surrogates into a single code
1312 errmsg
= "illegal encoding";
1313 startinpos
= s
-starts
;
1314 endinpos
= startinpos
+3;
1318 *p
++ = (Py_UNICODE
)ch
;
1322 if ((s
[1] & 0xc0) != 0x80 ||
1323 (s
[2] & 0xc0) != 0x80 ||
1324 (s
[3] & 0xc0) != 0x80) {
1325 errmsg
= "invalid data";
1326 startinpos
= s
-starts
;
1327 endinpos
= startinpos
+4;
1330 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
1331 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
1332 /* validate and convert to UTF-16 */
1333 if ((ch
< 0x10000) /* minimum value allowed for 4
1335 || (ch
> 0x10ffff)) /* maximum value allowed for
1338 errmsg
= "illegal encoding";
1339 startinpos
= s
-starts
;
1340 endinpos
= startinpos
+4;
1343 #ifdef Py_UNICODE_WIDE
1344 *p
++ = (Py_UNICODE
)ch
;
1346 /* compute and append the two surrogates: */
1348 /* translate from 10000..10FFFF to 0..FFFF */
1351 /* high surrogate = top 10 bits added to D800 */
1352 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
1354 /* low surrogate = bottom 10 bits added to DC00 */
1355 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
1360 /* Other sizes are only needed for UCS-4 */
1361 errmsg
= "unsupported Unicode code range";
1362 startinpos
= s
-starts
;
1363 endinpos
= startinpos
+n
;
1370 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1371 if (unicode_decode_call_errorhandler(
1372 errors
, &errorHandler
,
1374 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1375 (PyObject
**)&unicode
, &outpos
, &p
))
1379 *consumed
= s
-starts
;
1382 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
1385 Py_XDECREF(errorHandler
);
1387 return (PyObject
*)unicode
;
1390 Py_XDECREF(errorHandler
);
1396 /* Allocation strategy: if the string is short, convert into a stack buffer
1397 and allocate exactly as much space needed at the end. Else allocate the
1398 maximum possible needed (4 result bytes per Unicode character), and return
1399 the excess memory at the end.
1402 PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
1406 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1408 Py_ssize_t i
; /* index into s of next input byte */
1409 PyObject
*v
; /* result string object */
1410 char *p
; /* next free byte in output buffer */
1411 Py_ssize_t nallocated
; /* number of result bytes allocated */
1412 Py_ssize_t nneeded
; /* number of result bytes needed */
1413 char stackbuf
[MAX_SHORT_UNICHARS
* 4];
1418 if (size
<= MAX_SHORT_UNICHARS
) {
1419 /* Write into the stack buffer; nallocated can't overflow.
1420 * At the end, we'll allocate exactly as much heap space as it
1421 * turns out we need.
1423 nallocated
= Py_SAFE_DOWNCAST(sizeof(stackbuf
), size_t, int);
1424 v
= NULL
; /* will allocate after we're done */
1428 /* Overallocate on the heap, and give the excess back at the end. */
1429 nallocated
= size
* 4;
1430 if (nallocated
/ 4 != size
) /* overflow! */
1431 return PyErr_NoMemory();
1432 v
= PyString_FromStringAndSize(NULL
, nallocated
);
1435 p
= PyString_AS_STRING(v
);
1438 for (i
= 0; i
< size
;) {
1439 Py_UCS4 ch
= s
[i
++];
1445 else if (ch
< 0x0800) {
1446 /* Encode code points below U+0800 as a two-byte sequence (not only Latin-1) */
1447 *p
++ = (char)(0xc0 | (ch
>> 6));
1448 *p
++ = (char)(0x80 | (ch
& 0x3f));
1451 /* Encode UCS2 Unicode ordinals */
1453 /* Special case: check for high surrogate */
1454 if (0xD800 <= ch
&& ch
<= 0xDBFF && i
!= size
) {
1456 /* Check for low surrogate and combine the two to
1457 form a UCS4 value */
1458 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1459 ch
= ((ch
- 0xD800) << 10 | (ch2
- 0xDC00)) + 0x10000;
1463 /* Fall through: handles isolated high surrogates */
1465 *p
++ = (char)(0xe0 | (ch
>> 12));
1466 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
1467 *p
++ = (char)(0x80 | (ch
& 0x3f));
1471 /* Encode UCS4 Unicode ordinals */
1472 *p
++ = (char)(0xf0 | (ch
>> 18));
1473 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
1474 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
1475 *p
++ = (char)(0x80 | (ch
& 0x3f));
1480 /* This was stack allocated. */
1481 nneeded
= p
- stackbuf
;
1482 assert(nneeded
<= nallocated
);
1483 v
= PyString_FromStringAndSize(stackbuf
, nneeded
);
1486 /* Cut back to size actually needed. */
1487 nneeded
= p
- PyString_AS_STRING(v
);
1488 assert(nneeded
<= nallocated
);
1489 _PyString_Resize(&v
, nneeded
);
1493 #undef MAX_SHORT_UNICHARS
1496 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
1498 if (!PyUnicode_Check(unicode
)) {
1499 PyErr_BadArgument();
1502 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
1503 PyUnicode_GET_SIZE(unicode
),
1507 /* --- UTF-16 Codec ------------------------------------------------------- */
1510 PyUnicode_DecodeUTF16(const char *s
,
1515 return PyUnicode_DecodeUTF16Stateful(s
, size
, errors
, byteorder
, NULL
);
1519 PyUnicode_DecodeUTF16Stateful(const char *s
,
1523 Py_ssize_t
*consumed
)
1525 const char *starts
= s
;
1526 Py_ssize_t startinpos
;
1527 Py_ssize_t endinpos
;
1529 PyUnicodeObject
*unicode
;
1531 const unsigned char *q
, *e
;
1532 int bo
= 0; /* assume native ordering by default */
1533 const char *errmsg
= "";
1534 /* Offsets from q for retrieving byte pairs in the right order. */
1535 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1536 int ihi
= 1, ilo
= 0;
1538 int ihi
= 0, ilo
= 1;
1540 PyObject
*errorHandler
= NULL
;
1541 PyObject
*exc
= NULL
;
1543 /* Note: size will always be longer than the resulting Unicode
1545 unicode
= _PyUnicode_New(size
);
1549 return (PyObject
*)unicode
;
1551 /* Unpack UTF-16 encoded data */
1553 q
= (unsigned char *)s
;
1559 /* Check for BOM marks (U+FEFF) in the input and adjust current
1560 byte order setting accordingly. In native mode, the leading BOM
1561 mark is skipped, in all other modes, it is copied to the output
1562 stream as-is (giving a ZWNBSP character). */
1565 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
1566 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1567 if (bom
== 0xFEFF) {
1571 else if (bom
== 0xFFFE) {
1576 if (bom
== 0xFEFF) {
1580 else if (bom
== 0xFFFE) {
1601 /* remaining bytes at the end? (size should be even) */
1605 errmsg
= "truncated data";
1606 startinpos
= ((const char *)q
)-starts
;
1607 endinpos
= ((const char *)e
)-starts
;
1609 /* The remaining input chars are ignored if the callback
1610 chooses to skip the input */
1612 ch
= (q
[ihi
] << 8) | q
[ilo
];
1616 if (ch
< 0xD800 || ch
> 0xDFFF) {
1621 /* UTF-16 code pair: */
1623 errmsg
= "unexpected end of data";
1624 startinpos
= (((const char *)q
)-2)-starts
;
1625 endinpos
= ((const char *)e
)-starts
;
1628 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
1629 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
1631 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1632 #ifndef Py_UNICODE_WIDE
1636 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
1641 errmsg
= "illegal UTF-16 surrogate";
1642 startinpos
= (((const char *)q
)-4)-starts
;
1643 endinpos
= startinpos
+2;
1648 errmsg
= "illegal encoding";
1649 startinpos
= (((const char *)q
)-2)-starts
;
1650 endinpos
= startinpos
+2;
1651 /* Fall through to report the error */
1654 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1655 if (unicode_decode_call_errorhandler(
1656 errors
, &errorHandler
,
1658 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
1659 (PyObject
**)&unicode
, &outpos
, &p
))
1667 *consumed
= (const char *)q
-starts
;
1670 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
1673 Py_XDECREF(errorHandler
);
1675 return (PyObject
*)unicode
;
1679 Py_XDECREF(errorHandler
);
1685 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
1692 #ifdef Py_UNICODE_WIDE
1695 const int pairs
= 0;
1697 /* Offsets from p for storing byte pairs in the right order. */
1698 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1699 int ihi
= 1, ilo
= 0;
1701 int ihi
= 0, ilo
= 1;
1704 #define STORECHAR(CH) \
1706 p[ihi] = ((CH) >> 8) & 0xff; \
1707 p[ilo] = (CH) & 0xff; \
1711 #ifdef Py_UNICODE_WIDE
1712 for (i
= pairs
= 0; i
< size
; i
++)
1713 if (s
[i
] >= 0x10000)
1716 v
= PyString_FromStringAndSize(NULL
,
1717 2 * (size
+ pairs
+ (byteorder
== 0)));
1721 p
= (unsigned char *)PyString_AS_STRING(v
);
1727 if (byteorder
== -1) {
1732 else if (byteorder
== 1) {
1738 while (size
-- > 0) {
1739 Py_UNICODE ch
= *s
++;
1741 #ifdef Py_UNICODE_WIDE
1742 if (ch
>= 0x10000) {
1743 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
1744 ch
= 0xD800 | ((ch
-0x10000) >> 10);
1755 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
1757 if (!PyUnicode_Check(unicode
)) {
1758 PyErr_BadArgument();
1761 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
1762 PyUnicode_GET_SIZE(unicode
),
1767 /* --- Unicode Escape Codec ----------------------------------------------- */
1769 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
1771 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
1775 const char *starts
= s
;
1776 Py_ssize_t startinpos
;
1777 Py_ssize_t endinpos
;
1784 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
1785 PyObject
*errorHandler
= NULL
;
1786 PyObject
*exc
= NULL
;
1788 /* Escaped strings are never shorter than the resulting
1789 Unicode string, so we start with size here and then reduce the
1790 length after conversion to the true value.
1791 (but if the error callback returns a long replacement string
1792 we'll have to allocate more space) */
1793 v
= _PyUnicode_New(size
);
1797 return (PyObject
*)v
;
1799 p
= PyUnicode_AS_UNICODE(v
);
1807 /* Non-escape characters are interpreted as Unicode ordinals */
1809 *p
++ = (unsigned char) *s
++;
1813 startinpos
= s
-starts
;
1820 case '\\': *p
++ = '\\'; break;
1821 case '\'': *p
++ = '\''; break;
1822 case '\"': *p
++ = '\"'; break;
1823 case 'b': *p
++ = '\b'; break;
1824 case 'f': *p
++ = '\014'; break; /* FF */
1825 case 't': *p
++ = '\t'; break;
1826 case 'n': *p
++ = '\n'; break;
1827 case 'r': *p
++ = '\r'; break;
1828 case 'v': *p
++ = '\013'; break; /* VT */
1829 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
1831 /* \OOO (octal) escapes */
1832 case '0': case '1': case '2': case '3':
1833 case '4': case '5': case '6': case '7':
1835 if ('0' <= *s
&& *s
<= '7') {
1836 x
= (x
<<3) + *s
++ - '0';
1837 if ('0' <= *s
&& *s
<= '7')
1838 x
= (x
<<3) + *s
++ - '0';
1847 message
= "truncated \\xXX escape";
1853 message
= "truncated \\uXXXX escape";
1859 message
= "truncated \\UXXXXXXXX escape";
1862 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1865 if (unicode_decode_call_errorhandler(
1866 errors
, &errorHandler
,
1867 "unicodeescape", "end of string in escape sequence",
1868 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1869 (PyObject
**)&v
, &outpos
, &p
))
1873 for (i
= 0; i
< digits
; ++i
) {
1874 c
= (unsigned char) s
[i
];
1876 endinpos
= (s
+i
+1)-starts
;
1877 if (unicode_decode_call_errorhandler(
1878 errors
, &errorHandler
,
1879 "unicodeescape", message
,
1880 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1881 (PyObject
**)&v
, &outpos
, &p
))
1885 chr
= (chr
<<4) & ~0xF;
1886 if (c
>= '0' && c
<= '9')
1888 else if (c
>= 'a' && c
<= 'f')
1889 chr
+= 10 + c
- 'a';
1891 chr
+= 10 + c
- 'A';
1894 if (chr
== 0xffffffff && PyErr_Occurred())
1895 /* _decoding_error will have already written into the
1899 /* when we get here, chr is a 32-bit unicode character */
1901 /* UCS-2 character */
1902 *p
++ = (Py_UNICODE
) chr
;
1903 else if (chr
<= 0x10ffff) {
1904 /* UCS-4 character. Either store directly, or as
1906 #ifdef Py_UNICODE_WIDE
1910 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
1911 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
1914 endinpos
= s
-starts
;
1915 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1916 if (unicode_decode_call_errorhandler(
1917 errors
, &errorHandler
,
1918 "unicodeescape", "illegal Unicode character",
1919 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1920 (PyObject
**)&v
, &outpos
, &p
))
1927 message
= "malformed \\N character escape";
1928 if (ucnhash_CAPI
== NULL
) {
1929 /* load the unicode data module */
1931 m
= PyImport_ImportModule("unicodedata");
1934 api
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
1938 ucnhash_CAPI
= (_PyUnicode_Name_CAPI
*)PyCObject_AsVoidPtr(api
);
1940 if (ucnhash_CAPI
== NULL
)
1944 const char *start
= s
+1;
1945 /* look for the closing brace */
1946 while (*s
!= '}' && s
< end
)
1948 if (s
> start
&& s
< end
&& *s
== '}') {
1949 /* found a name. look it up in the unicode database */
1950 message
= "unknown Unicode character name";
1952 if (ucnhash_CAPI
->getcode(NULL
, start
, (int)(s
-start
-1), &chr
))
1956 endinpos
= s
-starts
;
1957 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1958 if (unicode_decode_call_errorhandler(
1959 errors
, &errorHandler
,
1960 "unicodeescape", message
,
1961 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1962 (PyObject
**)&v
, &outpos
, &p
))
1968 message
= "\\ at end of string";
1970 endinpos
= s
-starts
;
1971 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1972 if (unicode_decode_call_errorhandler(
1973 errors
, &errorHandler
,
1974 "unicodeescape", message
,
1975 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1976 (PyObject
**)&v
, &outpos
, &p
))
1981 *p
++ = (unsigned char)s
[-1];
1988 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
1990 Py_XDECREF(errorHandler
);
1992 return (PyObject
*)v
;
1997 "\\N escapes not supported (can't load unicodedata module)"
2000 Py_XDECREF(errorHandler
);
2006 Py_XDECREF(errorHandler
);
2011 /* Return a Unicode-Escape string version of the Unicode object.
2013 If quotes is true, the string is enclosed in u"" or u'' quotes as
2018 Py_LOCAL_INLINE(const Py_UNICODE
*) findchar(const Py_UNICODE
*s
,
2022 /* like wcschr, but doesn't stop at NUL characters */
2024 while (size
-- > 0) {
2034 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
2041 static const char *hexdigit
= "0123456789abcdef";
2043 repr
= PyString_FromStringAndSize(NULL
, 2 + 6*size
+ 1);
2047 p
= PyString_AS_STRING(repr
);
2051 *p
++ = (findchar(s
, size
, '\'') &&
2052 !findchar(s
, size
, '"')) ? '"' : '\'';
2054 while (size
-- > 0) {
2055 Py_UNICODE ch
= *s
++;
2057 /* Escape quotes and backslashes */
2059 ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1]) || ch
== '\\') {
2065 #ifdef Py_UNICODE_WIDE
2066 /* Map 21-bit characters to '\U00xxxxxx' */
2067 else if (ch
>= 0x10000) {
2068 Py_ssize_t offset
= p
- PyString_AS_STRING(repr
);
2070 /* Resize the string if necessary */
2071 if (offset
+ 12 > PyString_GET_SIZE(repr
)) {
2072 if (_PyString_Resize(&repr
, PyString_GET_SIZE(repr
) + 100))
2074 p
= PyString_AS_STRING(repr
) + offset
;
2079 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
2080 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
2081 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
2082 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
2083 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
2084 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
2085 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
2086 *p
++ = hexdigit
[ch
& 0x0000000F];
2090 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2091 else if (ch
>= 0xD800 && ch
< 0xDC00) {
2097 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
2098 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
2101 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
2102 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
2103 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
2104 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
2105 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
2106 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
2107 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
2108 *p
++ = hexdigit
[ucs
& 0x0000000F];
2111 /* Fall through: isolated surrogates are copied as-is */
2116 /* Map 16-bit characters to '\uxxxx' */
2120 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
2121 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
2122 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
2123 *p
++ = hexdigit
[ch
& 0x000F];
2126 /* Map special whitespace to '\t', '\n', '\r' */
2127 else if (ch
== '\t') {
2131 else if (ch
== '\n') {
2135 else if (ch
== '\r') {
2140 /* Map non-printable US ASCII to '\xhh' */
2141 else if (ch
< ' ' || ch
>= 0x7F) {
2144 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
2145 *p
++ = hexdigit
[ch
& 0x000F];
2148 /* Copy everything else as-is */
2153 *p
++ = PyString_AS_STRING(repr
)[1];
2156 _PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
));
2160 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
2163 return unicodeescape_string(s
, size
, 0);
2166 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
2168 if (!PyUnicode_Check(unicode
)) {
2169 PyErr_BadArgument();
2172 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
2173 PyUnicode_GET_SIZE(unicode
));
2176 /* --- Raw Unicode Escape Codec ------------------------------------------- */
2178 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
2182 const char *starts
= s
;
2183 Py_ssize_t startinpos
;
2184 Py_ssize_t endinpos
;
2190 PyObject
*errorHandler
= NULL
;
2191 PyObject
*exc
= NULL
;
2193 /* Escaped strings are never shorter than the resulting
2194 Unicode string, so we start with size here and then reduce the
2195 length after conversion to the true value. (But decoding error
2196 handler might have to resize the string) */
2197 v
= _PyUnicode_New(size
);
2201 return (PyObject
*)v
;
2202 p
= PyUnicode_AS_UNICODE(v
);
2210 /* Non-escape characters are interpreted as Unicode ordinals */
2212 *p
++ = (unsigned char)*s
++;
2215 startinpos
= s
-starts
;
2217 /* \u-escapes are interpreted iff the number of leading
2218 backslashes is odd */
2223 *p
++ = (unsigned char)*s
++;
2225 if (((s
- bs
) & 1) == 0 ||
2227 (*s
!= 'u' && *s
!= 'U')) {
2231 count
= *s
=='u' ? 4 : 8;
2234 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2235 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2236 for (x
= 0, i
= 0; i
< count
; ++i
, ++s
) {
2237 c
= (unsigned char)*s
;
2239 endinpos
= s
-starts
;
2240 if (unicode_decode_call_errorhandler(
2241 errors
, &errorHandler
,
2242 "rawunicodeescape", "truncated \\uXXXX",
2243 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2244 (PyObject
**)&v
, &outpos
, &p
))
2249 if (c
>= '0' && c
<= '9')
2251 else if (c
>= 'a' && c
<= 'f')
2256 #ifndef Py_UNICODE_WIDE
2258 if (unicode_decode_call_errorhandler(
2259 errors
, &errorHandler
,
2260 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2261 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2262 (PyObject
**)&v
, &outpos
, &p
))
2270 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2272 Py_XDECREF(errorHandler
);
2274 return (PyObject
*)v
;
2278 Py_XDECREF(errorHandler
);
2283 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
2290 static const char *hexdigit
= "0123456789abcdef";
2292 #ifdef Py_UNICODE_WIDE
2293 repr
= PyString_FromStringAndSize(NULL
, 10 * size
);
2295 repr
= PyString_FromStringAndSize(NULL
, 6 * size
);
2302 p
= q
= PyString_AS_STRING(repr
);
2303 while (size
-- > 0) {
2304 Py_UNICODE ch
= *s
++;
2305 #ifdef Py_UNICODE_WIDE
2306 /* Map 32-bit characters to '\Uxxxxxxxx' */
2307 if (ch
>= 0x10000) {
2310 *p
++ = hexdigit
[(ch
>> 28) & 0xf];
2311 *p
++ = hexdigit
[(ch
>> 24) & 0xf];
2312 *p
++ = hexdigit
[(ch
>> 20) & 0xf];
2313 *p
++ = hexdigit
[(ch
>> 16) & 0xf];
2314 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
2315 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
2316 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
2317 *p
++ = hexdigit
[ch
& 15];
2321 /* Map 16-bit characters to '\uxxxx' */
2325 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
2326 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
2327 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
2328 *p
++ = hexdigit
[ch
& 15];
2330 /* Copy everything else as-is */
2335 _PyString_Resize(&repr
, p
- q
);
2339 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
2341 if (!PyUnicode_Check(unicode
)) {
2342 PyErr_BadArgument();
2345 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
2346 PyUnicode_GET_SIZE(unicode
));
2349 /* --- Unicode Internal Codec ------------------------------------------- */
2351 PyObject
*_PyUnicode_DecodeUnicodeInternal(const char *s
,
2355 const char *starts
= s
;
2356 Py_ssize_t startinpos
;
2357 Py_ssize_t endinpos
;
2363 PyObject
*errorHandler
= NULL
;
2364 PyObject
*exc
= NULL
;
2366 #ifdef Py_UNICODE_WIDE
2367 Py_UNICODE unimax
= PyUnicode_GetMax();
2370 v
= _PyUnicode_New((size
+Py_UNICODE_SIZE
-1)/ Py_UNICODE_SIZE
);
2373 if (PyUnicode_GetSize((PyObject
*)v
) == 0)
2374 return (PyObject
*)v
;
2375 p
= PyUnicode_AS_UNICODE(v
);
2379 memcpy(p
, s
, sizeof(Py_UNICODE
));
2380 /* We have to sanity check the raw data, otherwise doom looms for
2381 some malformed UCS-4 data. */
2383 #ifdef Py_UNICODE_WIDE
2384 *p
> unimax
|| *p
< 0 ||
2386 end
-s
< Py_UNICODE_SIZE
2389 startinpos
= s
- starts
;
2390 if (end
-s
< Py_UNICODE_SIZE
) {
2391 endinpos
= end
-starts
;
2392 reason
= "truncated input";
2395 endinpos
= s
- starts
+ Py_UNICODE_SIZE
;
2396 reason
= "illegal code point (> 0x10FFFF)";
2398 outpos
= p
- PyUnicode_AS_UNICODE(v
);
2399 if (unicode_decode_call_errorhandler(
2400 errors
, &errorHandler
,
2401 "unicode_internal", reason
,
2402 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2403 (PyObject
**)&v
, &outpos
, &p
)) {
2409 s
+= Py_UNICODE_SIZE
;
2413 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2415 Py_XDECREF(errorHandler
);
2417 return (PyObject
*)v
;
2421 Py_XDECREF(errorHandler
);
2426 /* --- Latin-1 Codec ------------------------------------------------------ */
2428 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
2435 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2437 Py_UNICODE r
= *(unsigned char*)s
;
2438 return PyUnicode_FromUnicode(&r
, 1);
2441 v
= _PyUnicode_New(size
);
2445 return (PyObject
*)v
;
2446 p
= PyUnicode_AS_UNICODE(v
);
2448 *p
++ = (unsigned char)*s
++;
2449 return (PyObject
*)v
;
2456 /* create or adjust a UnicodeEncodeError */
2457 static void make_encode_exception(PyObject
**exceptionObject
,
2458 const char *encoding
,
2459 const Py_UNICODE
*unicode
, Py_ssize_t size
,
2460 Py_ssize_t startpos
, Py_ssize_t endpos
,
2463 if (*exceptionObject
== NULL
) {
2464 *exceptionObject
= PyUnicodeEncodeError_Create(
2465 encoding
, unicode
, size
, startpos
, endpos
, reason
);
2468 if (PyUnicodeEncodeError_SetStart(*exceptionObject
, startpos
))
2470 if (PyUnicodeEncodeError_SetEnd(*exceptionObject
, endpos
))
2472 if (PyUnicodeEncodeError_SetReason(*exceptionObject
, reason
))
2476 Py_DECREF(*exceptionObject
);
2477 *exceptionObject
= NULL
;
2481 /* raises a UnicodeEncodeError */
2482 static void raise_encode_exception(PyObject
**exceptionObject
,
2483 const char *encoding
,
2484 const Py_UNICODE
*unicode
, Py_ssize_t size
,
2485 Py_ssize_t startpos
, Py_ssize_t endpos
,
2488 make_encode_exception(exceptionObject
,
2489 encoding
, unicode
, size
, startpos
, endpos
, reason
);
2490 if (*exceptionObject
!= NULL
)
2491 PyCodec_StrictErrors(*exceptionObject
);
2494 /* error handling callback helper:
2495 build arguments, call the callback and check the arguments,
2496 put the result into newpos and return the replacement string, which
2497 has to be freed by the caller */
2498 static PyObject
*unicode_encode_call_errorhandler(const char *errors
,
2499 PyObject
**errorHandler
,
2500 const char *encoding
, const char *reason
,
2501 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
2502 Py_ssize_t startpos
, Py_ssize_t endpos
,
2505 static char *argparse
= "O!n;encoding error handler must return (unicode, int) tuple";
2508 PyObject
*resunicode
;
2510 if (*errorHandler
== NULL
) {
2511 *errorHandler
= PyCodec_LookupError(errors
);
2512 if (*errorHandler
== NULL
)
2516 make_encode_exception(exceptionObject
,
2517 encoding
, unicode
, size
, startpos
, endpos
, reason
);
2518 if (*exceptionObject
== NULL
)
2521 restuple
= PyObject_CallFunctionObjArgs(
2522 *errorHandler
, *exceptionObject
, NULL
);
2523 if (restuple
== NULL
)
2525 if (!PyTuple_Check(restuple
)) {
2526 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
2527 Py_DECREF(restuple
);
2530 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
2531 &resunicode
, newpos
)) {
2532 Py_DECREF(restuple
);
2536 *newpos
= size
+*newpos
;
2537 if (*newpos
<0 || *newpos
>size
) {
2538 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
2539 Py_DECREF(restuple
);
2542 Py_INCREF(resunicode
);
2543 Py_DECREF(restuple
);
2547 static PyObject
*unicode_encode_ucs1(const Py_UNICODE
*p
,
2554 /* pointers to the beginning and end+1 of input */
2555 const Py_UNICODE
*startp
= p
;
2556 const Py_UNICODE
*endp
= p
+ size
;
2557 /* pointer to the beginning of the unencodable characters */
2558 /* const Py_UNICODE *badp = NULL; */
2559 /* pointer into the output */
2561 /* current output position */
2562 Py_ssize_t respos
= 0;
2564 const char *encoding
= (limit
== 256) ? "latin-1" : "ascii";
2565 const char *reason
= (limit
== 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2566 PyObject
*errorHandler
= NULL
;
2567 PyObject
*exc
= NULL
;
2568 /* the following variable is used for caching string comparisons
2569 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2570 int known_errorHandler
= -1;
2572 /* allocate enough for a simple encoding without
2573 replacements, if we need more, we'll resize */
2574 res
= PyString_FromStringAndSize(NULL
, size
);
2579 str
= PyString_AS_STRING(res
);
2585 /* can we encode this? */
2587 /* no overflow check, because we know that the space is enough */
2592 Py_ssize_t unicodepos
= p
-startp
;
2593 Py_ssize_t requiredsize
;
2594 PyObject
*repunicode
;
2599 /* startpos for collecting unencodable chars */
2600 const Py_UNICODE
*collstart
= p
;
2601 const Py_UNICODE
*collend
= p
;
2602 /* find all unencodable characters */
2603 while ((collend
< endp
) && ((*collend
)>=limit
))
2605 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2606 if (known_errorHandler
==-1) {
2607 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
2608 known_errorHandler
= 1;
2609 else if (!strcmp(errors
, "replace"))
2610 known_errorHandler
= 2;
2611 else if (!strcmp(errors
, "ignore"))
2612 known_errorHandler
= 3;
2613 else if (!strcmp(errors
, "xmlcharrefreplace"))
2614 known_errorHandler
= 4;
2616 known_errorHandler
= 0;
2618 switch (known_errorHandler
) {
2619 case 1: /* strict */
2620 raise_encode_exception(&exc
, encoding
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
2622 case 2: /* replace */
2623 while (collstart
++<collend
)
2624 *str
++ = '?'; /* fall through */
2625 case 3: /* ignore */
2628 case 4: /* xmlcharrefreplace */
2629 respos
= str
-PyString_AS_STRING(res
);
2630 /* determine replacement size (temporarily (mis)uses p) */
2631 for (p
= collstart
, repsize
= 0; p
< collend
; ++p
) {
2640 #ifndef Py_UNICODE_WIDE
2646 else if (*p
<1000000)
2652 requiredsize
= respos
+repsize
+(endp
-collend
);
2653 if (requiredsize
> ressize
) {
2654 if (requiredsize
<2*ressize
)
2655 requiredsize
= 2*ressize
;
2656 if (_PyString_Resize(&res
, requiredsize
))
2658 str
= PyString_AS_STRING(res
) + respos
;
2659 ressize
= requiredsize
;
2661 /* generate replacement (temporarily (mis)uses p) */
2662 for (p
= collstart
; p
< collend
; ++p
) {
2663 str
+= sprintf(str
, "&#%d;", (int)*p
);
2668 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
2669 encoding
, reason
, startp
, size
, &exc
,
2670 collstart
-startp
, collend
-startp
, &newpos
);
2671 if (repunicode
== NULL
)
2673 /* need more space? (at least enough for what we
2674 have+the replacement+the rest of the string, so
2675 we won't have to check space for encodable characters) */
2676 respos
= str
-PyString_AS_STRING(res
);
2677 repsize
= PyUnicode_GET_SIZE(repunicode
);
2678 requiredsize
= respos
+repsize
+(endp
-collend
);
2679 if (requiredsize
> ressize
) {
2680 if (requiredsize
<2*ressize
)
2681 requiredsize
= 2*ressize
;
2682 if (_PyString_Resize(&res
, requiredsize
)) {
2683 Py_DECREF(repunicode
);
2686 str
= PyString_AS_STRING(res
) + respos
;
2687 ressize
= requiredsize
;
2689 /* check if there is anything unencodable in the replacement
2690 and copy it to the output */
2691 for (uni2
= PyUnicode_AS_UNICODE(repunicode
);repsize
-->0; ++uni2
, ++str
) {
2694 raise_encode_exception(&exc
, encoding
, startp
, size
,
2695 unicodepos
, unicodepos
+1, reason
);
2696 Py_DECREF(repunicode
);
2701 p
= startp
+ newpos
;
2702 Py_DECREF(repunicode
);
2706 /* Resize if we allocated too much */
2707 respos
= str
-PyString_AS_STRING(res
);
2709 /* If this fails, res will be NULL */
2710 _PyString_Resize(&res
, respos
);
2711 Py_XDECREF(errorHandler
);
2717 Py_XDECREF(errorHandler
);
2722 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
2726 return unicode_encode_ucs1(p
, size
, errors
, 256);
2729 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
2731 if (!PyUnicode_Check(unicode
)) {
2732 PyErr_BadArgument();
2735 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
2736 PyUnicode_GET_SIZE(unicode
),
2740 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2742 PyObject
*PyUnicode_DecodeASCII(const char *s
,
2746 const char *starts
= s
;
2749 Py_ssize_t startinpos
;
2750 Py_ssize_t endinpos
;
2753 PyObject
*errorHandler
= NULL
;
2754 PyObject
*exc
= NULL
;
2756 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2757 if (size
== 1 && *(unsigned char*)s
< 128) {
2758 Py_UNICODE r
= *(unsigned char*)s
;
2759 return PyUnicode_FromUnicode(&r
, 1);
2762 v
= _PyUnicode_New(size
);
2766 return (PyObject
*)v
;
2767 p
= PyUnicode_AS_UNICODE(v
);
2770 register unsigned char c
= (unsigned char)*s
;
2776 startinpos
= s
-starts
;
2777 endinpos
= startinpos
+ 1;
2778 outpos
= p
- (Py_UNICODE
*)PyUnicode_AS_UNICODE(v
);
2779 if (unicode_decode_call_errorhandler(
2780 errors
, &errorHandler
,
2781 "ascii", "ordinal not in range(128)",
2782 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2783 (PyObject
**)&v
, &outpos
, &p
))
2787 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
2788 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2790 Py_XDECREF(errorHandler
);
2792 return (PyObject
*)v
;
2796 Py_XDECREF(errorHandler
);
2801 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
2805 return unicode_encode_ucs1(p
, size
, errors
, 128);
2808 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
2810 if (!PyUnicode_Check(unicode
)) {
2811 PyErr_BadArgument();
2814 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
2815 PyUnicode_GET_SIZE(unicode
),
2819 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2821 /* --- MBCS codecs for Windows -------------------------------------------- */
2823 #if SIZEOF_INT < SIZEOF_SSIZE_T
2827 /* XXX This code is limited to "true" double-byte encodings, as
2828 a) it assumes an incomplete character consists of a single byte, and
2829 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
2830 encodings, see IsDBCSLeadByteEx documentation. */
2832 static int is_dbcs_lead_byte(const char *s
, int offset
)
2834 const char *curr
= s
+ offset
;
2836 if (IsDBCSLeadByte(*curr
)) {
2837 const char *prev
= CharPrev(s
, curr
);
2838 return (prev
== curr
) || !IsDBCSLeadByte(*prev
) || (curr
- prev
== 2);
2844 * Decode MBCS string into unicode object. If 'final' is set, converts
2845 * trailing lead-byte too. Returns the consumed size on success, -1 otherwise.
2847 static int decode_mbcs(PyUnicodeObject
**v
,
2848 const char *s
, /* MBCS string */
2849 int size
, /* sizeof MBCS string */
2858 /* Skip trailing lead-byte unless 'final' is set */
2859 if (!final
&& size
>= 1 && is_dbcs_lead_byte(s
, size
- 1))
2862 /* First get the size of the result */
2864 usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
2866 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2872 /* Create unicode object */
2873 *v
= _PyUnicode_New(usize
);
2878 /* Extend unicode object */
2879 n
= PyUnicode_GET_SIZE(*v
);
2880 if (_PyUnicode_Resize(v
, n
+ usize
) < 0)
2884 /* Do the conversion */
2886 p
= PyUnicode_AS_UNICODE(*v
) + n
;
2887 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
2888 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2896 PyObject
*PyUnicode_DecodeMBCSStateful(const char *s
,
2899 Py_ssize_t
*consumed
)
2901 PyUnicodeObject
*v
= NULL
;
2910 done
= decode_mbcs(&v
, s
, INT_MAX
, 0);
2913 done
= decode_mbcs(&v
, s
, (int)size
, !consumed
);
2924 if (size
> INT_MAX
) {
2931 return (PyObject
*)v
;
2934 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
2938 return PyUnicode_DecodeMBCSStateful(s
, size
, errors
, NULL
);
2942 * Convert unicode into string object (MBCS).
2943 * Returns 0 on success, -1 otherwise.
2945 static int encode_mbcs(PyObject
**repr
,
2946 const Py_UNICODE
*p
, /* unicode */
2947 int size
) /* size of unicode */
2954 /* First get the size of the result */
2956 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
2957 if (mbcssize
== 0) {
2958 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2963 if (*repr
== NULL
) {
2964 /* Create string object */
2965 *repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
2970 /* Extend string object */
2971 n
= PyString_Size(*repr
);
2972 if (_PyString_Resize(repr
, n
+ mbcssize
) < 0)
2976 /* Do the conversion */
2978 char *s
= PyString_AS_STRING(*repr
) + n
;
2979 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
2980 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2988 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
2992 PyObject
*repr
= NULL
;
2998 ret
= encode_mbcs(&repr
, p
, INT_MAX
);
3001 ret
= encode_mbcs(&repr
, p
, (int)size
);
3009 if (size
> INT_MAX
) {
3019 PyObject
*PyUnicode_AsMBCSString(PyObject
*unicode
)
3021 if (!PyUnicode_Check(unicode
)) {
3022 PyErr_BadArgument();
3025 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode
),
3026 PyUnicode_GET_SIZE(unicode
),
3032 #endif /* MS_WINDOWS */
3034 /* --- Character Mapping Codec -------------------------------------------- */
3036 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
3041 const char *starts
= s
;
3042 Py_ssize_t startinpos
;
3043 Py_ssize_t endinpos
;
3048 Py_ssize_t extrachars
= 0;
3049 PyObject
*errorHandler
= NULL
;
3050 PyObject
*exc
= NULL
;
3051 Py_UNICODE
*mapstring
= NULL
;
3052 Py_ssize_t maplen
= 0;
3054 /* Default to Latin-1 */
3055 if (mapping
== NULL
)
3056 return PyUnicode_DecodeLatin1(s
, size
, errors
);
3058 v
= _PyUnicode_New(size
);
3062 return (PyObject
*)v
;
3063 p
= PyUnicode_AS_UNICODE(v
);
3065 if (PyUnicode_CheckExact(mapping
)) {
3066 mapstring
= PyUnicode_AS_UNICODE(mapping
);
3067 maplen
= PyUnicode_GET_SIZE(mapping
);
3069 unsigned char ch
= *s
;
3070 Py_UNICODE x
= 0xfffe; /* illegal value */
3076 /* undefined mapping */
3077 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3078 startinpos
= s
-starts
;
3079 endinpos
= startinpos
+1;
3080 if (unicode_decode_call_errorhandler(
3081 errors
, &errorHandler
,
3082 "charmap", "character maps to <undefined>",
3083 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3084 (PyObject
**)&v
, &outpos
, &p
)) {
3095 unsigned char ch
= *s
;
3098 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3099 w
= PyInt_FromLong((long)ch
);
3102 x
= PyObject_GetItem(mapping
, w
);
3105 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
3106 /* No mapping found means: mapping is undefined. */
3115 if (PyInt_Check(x
)) {
3116 long value
= PyInt_AS_LONG(x
);
3117 if (value
< 0 || value
> 65535) {
3118 PyErr_SetString(PyExc_TypeError
,
3119 "character mapping must be in range(65536)");
3123 *p
++ = (Py_UNICODE
)value
;
3125 else if (x
== Py_None
) {
3126 /* undefined mapping */
3127 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3128 startinpos
= s
-starts
;
3129 endinpos
= startinpos
+1;
3130 if (unicode_decode_call_errorhandler(
3131 errors
, &errorHandler
,
3132 "charmap", "character maps to <undefined>",
3133 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3134 (PyObject
**)&v
, &outpos
, &p
)) {
3141 else if (PyUnicode_Check(x
)) {
3142 Py_ssize_t targetsize
= PyUnicode_GET_SIZE(x
);
3144 if (targetsize
== 1)
3146 *p
++ = *PyUnicode_AS_UNICODE(x
);
3148 else if (targetsize
> 1) {
3150 if (targetsize
> extrachars
) {
3152 Py_ssize_t oldpos
= p
- PyUnicode_AS_UNICODE(v
);
3153 Py_ssize_t needed
= (targetsize
- extrachars
) + \
3155 extrachars
+= needed
;
3156 if (_PyUnicode_Resize(&v
,
3157 PyUnicode_GET_SIZE(v
) + needed
) < 0) {
3161 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
3164 PyUnicode_AS_UNICODE(x
),
3167 extrachars
-= targetsize
;
3169 /* 1-0 mapping: skip the character */
3172 /* wrong return value */
3173 PyErr_SetString(PyExc_TypeError
,
3174 "character mapping must return integer, None or unicode");
3182 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
3183 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3185 Py_XDECREF(errorHandler
);
3187 return (PyObject
*)v
;
3190 Py_XDECREF(errorHandler
);
3196 /* Charmap encoding: the lookup table */
3198 struct encoding_map
{
3200 unsigned char level1
[32];
3202 unsigned char level23
[1];
3206 encoding_map_size(PyObject
*obj
, PyObject
* args
)
3208 struct encoding_map
*map
= (struct encoding_map
*)obj
;
3209 return PyInt_FromLong(sizeof(*map
) - 1 + 16*map
->count2
+
3213 static PyMethodDef encoding_map_methods
[] = {
3214 {"size", encoding_map_size
, METH_NOARGS
,
3215 PyDoc_STR("Return the size (in bytes) of this object") },
3220 encoding_map_dealloc(PyObject
* o
)
3225 static PyTypeObject EncodingMapType
= {
3226 PyObject_HEAD_INIT(NULL
)
3228 "EncodingMap", /*tp_name*/
3229 sizeof(struct encoding_map
), /*tp_basicsize*/
3232 encoding_map_dealloc
, /*tp_dealloc*/
3239 0, /*tp_as_sequence*/
3240 0, /*tp_as_mapping*/
3247 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
3251 0, /*tp_richcompare*/
3252 0, /*tp_weaklistoffset*/
3255 encoding_map_methods
, /*tp_methods*/
3262 0, /*tp_dictoffset*/
3271 PyUnicode_BuildEncodingMap(PyObject
* string
)
3275 struct encoding_map
*mresult
;
3278 unsigned char level1
[32];
3279 unsigned char level2
[512];
3280 unsigned char *mlevel1
, *mlevel2
, *mlevel3
;
3281 int count2
= 0, count3
= 0;
3283 if (!PyUnicode_Check(string
) || PyUnicode_GetSize(string
) != 256) {
3284 PyErr_BadArgument();
3287 decode
= PyUnicode_AS_UNICODE(string
);
3288 memset(level1
, 0xFF, sizeof level1
);
3289 memset(level2
, 0xFF, sizeof level2
);
3291 /* If there isn't a one-to-one mapping of NULL to \0,
3292 or if there are non-BMP characters, we need to use
3293 a mapping dictionary. */
3296 for (i
= 1; i
< 256; i
++) {
3299 #ifdef Py_UNICODE_WIDE
3300 || decode
[i
] > 0xFFFF
3306 if (decode
[i
] == 0xFFFE)
3307 /* unmapped character */
3309 l1
= decode
[i
] >> 11;
3310 l2
= decode
[i
] >> 7;
3311 if (level1
[l1
] == 0xFF)
3312 level1
[l1
] = count2
++;
3313 if (level2
[l2
] == 0xFF)
3314 level2
[l2
] = count3
++;
3317 if (count2
>= 0xFF || count3
>= 0xFF)
3321 PyObject
*result
= PyDict_New();
3322 PyObject
*key
, *value
;
3325 for (i
= 0; i
< 256; i
++) {
3327 key
= PyInt_FromLong(decode
[i
]);
3328 value
= PyInt_FromLong(i
);
3331 if (PyDict_SetItem(result
, key
, value
) == -1)
3344 /* Create a three-level trie */
3345 result
= PyObject_MALLOC(sizeof(struct encoding_map
) +
3346 16*count2
+ 128*count3
- 1);
3348 return PyErr_NoMemory();
3349 PyObject_Init(result
, &EncodingMapType
);
3350 mresult
= (struct encoding_map
*)result
;
3351 mresult
->count2
= count2
;
3352 mresult
->count3
= count3
;
3353 mlevel1
= mresult
->level1
;
3354 mlevel2
= mresult
->level23
;
3355 mlevel3
= mresult
->level23
+ 16*count2
;
3356 memcpy(mlevel1
, level1
, 32);
3357 memset(mlevel2
, 0xFF, 16*count2
);
3358 memset(mlevel3
, 0, 128*count3
);
3360 for (i
= 1; i
< 256; i
++) {
3361 int o1
, o2
, o3
, i2
, i3
;
3362 if (decode
[i
] == 0xFFFE)
3363 /* unmapped character */
3366 o2
= (decode
[i
]>>7) & 0xF;
3367 i2
= 16*mlevel1
[o1
] + o2
;
3368 if (mlevel2
[i2
] == 0xFF)
3369 mlevel2
[i2
] = count3
++;
3370 o3
= decode
[i
] & 0x7F;
3371 i3
= 128*mlevel2
[i2
] + o3
;
3378 encoding_map_lookup(Py_UNICODE c
, PyObject
*mapping
)
3380 struct encoding_map
*map
= (struct encoding_map
*)mapping
;
3382 int l2
= (c
>>7) & 0xF;
3386 #ifdef Py_UNICODE_WIDE
3394 i
= map
->level1
[l1
];
3399 i
= map
->level23
[16*i
+l2
];
3404 i
= map
->level23
[16*map
->count2
+ 128*i
+ l3
];
3411 /* Lookup the character ch in the mapping. If the character
3412 can't be found, Py_None is returned (or NULL, if another
3414 static PyObject
*charmapencode_lookup(Py_UNICODE c
, PyObject
*mapping
)
3416 PyObject
*w
= PyInt_FromLong((long)c
);
3421 x
= PyObject_GetItem(mapping
, w
);
3424 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
3425 /* No mapping found means: mapping is undefined. */
3433 else if (x
== Py_None
)
3435 else if (PyInt_Check(x
)) {
3436 long value
= PyInt_AS_LONG(x
);
3437 if (value
< 0 || value
> 255) {
3438 PyErr_SetString(PyExc_TypeError
,
3439 "character mapping must be in range(256)");
3445 else if (PyString_Check(x
))
3448 /* wrong return value */
3449 PyErr_SetString(PyExc_TypeError
,
3450 "character mapping must return integer, None or str");
3457 charmapencode_resize(PyObject
**outobj
, Py_ssize_t
*outpos
, Py_ssize_t requiredsize
)
3459 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
3460 /* exponentially overallocate to minimize reallocations */
3461 if (requiredsize
< 2*outsize
)
3462 requiredsize
= 2*outsize
;
3463 if (_PyString_Resize(outobj
, requiredsize
)) {
3469 typedef enum charmapencode_result
{
3470 enc_SUCCESS
, enc_FAILED
, enc_EXCEPTION
3471 }charmapencode_result
;
3472 /* Look up the character, put the result in the output string and adjust
3473 various state variables. Reallocate the output string if not enough
3474 space is available. Return enc_SUCCESS if the character was written,
3475 enc_FAILED if the mapping was undefined (in which case no character
3476 was written), or enc_EXCEPTION if the lookup or the reallocation
3477 raised an error. */
3479 charmapencode_result
charmapencode_output(Py_UNICODE c
, PyObject
*mapping
,
3480 PyObject
**outobj
, Py_ssize_t
*outpos
)
3484 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
3486 if (mapping
->ob_type
== &EncodingMapType
) {
3487 int res
= encoding_map_lookup(c
, mapping
);
3488 Py_ssize_t requiredsize
= *outpos
+1;
3491 if (outsize
<requiredsize
)
3492 if (!charmapencode_resize(outobj
, outpos
, requiredsize
))
3493 return enc_EXCEPTION
;
3494 outstart
= PyString_AS_STRING(*outobj
);
3495 outstart
[(*outpos
)++] = (char)res
;
3499 rep
= charmapencode_lookup(c
, mapping
);
3501 return enc_EXCEPTION
;
3502 else if (rep
==Py_None
) {
3506 if (PyInt_Check(rep
)) {
3507 Py_ssize_t requiredsize
= *outpos
+1;
3508 if (outsize
<requiredsize
)
3509 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
3511 return enc_EXCEPTION
;
3513 outstart
= PyString_AS_STRING(*outobj
);
3514 outstart
[(*outpos
)++] = (char)PyInt_AS_LONG(rep
);
3517 const char *repchars
= PyString_AS_STRING(rep
);
3518 Py_ssize_t repsize
= PyString_GET_SIZE(rep
);
3519 Py_ssize_t requiredsize
= *outpos
+repsize
;
3520 if (outsize
<requiredsize
)
3521 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
3523 return enc_EXCEPTION
;
3525 outstart
= PyString_AS_STRING(*outobj
);
3526 memcpy(outstart
+ *outpos
, repchars
, repsize
);
3534 /* handle an error in PyUnicode_EncodeCharmap
3535 Return 0 on success, -1 on error */
3537 int charmap_encoding_error(
3538 const Py_UNICODE
*p
, Py_ssize_t size
, Py_ssize_t
*inpos
, PyObject
*mapping
,
3539 PyObject
**exceptionObject
,
3540 int *known_errorHandler
, PyObject
**errorHandler
, const char *errors
,
3541 PyObject
**res
, Py_ssize_t
*respos
)
3543 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
3547 /* startpos for collecting unencodable chars */
3548 Py_ssize_t collstartpos
= *inpos
;
3549 Py_ssize_t collendpos
= *inpos
+1;
3551 char *encoding
= "charmap";
3552 char *reason
= "character maps to <undefined>";
3553 charmapencode_result x
;
3555 /* find all unencodable characters */
3556 while (collendpos
< size
) {
3558 if (mapping
->ob_type
== &EncodingMapType
) {
3559 int res
= encoding_map_lookup(p
[collendpos
], mapping
);
3566 rep
= charmapencode_lookup(p
[collendpos
], mapping
);
3569 else if (rep
!=Py_None
) {
3576 /* cache callback name lookup
3577 * (if not done yet, i.e. it's the first error) */
3578 if (*known_errorHandler
==-1) {
3579 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3580 *known_errorHandler
= 1;
3581 else if (!strcmp(errors
, "replace"))
3582 *known_errorHandler
= 2;
3583 else if (!strcmp(errors
, "ignore"))
3584 *known_errorHandler
= 3;
3585 else if (!strcmp(errors
, "xmlcharrefreplace"))
3586 *known_errorHandler
= 4;
3588 *known_errorHandler
= 0;
3590 switch (*known_errorHandler
) {
3591 case 1: /* strict */
3592 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
3594 case 2: /* replace */
3595 for (collpos
= collstartpos
; collpos
<collendpos
; ++collpos
) {
3596 x
= charmapencode_output('?', mapping
, res
, respos
);
3597 if (x
==enc_EXCEPTION
) {
3600 else if (x
==enc_FAILED
) {
3601 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
3606 case 3: /* ignore */
3607 *inpos
= collendpos
;
3609 case 4: /* xmlcharrefreplace */
3610 /* generate replacement: one "&#N;" reference per unencodable character */
3611 for (collpos
= collstartpos
; collpos
< collendpos
; ++collpos
) {
3612 char buffer
[2+29+1+1];
3614 sprintf(buffer
, "&#%d;", (int)p
[collpos
]);
3615 for (cp
= buffer
; *cp
; ++cp
) {
3616 x
= charmapencode_output(*cp
, mapping
, res
, respos
);
3617 if (x
==enc_EXCEPTION
)
3619 else if (x
==enc_FAILED
) {
3620 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
3625 *inpos
= collendpos
;
3628 repunicode
= unicode_encode_call_errorhandler(errors
, errorHandler
,
3629 encoding
, reason
, p
, size
, exceptionObject
,
3630 collstartpos
, collendpos
, &newpos
);
3631 if (repunicode
== NULL
)
3633 /* generate replacement */
3634 repsize
= PyUnicode_GET_SIZE(repunicode
);
3635 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
3636 x
= charmapencode_output(*uni2
, mapping
, res
, respos
);
3637 if (x
==enc_EXCEPTION
) {
3640 else if (x
==enc_FAILED
) {
3641 Py_DECREF(repunicode
);
3642 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
3647 Py_DECREF(repunicode
);
3652 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
3658 PyObject
*res
= NULL
;
3659 /* current input position */
3660 Py_ssize_t inpos
= 0;
3661 /* current output position */
3662 Py_ssize_t respos
= 0;
3663 PyObject
*errorHandler
= NULL
;
3664 PyObject
*exc
= NULL
;
3665 /* the following variable is used for caching string comparisons
3666 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3667 * 3=ignore, 4=xmlcharrefreplace */
3668 int known_errorHandler
= -1;
3670 /* Default to Latin-1 */
3671 if (mapping
== NULL
)
3672 return PyUnicode_EncodeLatin1(p
, size
, errors
);
3674 /* allocate enough for a simple encoding without
3675 replacements, if we need more, we'll resize */
3676 res
= PyString_FromStringAndSize(NULL
, size
);
3682 while (inpos
<size
) {
3683 /* try to encode it */
3684 charmapencode_result x
= charmapencode_output(p
[inpos
], mapping
, &res
, &respos
);
3685 if (x
==enc_EXCEPTION
) /* error */
3687 if (x
==enc_FAILED
) { /* unencodable character */
3688 if (charmap_encoding_error(p
, size
, &inpos
, mapping
,
3690 &known_errorHandler
, &errorHandler
, errors
,
3696 /* done with this character => adjust input position */
3700 /* Resize if we allocated too much */
3701 if (respos
<PyString_GET_SIZE(res
)) {
3702 if (_PyString_Resize(&res
, respos
))
3706 Py_XDECREF(errorHandler
);
3712 Py_XDECREF(errorHandler
);
3716 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
3719 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
3720 PyErr_BadArgument();
3723 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
3724 PyUnicode_GET_SIZE(unicode
),
3729 /* create or adjust a UnicodeTranslateError */
3730 static void make_translate_exception(PyObject
**exceptionObject
,
3731 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3732 Py_ssize_t startpos
, Py_ssize_t endpos
,
3735 if (*exceptionObject
== NULL
) {
3736 *exceptionObject
= PyUnicodeTranslateError_Create(
3737 unicode
, size
, startpos
, endpos
, reason
);
3740 if (PyUnicodeTranslateError_SetStart(*exceptionObject
, startpos
))
3742 if (PyUnicodeTranslateError_SetEnd(*exceptionObject
, endpos
))
3744 if (PyUnicodeTranslateError_SetReason(*exceptionObject
, reason
))
3748 Py_DECREF(*exceptionObject
);
3749 *exceptionObject
= NULL
;
3753 /* raises a UnicodeTranslateError */
3754 static void raise_translate_exception(PyObject
**exceptionObject
,
3755 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3756 Py_ssize_t startpos
, Py_ssize_t endpos
,
3759 make_translate_exception(exceptionObject
,
3760 unicode
, size
, startpos
, endpos
, reason
);
3761 if (*exceptionObject
!= NULL
)
3762 PyCodec_StrictErrors(*exceptionObject
);
3765 /* error handling callback helper:
3766 build arguments, call the callback and check the arguments,
3767 put the result into newpos and return the replacement string, which
3768 has to be freed by the caller */
3769 static PyObject
*unicode_translate_call_errorhandler(const char *errors
,
3770 PyObject
**errorHandler
,
3772 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
3773 Py_ssize_t startpos
, Py_ssize_t endpos
,
3776 static char *argparse
= "O!n;translating error handler must return (unicode, int) tuple";
3778 Py_ssize_t i_newpos
;
3780 PyObject
*resunicode
;
3782 if (*errorHandler
== NULL
) {
3783 *errorHandler
= PyCodec_LookupError(errors
);
3784 if (*errorHandler
== NULL
)
3788 make_translate_exception(exceptionObject
,
3789 unicode
, size
, startpos
, endpos
, reason
);
3790 if (*exceptionObject
== NULL
)
3793 restuple
= PyObject_CallFunctionObjArgs(
3794 *errorHandler
, *exceptionObject
, NULL
);
3795 if (restuple
== NULL
)
3797 if (!PyTuple_Check(restuple
)) {
3798 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
3799 Py_DECREF(restuple
);
3802 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
3803 &resunicode
, &i_newpos
)) {
3804 Py_DECREF(restuple
);
3808 *newpos
= size
+i_newpos
;
3811 if (*newpos
<0 || *newpos
>size
) {
3812 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
3813 Py_DECREF(restuple
);
3816 Py_INCREF(resunicode
);
3817 Py_DECREF(restuple
);
3821 /* Lookup the character ch in the mapping and put the result in result,
3822 which must be decrefed by the caller.
3823 Return 0 on success, -1 on error */
3825 int charmaptranslate_lookup(Py_UNICODE c
, PyObject
*mapping
, PyObject
**result
)
3827 PyObject
*w
= PyInt_FromLong((long)c
);
3832 x
= PyObject_GetItem(mapping
, w
);
3835 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
3836 /* No mapping found means: use 1:1 mapping. */
3843 else if (x
== Py_None
) {
3847 else if (PyInt_Check(x
)) {
3848 long value
= PyInt_AS_LONG(x
);
3849 long max
= PyUnicode_GetMax();
3850 if (value
< 0 || value
> max
) {
3851 PyErr_Format(PyExc_TypeError
,
3852 "character mapping must be in range(0x%lx)", max
+1);
3859 else if (PyUnicode_Check(x
)) {
3864 /* wrong return value */
3865 PyErr_SetString(PyExc_TypeError
,
3866 "character mapping must return integer, None or unicode");
3871 /* ensure that *outobj is at least requiredsize characters long,
3872 if not reallocate and adjust various state variables.
3873 Return 0 on success, -1 on error */
3875 int charmaptranslate_makespace(PyObject
**outobj
, Py_UNICODE
**outp
,
3876 Py_ssize_t requiredsize
)
3878 Py_ssize_t oldsize
= PyUnicode_GET_SIZE(*outobj
);
3879 if (requiredsize
> oldsize
) {
3880 /* remember old output position */
3881 Py_ssize_t outpos
= *outp
-PyUnicode_AS_UNICODE(*outobj
);
3882 /* exponentially overallocate to minimize reallocations */
3883 if (requiredsize
< 2 * oldsize
)
3884 requiredsize
= 2 * oldsize
;
3885 if (_PyUnicode_Resize(outobj
, requiredsize
) < 0)
3887 *outp
= PyUnicode_AS_UNICODE(*outobj
) + outpos
;
3891 /* lookup the character, put the result in the output string and adjust
3892 various state variables. Return a new reference to the object that
3893 was put in the output buffer in *result, or Py_None, if the mapping was
3894 undefined (in which case no character was written).
3895 The caller must decref result.
3896 Return 0 on success, -1 on error. */
3898 int charmaptranslate_output(const Py_UNICODE
*startinp
, const Py_UNICODE
*curinp
,
3899 Py_ssize_t insize
, PyObject
*mapping
, PyObject
**outobj
, Py_UNICODE
**outp
,
3902 if (charmaptranslate_lookup(*curinp
, mapping
, res
))
3905 /* not found => default to 1:1 mapping */
3906 *(*outp
)++ = *curinp
;
3908 else if (*res
==Py_None
)
3910 else if (PyInt_Check(*res
)) {
3911 /* no overflow check, because we know that the space is enough */
3912 *(*outp
)++ = (Py_UNICODE
)PyInt_AS_LONG(*res
);
3914 else if (PyUnicode_Check(*res
)) {
3915 Py_ssize_t repsize
= PyUnicode_GET_SIZE(*res
);
3917 /* no overflow check, because we know that the space is enough */
3918 *(*outp
)++ = *PyUnicode_AS_UNICODE(*res
);
3920 else if (repsize
!=0) {
3921 /* more than one character */
3922 Py_ssize_t requiredsize
= (*outp
-PyUnicode_AS_UNICODE(*outobj
)) +
3923 (insize
- (curinp
-startinp
)) +
3925 if (charmaptranslate_makespace(outobj
, outp
, requiredsize
))
3927 memcpy(*outp
, PyUnicode_AS_UNICODE(*res
), sizeof(Py_UNICODE
)*repsize
);
3936 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*p
,
3942 PyObject
*res
= NULL
;
3943 /* pointers to the beginning and end+1 of input */
3944 const Py_UNICODE
*startp
= p
;
3945 const Py_UNICODE
*endp
= p
+ size
;
3946 /* pointer into the output */
3948 /* current output position */
3949 Py_ssize_t respos
= 0;
3950 char *reason
= "character maps to <undefined>";
3951 PyObject
*errorHandler
= NULL
;
3952 PyObject
*exc
= NULL
;
3953 /* the following variable is used for caching string comparisons
3954 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3955 * 3=ignore, 4=xmlcharrefreplace */
3956 int known_errorHandler
= -1;
3958 if (mapping
== NULL
) {
3959 PyErr_BadArgument();
3963 /* allocate enough for a simple 1:1 translation without
3964 replacements, if we need more, we'll resize */
3965 res
= PyUnicode_FromUnicode(NULL
, size
);
3970 str
= PyUnicode_AS_UNICODE(res
);
3973 /* try to encode it */
3975 if (charmaptranslate_output(startp
, p
, size
, mapping
, &res
, &str
, &x
)) {
3980 if (x
!=Py_None
) /* it worked => adjust input pointer */
3982 else { /* untranslatable character */
3983 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
3987 /* startpos for collecting untranslatable chars */
3988 const Py_UNICODE
*collstart
= p
;
3989 const Py_UNICODE
*collend
= p
+1;
3990 const Py_UNICODE
*coll
;
3992 /* find all untranslatable characters */
3993 while (collend
< endp
) {
3994 if (charmaptranslate_lookup(*collend
, mapping
, &x
))
4001 /* cache callback name lookup
4002 * (if not done yet, i.e. it's the first error) */
4003 if (known_errorHandler
==-1) {
4004 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4005 known_errorHandler
= 1;
4006 else if (!strcmp(errors
, "replace"))
4007 known_errorHandler
= 2;
4008 else if (!strcmp(errors
, "ignore"))
4009 known_errorHandler
= 3;
4010 else if (!strcmp(errors
, "xmlcharrefreplace"))
4011 known_errorHandler
= 4;
4013 known_errorHandler
= 0;
4015 switch (known_errorHandler
) {
4016 case 1: /* strict */
4017 raise_translate_exception(&exc
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
4019 case 2: /* replace */
4020 /* No need to check for space, this is a 1:1 replacement */
4021 for (coll
= collstart
; coll
<collend
; ++coll
)
4024 case 3: /* ignore */
4027 case 4: /* xmlcharrefreplace */
4028 /* generate replacement (temporarily (mis)uses p) */
4029 for (p
= collstart
; p
< collend
; ++p
) {
4030 char buffer
[2+29+1+1];
4032 sprintf(buffer
, "&#%d;", (int)*p
);
4033 if (charmaptranslate_makespace(&res
, &str
,
4034 (str
-PyUnicode_AS_UNICODE(res
))+strlen(buffer
)+(endp
-collend
)))
4036 for (cp
= buffer
; *cp
; ++cp
)
4042 repunicode
= unicode_translate_call_errorhandler(errors
, &errorHandler
,
4043 reason
, startp
, size
, &exc
,
4044 collstart
-startp
, collend
-startp
, &newpos
);
4045 if (repunicode
== NULL
)
4047 /* generate replacement */
4048 repsize
= PyUnicode_GET_SIZE(repunicode
);
4049 if (charmaptranslate_makespace(&res
, &str
,
4050 (str
-PyUnicode_AS_UNICODE(res
))+repsize
+(endp
-collend
))) {
4051 Py_DECREF(repunicode
);
4054 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
)
4056 p
= startp
+ newpos
;
4057 Py_DECREF(repunicode
);
4061 /* Resize if we allocated too much */
4062 respos
= str
-PyUnicode_AS_UNICODE(res
);
4063 if (respos
<PyUnicode_GET_SIZE(res
)) {
4064 if (_PyUnicode_Resize(&res
, respos
) < 0)
4068 Py_XDECREF(errorHandler
);
4074 Py_XDECREF(errorHandler
);
4078 PyObject
*PyUnicode_Translate(PyObject
*str
,
4084 str
= PyUnicode_FromObject(str
);
4087 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
4088 PyUnicode_GET_SIZE(str
),
4099 /* --- Decimal Encoder ---------------------------------------------------- */
4101 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
4106 Py_UNICODE
*p
, *end
;
4107 PyObject
*errorHandler
= NULL
;
4108 PyObject
*exc
= NULL
;
4109 const char *encoding
= "decimal";
4110 const char *reason
= "invalid decimal Unicode string";
4111 /* the following variable is used for caching string comparisons
4112 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4113 int known_errorHandler
= -1;
4115 if (output
== NULL
) {
4116 PyErr_BadArgument();
4123 register Py_UNICODE ch
= *p
;
4125 PyObject
*repunicode
;
4129 Py_UNICODE
*collstart
;
4130 Py_UNICODE
*collend
;
4132 if (Py_UNICODE_ISSPACE(ch
)) {
4137 decimal
= Py_UNICODE_TODECIMAL(ch
);
4139 *output
++ = '0' + decimal
;
4143 if (0 < ch
&& ch
< 256) {
4144 *output
++ = (char)ch
;
4148 /* All other characters are considered unencodable */
4151 while (collend
< end
) {
4152 if ((0 < *collend
&& *collend
< 256) ||
4153 !Py_UNICODE_ISSPACE(*collend
) ||
4154 Py_UNICODE_TODECIMAL(*collend
))
4157 /* cache callback name lookup
4158 * (if not done yet, i.e. it's the first error) */
4159 if (known_errorHandler
==-1) {
4160 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4161 known_errorHandler
= 1;
4162 else if (!strcmp(errors
, "replace"))
4163 known_errorHandler
= 2;
4164 else if (!strcmp(errors
, "ignore"))
4165 known_errorHandler
= 3;
4166 else if (!strcmp(errors
, "xmlcharrefreplace"))
4167 known_errorHandler
= 4;
4169 known_errorHandler
= 0;
4171 switch (known_errorHandler
) {
4172 case 1: /* strict */
4173 raise_encode_exception(&exc
, encoding
, s
, length
, collstart
-s
, collend
-s
, reason
);
4175 case 2: /* replace */
4176 for (p
= collstart
; p
< collend
; ++p
)
4179 case 3: /* ignore */
4182 case 4: /* xmlcharrefreplace */
4183 /* generate replacement (temporarily (mis)uses p) */
4184 for (p
= collstart
; p
< collend
; ++p
)
4185 output
+= sprintf(output
, "&#%d;", (int)*p
);
4189 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
4190 encoding
, reason
, s
, length
, &exc
,
4191 collstart
-s
, collend
-s
, &newpos
);
4192 if (repunicode
== NULL
)
4194 /* generate replacement */
4195 repsize
= PyUnicode_GET_SIZE(repunicode
);
4196 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
4197 Py_UNICODE ch
= *uni2
;
4198 if (Py_UNICODE_ISSPACE(ch
))
4201 decimal
= Py_UNICODE_TODECIMAL(ch
);
4203 *output
++ = '0' + decimal
;
4204 else if (0 < ch
&& ch
< 256)
4205 *output
++ = (char)ch
;
4207 Py_DECREF(repunicode
);
4208 raise_encode_exception(&exc
, encoding
,
4209 s
, length
, collstart
-s
, collend
-s
, reason
);
4215 Py_DECREF(repunicode
);
4218 /* 0-terminate the output string */
4221 Py_XDECREF(errorHandler
);
4226 Py_XDECREF(errorHandler
);
4230 /* --- Helpers ------------------------------------------------------------ */
4232 #define STRINGLIB_CHAR Py_UNICODE
4234 #define STRINGLIB_LEN PyUnicode_GET_SIZE
4235 #define STRINGLIB_NEW PyUnicode_FromUnicode
4236 #define STRINGLIB_STR PyUnicode_AS_UNICODE
4238 Py_LOCAL_INLINE(int)
4239 STRINGLIB_CMP(const Py_UNICODE
* str
, const Py_UNICODE
* other
, Py_ssize_t len
)
4241 if (str
[0] != other
[0])
4243 return memcmp((void*) str
, (void*) other
, len
* sizeof(Py_UNICODE
));
4246 #define STRINGLIB_EMPTY unicode_empty
4248 #include "stringlib/fastsearch.h"
4250 #include "stringlib/count.h"
4251 #include "stringlib/find.h"
4252 #include "stringlib/partition.h"
4254 /* helper macro to fixup start/end slice values */
4255 #define FIX_START_END(obj) \
4257 start += (obj)->length; \
4260 if (end > (obj)->length) \
4261 end = (obj)->length; \
4263 end += (obj)->length; \
4267 Py_ssize_t
PyUnicode_Count(PyObject
*str
,
4273 PyUnicodeObject
* str_obj
;
4274 PyUnicodeObject
* sub_obj
;
4276 str_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(str
);
4279 sub_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(substr
);
4285 FIX_START_END(str_obj
);
4287 result
= stringlib_count(
4288 str_obj
->str
+ start
, end
- start
, sub_obj
->str
, sub_obj
->length
4297 Py_ssize_t
PyUnicode_Find(PyObject
*str
,
4305 str
= PyUnicode_FromObject(str
);
4308 sub
= PyUnicode_FromObject(sub
);
4315 result
= stringlib_find_slice(
4316 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
4317 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
4321 result
= stringlib_rfind_slice(
4322 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
4323 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
4334 int tailmatch(PyUnicodeObject
*self
,
4335 PyUnicodeObject
*substring
,
4340 if (substring
->length
== 0)
4343 FIX_START_END(self
);
4345 end
-= substring
->length
;
4349 if (direction
> 0) {
4350 if (Py_UNICODE_MATCH(self
, end
, substring
))
4353 if (Py_UNICODE_MATCH(self
, start
, substring
))
4360 Py_ssize_t
PyUnicode_Tailmatch(PyObject
*str
,
4368 str
= PyUnicode_FromObject(str
);
4371 substr
= PyUnicode_FromObject(substr
);
4372 if (substr
== NULL
) {
4377 result
= tailmatch((PyUnicodeObject
*)str
,
4378 (PyUnicodeObject
*)substr
,
4379 start
, end
, direction
);
4385 /* Apply fixfct filter to the Unicode object self and return a
4386 reference to the modified object */
4389 PyObject
*fixup(PyUnicodeObject
*self
,
4390 int (*fixfct
)(PyUnicodeObject
*s
))
4395 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
4399 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
4401 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
4402 /* fixfct should return TRUE if it modified the buffer. If
4403 FALSE, return a reference to the original buffer instead
4404 (to save space, not time) */
4407 return (PyObject
*) self
;
4409 return (PyObject
*) u
;
4413 int fixupper(PyUnicodeObject
*self
)
4415 Py_ssize_t len
= self
->length
;
4416 Py_UNICODE
*s
= self
->str
;
4420 register Py_UNICODE ch
;
4422 ch
= Py_UNICODE_TOUPPER(*s
);
4434 int fixlower(PyUnicodeObject
*self
)
4436 Py_ssize_t len
= self
->length
;
4437 Py_UNICODE
*s
= self
->str
;
4441 register Py_UNICODE ch
;
4443 ch
= Py_UNICODE_TOLOWER(*s
);
4455 int fixswapcase(PyUnicodeObject
*self
)
4457 Py_ssize_t len
= self
->length
;
4458 Py_UNICODE
*s
= self
->str
;
4462 if (Py_UNICODE_ISUPPER(*s
)) {
4463 *s
= Py_UNICODE_TOLOWER(*s
);
4465 } else if (Py_UNICODE_ISLOWER(*s
)) {
4466 *s
= Py_UNICODE_TOUPPER(*s
);
4476 int fixcapitalize(PyUnicodeObject
*self
)
4478 Py_ssize_t len
= self
->length
;
4479 Py_UNICODE
*s
= self
->str
;
4484 if (Py_UNICODE_ISLOWER(*s
)) {
4485 *s
= Py_UNICODE_TOUPPER(*s
);
4490 if (Py_UNICODE_ISUPPER(*s
)) {
4491 *s
= Py_UNICODE_TOLOWER(*s
);
4500 int fixtitle(PyUnicodeObject
*self
)
4502 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4503 register Py_UNICODE
*e
;
4504 int previous_is_cased
;
4506 /* Shortcut for single character strings */
4507 if (PyUnicode_GET_SIZE(self
) == 1) {
4508 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
4517 e
= p
+ PyUnicode_GET_SIZE(self
);
4518 previous_is_cased
= 0;
4519 for (; p
< e
; p
++) {
4520 register const Py_UNICODE ch
= *p
;
4522 if (previous_is_cased
)
4523 *p
= Py_UNICODE_TOLOWER(ch
);
4525 *p
= Py_UNICODE_TOTITLE(ch
);
4527 if (Py_UNICODE_ISLOWER(ch
) ||
4528 Py_UNICODE_ISUPPER(ch
) ||
4529 Py_UNICODE_ISTITLE(ch
))
4530 previous_is_cased
= 1;
4532 previous_is_cased
= 0;
4538 PyUnicode_Join(PyObject
*separator
, PyObject
*seq
)
4540 PyObject
*internal_separator
= NULL
;
4541 const Py_UNICODE blank
= ' ';
4542 const Py_UNICODE
*sep
= &blank
;
4543 Py_ssize_t seplen
= 1;
4544 PyUnicodeObject
*res
= NULL
; /* the result */
4545 Py_ssize_t res_alloc
= 100; /* # allocated bytes for string in res */
4546 Py_ssize_t res_used
; /* # used bytes */
4547 Py_UNICODE
*res_p
; /* pointer to free byte in res's string area */
4548 PyObject
*fseq
; /* PySequence_Fast(seq) */
4549 Py_ssize_t seqlen
; /* len(fseq) -- number of items in sequence */
4553 fseq
= PySequence_Fast(seq
, "");
4558 /* Grrrr. A codec may be invoked to convert str objects to
4559 * Unicode, and so it's possible to call back into Python code
4560 * during PyUnicode_FromObject(), and so it's possible for a sick
4561 * codec to change the size of fseq (if seq is a list). Therefore
4562 * we have to keep refetching the size -- can't assume seqlen
4565 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
4566 /* If empty sequence, return u"". */
4568 res
= _PyUnicode_New(0); /* empty sequence; return u"" */
4571 /* If singleton sequence with an exact Unicode, return that. */
4573 item
= PySequence_Fast_GET_ITEM(fseq
, 0);
4574 if (PyUnicode_CheckExact(item
)) {
4576 res
= (PyUnicodeObject
*)item
;
4581 /* At least two items to join, or one that isn't exact Unicode. */
4583 /* Set up sep and seplen -- they're needed. */
4584 if (separator
== NULL
) {
4589 internal_separator
= PyUnicode_FromObject(separator
);
4590 if (internal_separator
== NULL
)
4592 sep
= PyUnicode_AS_UNICODE(internal_separator
);
4593 seplen
= PyUnicode_GET_SIZE(internal_separator
);
4594 /* In case PyUnicode_FromObject() mutated seq. */
4595 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
4600 res
= _PyUnicode_New(res_alloc
);
4603 res_p
= PyUnicode_AS_UNICODE(res
);
4606 for (i
= 0; i
< seqlen
; ++i
) {
4608 Py_ssize_t new_res_used
;
4610 item
= PySequence_Fast_GET_ITEM(fseq
, i
);
4611 /* Convert item to Unicode. */
4612 if (! PyUnicode_Check(item
) && ! PyString_Check(item
)) {
4613 PyErr_Format(PyExc_TypeError
,
4614 "sequence item %zd: expected string or Unicode,"
4616 i
, item
->ob_type
->tp_name
);
4619 item
= PyUnicode_FromObject(item
);
4622 /* We own a reference to item from here on. */
4624 /* In case PyUnicode_FromObject() mutated seq. */
4625 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
4627 /* Make sure we have enough space for the separator and the item. */
4628 itemlen
= PyUnicode_GET_SIZE(item
);
4629 new_res_used
= res_used
+ itemlen
;
4630 if (new_res_used
< 0)
4632 if (i
< seqlen
- 1) {
4633 new_res_used
+= seplen
;
4634 if (new_res_used
< 0)
4637 if (new_res_used
> res_alloc
) {
4638 /* double allocated size until it's big enough */
4640 res_alloc
+= res_alloc
;
4643 } while (new_res_used
> res_alloc
);
4644 if (_PyUnicode_Resize(&res
, res_alloc
) < 0) {
4648 res_p
= PyUnicode_AS_UNICODE(res
) + res_used
;
4651 /* Copy item, and maybe the separator. */
4652 Py_UNICODE_COPY(res_p
, PyUnicode_AS_UNICODE(item
), itemlen
);
4654 if (i
< seqlen
- 1) {
4655 Py_UNICODE_COPY(res_p
, sep
, seplen
);
4659 res_used
= new_res_used
;
4662 /* Shrink res to match the used area; this probably can't fail,
4663 * but it's cheap to check.
4665 if (_PyUnicode_Resize(&res
, res_used
) < 0)
4669 Py_XDECREF(internal_separator
);
4671 return (PyObject
*)res
;
4674 PyErr_SetString(PyExc_OverflowError
,
4675 "join() result is too long for a Python string");
4680 Py_XDECREF(internal_separator
);
4687 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
4699 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
4704 u
= _PyUnicode_New(left
+ self
->length
+ right
);
4707 Py_UNICODE_FILL(u
->str
, fill
, left
);
4708 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
4710 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, the target `list`, and an `onError` label that is jumped to
   when either the slice creation or the append fails.  The temporary
   reference is always released.  Wrapped in do/while(0) so that the
   macro behaves as a single statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        int split_append_status_;                                       \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        split_append_status_ = PyList_Append(list, str);                \
        Py_DECREF(str);                                                 \
        if (split_append_status_)                                       \
            goto onError;                                               \
    } while (0)
4728 PyObject
*split_whitespace(PyUnicodeObject
*self
,
4730 Py_ssize_t maxcount
)
4732 register Py_ssize_t i
;
4733 register Py_ssize_t j
;
4734 Py_ssize_t len
= self
->length
;
4737 for (i
= j
= 0; i
< len
; ) {
4739 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
4742 while (i
< len
&& !Py_UNICODE_ISSPACE(self
->str
[i
]))
4745 if (maxcount
-- <= 0)
4747 SPLIT_APPEND(self
->str
, j
, i
);
4748 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
4754 SPLIT_APPEND(self
->str
, j
, len
);
4763 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
4766 register Py_ssize_t i
;
4767 register Py_ssize_t j
;
4773 string
= PyUnicode_FromObject(string
);
4776 data
= PyUnicode_AS_UNICODE(string
);
4777 len
= PyUnicode_GET_SIZE(string
);
4779 list
= PyList_New(0);
4783 for (i
= j
= 0; i
< len
; ) {
4786 /* Find a line and append it */
4787 while (i
< len
&& !BLOOM_LINEBREAK(data
[i
]))
4790 /* Skip the line break reading CRLF as one line break */
4793 if (data
[i
] == '\r' && i
+ 1 < len
&&
4801 SPLIT_APPEND(data
, j
, eol
);
4805 SPLIT_APPEND(data
, j
, len
);
4818 PyObject
*split_char(PyUnicodeObject
*self
,
4821 Py_ssize_t maxcount
)
4823 register Py_ssize_t i
;
4824 register Py_ssize_t j
;
4825 Py_ssize_t len
= self
->length
;
4828 for (i
= j
= 0; i
< len
; ) {
4829 if (self
->str
[i
] == ch
) {
4830 if (maxcount
-- <= 0)
4832 SPLIT_APPEND(self
->str
, j
, i
);
4838 SPLIT_APPEND(self
->str
, j
, len
);
4848 PyObject
*split_substring(PyUnicodeObject
*self
,
4850 PyUnicodeObject
*substring
,
4851 Py_ssize_t maxcount
)
4853 register Py_ssize_t i
;
4854 register Py_ssize_t j
;
4855 Py_ssize_t len
= self
->length
;
4856 Py_ssize_t sublen
= substring
->length
;
4859 for (i
= j
= 0; i
<= len
- sublen
; ) {
4860 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
4861 if (maxcount
-- <= 0)
4863 SPLIT_APPEND(self
->str
, j
, i
);
4869 SPLIT_APPEND(self
->str
, j
, len
);
4879 PyObject
*rsplit_whitespace(PyUnicodeObject
*self
,
4881 Py_ssize_t maxcount
)
4883 register Py_ssize_t i
;
4884 register Py_ssize_t j
;
4885 Py_ssize_t len
= self
->length
;
4888 for (i
= j
= len
- 1; i
>= 0; ) {
4890 while (i
>= 0 && Py_UNICODE_ISSPACE(self
->str
[i
]))
4893 while (i
>= 0 && !Py_UNICODE_ISSPACE(self
->str
[i
]))
4896 if (maxcount
-- <= 0)
4898 SPLIT_APPEND(self
->str
, i
+ 1, j
+ 1);
4899 while (i
>= 0 && Py_UNICODE_ISSPACE(self
->str
[i
]))
4905 SPLIT_APPEND(self
->str
, 0, j
+ 1);
4907 if (PyList_Reverse(list
) < 0)
4917 PyObject
*rsplit_char(PyUnicodeObject
*self
,
4920 Py_ssize_t maxcount
)
4922 register Py_ssize_t i
;
4923 register Py_ssize_t j
;
4924 Py_ssize_t len
= self
->length
;
4927 for (i
= j
= len
- 1; i
>= 0; ) {
4928 if (self
->str
[i
] == ch
) {
4929 if (maxcount
-- <= 0)
4931 SPLIT_APPEND(self
->str
, i
+ 1, j
+ 1);
4937 SPLIT_APPEND(self
->str
, 0, j
+ 1);
4939 if (PyList_Reverse(list
) < 0)
4949 PyObject
*rsplit_substring(PyUnicodeObject
*self
,
4951 PyUnicodeObject
*substring
,
4952 Py_ssize_t maxcount
)
4954 register Py_ssize_t i
;
4955 register Py_ssize_t j
;
4956 Py_ssize_t len
= self
->length
;
4957 Py_ssize_t sublen
= substring
->length
;
4960 for (i
= len
- sublen
, j
= len
; i
>= 0; ) {
4961 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
4962 if (maxcount
-- <= 0)
4964 SPLIT_APPEND(self
->str
, i
+ sublen
, j
);
4971 SPLIT_APPEND(self
->str
, 0, j
);
4973 if (PyList_Reverse(list
) < 0)
4985 PyObject
*split(PyUnicodeObject
*self
,
4986 PyUnicodeObject
*substring
,
4987 Py_ssize_t maxcount
)
4992 maxcount
= PY_SSIZE_T_MAX
;
4994 list
= PyList_New(0);
4998 if (substring
== NULL
)
4999 return split_whitespace(self
,list
,maxcount
);
5001 else if (substring
->length
== 1)
5002 return split_char(self
,list
,substring
->str
[0],maxcount
);
5004 else if (substring
->length
== 0) {
5006 PyErr_SetString(PyExc_ValueError
, "empty separator");
5010 return split_substring(self
,list
,substring
,maxcount
);
5014 PyObject
*rsplit(PyUnicodeObject
*self
,
5015 PyUnicodeObject
*substring
,
5016 Py_ssize_t maxcount
)
5021 maxcount
= PY_SSIZE_T_MAX
;
5023 list
= PyList_New(0);
5027 if (substring
== NULL
)
5028 return rsplit_whitespace(self
,list
,maxcount
);
5030 else if (substring
->length
== 1)
5031 return rsplit_char(self
,list
,substring
->str
[0],maxcount
);
5033 else if (substring
->length
== 0) {
5035 PyErr_SetString(PyExc_ValueError
, "empty separator");
5039 return rsplit_substring(self
,list
,substring
,maxcount
);
5043 PyObject
*replace(PyUnicodeObject
*self
,
5044 PyUnicodeObject
*str1
,
5045 PyUnicodeObject
*str2
,
5046 Py_ssize_t maxcount
)
5051 maxcount
= PY_SSIZE_T_MAX
;
5053 if (str1
->length
== str2
->length
) {
5056 if (str1
->length
== 1) {
5057 /* replace characters */
5059 if (!findchar(self
->str
, self
->length
, str1
->str
[0]))
5061 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5064 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5067 for (i
= 0; i
< u
->length
; i
++)
5068 if (u
->str
[i
] == u1
) {
5075 self
->str
, self
->length
, str1
->str
, str1
->length
, FAST_SEARCH
5079 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5082 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5083 while (i
<= self
->length
- str1
->length
)
5084 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
5087 Py_UNICODE_COPY(u
->str
+i
, str2
->str
, str2
->length
);
5094 Py_ssize_t n
, i
, j
, e
;
5095 Py_ssize_t product
, new_size
, delta
;
5098 /* replace strings */
5099 n
= stringlib_count(self
->str
, self
->length
, str1
->str
, str1
->length
);
5104 /* new_size = self->length + n * (str2->length - str1->length)); */
5105 delta
= (str2
->length
- str1
->length
);
5107 new_size
= self
->length
;
5109 product
= n
* (str2
->length
- str1
->length
);
5110 if ((product
/ (str2
->length
- str1
->length
)) != n
) {
5111 PyErr_SetString(PyExc_OverflowError
,
5112 "replace string is too long");
5115 new_size
= self
->length
+ product
;
5117 PyErr_SetString(PyExc_OverflowError
,
5118 "replace string is too long");
5122 u
= _PyUnicode_New(new_size
);
5127 e
= self
->length
- str1
->length
;
5128 if (str1
->length
> 0) {
5130 /* look for next match */
5133 if (Py_UNICODE_MATCH(self
, j
, str1
))
5140 /* copy unchanged part [i:j] */
5141 Py_UNICODE_COPY(p
, self
->str
+i
, j
-i
);
5144 /* copy substitution string */
5145 if (str2
->length
> 0) {
5146 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
5149 i
= j
+ str1
->length
;
5151 if (i
< self
->length
)
5152 /* copy tail [i:] */
5153 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
5157 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
5161 *p
++ = self
->str
[i
++];
5163 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
5166 return (PyObject
*) u
;
5169 /* nothing to replace; return original string (when possible) */
5170 if (PyUnicode_CheckExact(self
)) {
5172 return (PyObject
*) self
;
5174 return PyUnicode_FromUnicode(self
->str
, self
->length
);
5177 /* --- Unicode Object Methods --------------------------------------------- */
5179 PyDoc_STRVAR(title__doc__
,
5180 "S.title() -> unicode\n\
5182 Return a titlecased version of S, i.e. words start with title case\n\
5183 characters, all remaining cased characters have lower case.");
5186 unicode_title(PyUnicodeObject
*self
)
5188 return fixup(self
, fixtitle
);
5191 PyDoc_STRVAR(capitalize__doc__
,
5192 "S.capitalize() -> unicode\n\
5194 Return a capitalized version of S, i.e. make the first character\n\
5198 unicode_capitalize(PyUnicodeObject
*self
)
5200 return fixup(self
, fixcapitalize
);
5204 PyDoc_STRVAR(capwords__doc__
,
5205 "S.capwords() -> unicode\n\
5207 Apply .capitalize() to all words in S and return the result with\n\
5208 normalized whitespace (all whitespace strings are replaced by ' ').");
5211 unicode_capwords(PyUnicodeObject
*self
)
5217 /* Split into words */
5218 list
= split(self
, NULL
, -1);
5222 /* Capitalize each word */
5223 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
5224 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
5228 Py_DECREF(PyList_GET_ITEM(list
, i
));
5229 PyList_SET_ITEM(list
, i
, item
);
5232 /* Join the words to form a new string */
5233 item
= PyUnicode_Join(NULL
, list
);
5237 return (PyObject
*)item
;
5241 /* Argument converter. Coerces to a single unicode character */
5244 convert_uc(PyObject
*obj
, void *addr
)
5246 Py_UNICODE
*fillcharloc
= (Py_UNICODE
*)addr
;
5250 uniobj
= PyUnicode_FromObject(obj
);
5251 if (uniobj
== NULL
) {
5252 PyErr_SetString(PyExc_TypeError
,
5253 "The fill character cannot be converted to Unicode");
5256 if (PyUnicode_GET_SIZE(uniobj
) != 1) {
5257 PyErr_SetString(PyExc_TypeError
,
5258 "The fill character must be exactly one character long");
5262 unistr
= PyUnicode_AS_UNICODE(uniobj
);
5263 *fillcharloc
= unistr
[0];
5268 PyDoc_STRVAR(center__doc__
,
5269 "S.center(width[, fillchar]) -> unicode\n\
5271 Return S centered in a Unicode string of length width. Padding is\n\
5272 done using the specified fill character (default is a space)");
5275 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
5277 Py_ssize_t marg
, left
;
5279 Py_UNICODE fillchar
= ' ';
5281 if (!PyArg_ParseTuple(args
, "n|O&:center", &width
, convert_uc
, &fillchar
))
5284 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
5286 return (PyObject
*) self
;
5289 marg
= width
- self
->length
;
5290 left
= marg
/ 2 + (marg
& width
& 1);
5292 return (PyObject
*) pad(self
, left
, marg
- left
, fillchar
);
5297 /* This code should go into some future Unicode collation support
5298 module. The basic comparison should compare ordinals on a naive
5299 basis (this is what Java does and thus JPython too). */
5301 /* speedy UTF-16 code point order comparison */
5303 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5305 static short utf16Fixup
[32] =
5307 0, 0, 0, 0, 0, 0, 0, 0,
5308 0, 0, 0, 0, 0, 0, 0, 0,
5309 0, 0, 0, 0, 0, 0, 0, 0,
5310 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
5314 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
5316 Py_ssize_t len1
, len2
;
5318 Py_UNICODE
*s1
= str1
->str
;
5319 Py_UNICODE
*s2
= str2
->str
;
5321 len1
= str1
->length
;
5322 len2
= str2
->length
;
5324 while (len1
> 0 && len2
> 0) {
5330 if (c1
> (1<<11) * 26)
5331 c1
+= utf16Fixup
[c1
>>11];
5332 if (c2
> (1<<11) * 26)
5333 c2
+= utf16Fixup
[c2
>>11];
5334 /* now c1 and c2 are in UTF-32-compatible order */
5337 return (c1
< c2
) ? -1 : 1;
5342 return (len1
< len2
) ? -1 : (len1
!= len2
);
5348 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
5350 register Py_ssize_t len1
, len2
;
5352 Py_UNICODE
*s1
= str1
->str
;
5353 Py_UNICODE
*s2
= str2
->str
;
5355 len1
= str1
->length
;
5356 len2
= str2
->length
;
5358 while (len1
> 0 && len2
> 0) {
5365 return (c1
< c2
) ? -1 : 1;
5370 return (len1
< len2
) ? -1 : (len1
!= len2
);
5375 int PyUnicode_Compare(PyObject
*left
,
5378 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
5381 /* Coerce the two arguments */
5382 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
5385 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
5389 /* Shortcut for empty or interned objects */
5396 result
= unicode_compare(u
, v
);
5408 int PyUnicode_Contains(PyObject
*container
,
5411 PyObject
*str
, *sub
;
5414 /* Coerce the two arguments */
5415 sub
= PyUnicode_FromObject(element
);
5417 PyErr_SetString(PyExc_TypeError
,
5418 "'in <string>' requires string as left operand");
5422 str
= PyUnicode_FromObject(container
);
5428 result
= stringlib_contains_obj(str
, sub
);
5436 /* Concat to string or Unicode object giving a new Unicode object. */
5438 PyObject
*PyUnicode_Concat(PyObject
*left
,
5441 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
5443 /* Coerce the two arguments */
5444 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
5447 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
5452 if (v
== unicode_empty
) {
5454 return (PyObject
*)u
;
5456 if (u
== unicode_empty
) {
5458 return (PyObject
*)v
;
5461 /* Concat the two Unicode strings */
5462 w
= _PyUnicode_New(u
->length
+ v
->length
);
5465 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
5466 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
5470 return (PyObject
*)w
;
5478 PyDoc_STRVAR(count__doc__
,
5479 "S.count(sub[, start[, end]]) -> int\n\
5481 Return the number of non-overlapping occurrences of substring sub in\n\
5482 Unicode string S[start:end]. Optional arguments start and end are\n\
5483 interpreted as in slice notation.");
5486 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
5488 PyUnicodeObject
*substring
;
5489 Py_ssize_t start
= 0;
5490 Py_ssize_t end
= PY_SSIZE_T_MAX
;
5493 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
5494 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
5497 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
5498 (PyObject
*)substring
);
5499 if (substring
== NULL
)
5502 FIX_START_END(self
);
5504 result
= PyInt_FromSsize_t(
5505 stringlib_count(self
->str
+ start
, end
- start
,
5506 substring
->str
, substring
->length
)
5509 Py_DECREF(substring
);
5514 PyDoc_STRVAR(encode__doc__
,
5515 "S.encode([encoding[,errors]]) -> string or unicode\n\
5517 Encodes S using the codec registered for encoding. encoding defaults\n\
5518 to the default encoding. errors may be given to set a different error\n\
5519 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5520 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5521 'xmlcharrefreplace' as well as any other name registered with\n\
5522 codecs.register_error that can handle UnicodeEncodeErrors.");
5525 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
5527 char *encoding
= NULL
;
5528 char *errors
= NULL
;
5531 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
5533 v
= PyUnicode_AsEncodedObject((PyObject
*)self
, encoding
, errors
);
5536 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
5537 PyErr_Format(PyExc_TypeError
,
5538 "encoder did not return a string/unicode object "
5540 v
->ob_type
->tp_name
);
5550 PyDoc_STRVAR(decode__doc__
,
5551 "S.decode([encoding[,errors]]) -> string or unicode\n\
5553 Decodes S using the codec registered for encoding. encoding defaults\n\
5554 to the default encoding. errors may be given to set a different error\n\
5555 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5556 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5557 as well as any other name registerd with codecs.register_error that is\n\
5558 able to handle UnicodeDecodeErrors.");
5561 unicode_decode(PyUnicodeObject
*self
, PyObject
*args
)
5563 char *encoding
= NULL
;
5564 char *errors
= NULL
;
5567 if (!PyArg_ParseTuple(args
, "|ss:decode", &encoding
, &errors
))
5569 v
= PyUnicode_AsDecodedObject((PyObject
*)self
, encoding
, errors
);
5572 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
5573 PyErr_Format(PyExc_TypeError
,
5574 "decoder did not return a string/unicode object "
5576 v
->ob_type
->tp_name
);
5586 PyDoc_STRVAR(expandtabs__doc__
,
5587 "S.expandtabs([tabsize]) -> unicode\n\
5589 Return a copy of S where all tab characters are expanded using spaces.\n\
5590 If tabsize is not given, a tab size of 8 characters is assumed.");
5593 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
5602 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
5605 /* First pass: determine size of output string */
5607 e
= self
->str
+ self
->length
;
5608 for (p
= self
->str
; p
< e
; p
++)
5611 j
+= tabsize
- (j
% tabsize
);
5615 if (*p
== '\n' || *p
== '\r') {
5621 /* Second pass: create output string and fill it */
5622 u
= _PyUnicode_New(i
+ j
);
5629 for (p
= self
->str
; p
< e
; p
++)
5632 i
= tabsize
- (j
% tabsize
);
5641 if (*p
== '\n' || *p
== '\r')
5645 return (PyObject
*) u
;
5648 PyDoc_STRVAR(find__doc__
,
5649 "S.find(sub [,start [,end]]) -> int\n\
5651 Return the lowest index in S where substring sub is found,\n\
5652 such that sub is contained within s[start,end]. Optional\n\
5653 arguments start and end are interpreted as in slice notation.\n\
5655 Return -1 on failure.");
5658 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
5660 PyObject
*substring
;
5661 Py_ssize_t start
= 0;
5662 Py_ssize_t end
= PY_SSIZE_T_MAX
;
5665 if (!PyArg_ParseTuple(args
, "O|O&O&:find", &substring
,
5666 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
5668 substring
= PyUnicode_FromObject(substring
);
5672 result
= stringlib_find_slice(
5673 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
5674 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
5678 Py_DECREF(substring
);
5680 return PyInt_FromSsize_t(result
);
5684 unicode_getitem(PyUnicodeObject
*self
, Py_ssize_t index
)
5686 if (index
< 0 || index
>= self
->length
) {
5687 PyErr_SetString(PyExc_IndexError
, "string index out of range");
5691 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
5695 unicode_hash(PyUnicodeObject
*self
)
5697 /* Since Unicode objects compare equal to their ASCII string
5698 counterparts, they should use the individual character values
5699 as basis for their hash value. This is needed to assure that
5700 strings and Unicode objects behave in the same way as
5703 register Py_ssize_t len
;
5704 register Py_UNICODE
*p
;
5707 if (self
->hash
!= -1)
5709 len
= PyUnicode_GET_SIZE(self
);
5710 p
= PyUnicode_AS_UNICODE(self
);
5713 x
= (1000003*x
) ^ *p
++;
5714 x
^= PyUnicode_GET_SIZE(self
);
5721 PyDoc_STRVAR(index__doc__
,
5722 "S.index(sub [,start [,end]]) -> int\n\
5724 Like S.find() but raise ValueError when the substring is not found.");
5727 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
5730 PyObject
*substring
;
5731 Py_ssize_t start
= 0;
5732 Py_ssize_t end
= PY_SSIZE_T_MAX
;
5734 if (!PyArg_ParseTuple(args
, "O|O&O&:index", &substring
,
5735 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
5737 substring
= PyUnicode_FromObject(substring
);
5741 result
= stringlib_find_slice(
5742 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
5743 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
5747 Py_DECREF(substring
);
5750 PyErr_SetString(PyExc_ValueError
, "substring not found");
5754 return PyInt_FromSsize_t(result
);
5757 PyDoc_STRVAR(islower__doc__
,
5758 "S.islower() -> bool\n\
5760 Return True if all cased characters in S are lowercase and there is\n\
5761 at least one cased character in S, False otherwise.");
5764 unicode_islower(PyUnicodeObject
*self
)
5766 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5767 register const Py_UNICODE
*e
;
5770 /* Shortcut for single character strings */
5771 if (PyUnicode_GET_SIZE(self
) == 1)
5772 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p
));
5774 /* Special case for empty strings */
5775 if (PyUnicode_GET_SIZE(self
) == 0)
5776 return PyBool_FromLong(0);
5778 e
= p
+ PyUnicode_GET_SIZE(self
);
5780 for (; p
< e
; p
++) {
5781 register const Py_UNICODE ch
= *p
;
5783 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
5784 return PyBool_FromLong(0);
5785 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
5788 return PyBool_FromLong(cased
);
5791 PyDoc_STRVAR(isupper__doc__
,
5792 "S.isupper() -> bool\n\
5794 Return True if all cased characters in S are uppercase and there is\n\
5795 at least one cased character in S, False otherwise.");
5798 unicode_isupper(PyUnicodeObject
*self
)
5800 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5801 register const Py_UNICODE
*e
;
5804 /* Shortcut for single character strings */
5805 if (PyUnicode_GET_SIZE(self
) == 1)
5806 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
5808 /* Special case for empty strings */
5809 if (PyUnicode_GET_SIZE(self
) == 0)
5810 return PyBool_FromLong(0);
5812 e
= p
+ PyUnicode_GET_SIZE(self
);
5814 for (; p
< e
; p
++) {
5815 register const Py_UNICODE ch
= *p
;
5817 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
5818 return PyBool_FromLong(0);
5819 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
5822 return PyBool_FromLong(cased
);
5825 PyDoc_STRVAR(istitle__doc__
,
5826 "S.istitle() -> bool\n\
5828 Return True if S is a titlecased string and there is at least one\n\
5829 character in S, i.e. upper- and titlecase characters may only\n\
5830 follow uncased characters and lowercase characters only cased ones.\n\
5831 Return False otherwise.");
5834 unicode_istitle(PyUnicodeObject
*self
)
5836 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5837 register const Py_UNICODE
*e
;
5838 int cased
, previous_is_cased
;
5840 /* Shortcut for single character strings */
5841 if (PyUnicode_GET_SIZE(self
) == 1)
5842 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
5843 (Py_UNICODE_ISUPPER(*p
) != 0));
5845 /* Special case for empty strings */
5846 if (PyUnicode_GET_SIZE(self
) == 0)
5847 return PyBool_FromLong(0);
5849 e
= p
+ PyUnicode_GET_SIZE(self
);
5851 previous_is_cased
= 0;
5852 for (; p
< e
; p
++) {
5853 register const Py_UNICODE ch
= *p
;
5855 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
5856 if (previous_is_cased
)
5857 return PyBool_FromLong(0);
5858 previous_is_cased
= 1;
5861 else if (Py_UNICODE_ISLOWER(ch
)) {
5862 if (!previous_is_cased
)
5863 return PyBool_FromLong(0);
5864 previous_is_cased
= 1;
5868 previous_is_cased
= 0;
5870 return PyBool_FromLong(cased
);
5873 PyDoc_STRVAR(isspace__doc__
,
5874 "S.isspace() -> bool\n\
5876 Return True if all characters in S are whitespace\n\
5877 and there is at least one character in S, False otherwise.");
5880 unicode_isspace(PyUnicodeObject
*self
)
5882 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5883 register const Py_UNICODE
*e
;
5885 /* Shortcut for single character strings */
5886 if (PyUnicode_GET_SIZE(self
) == 1 &&
5887 Py_UNICODE_ISSPACE(*p
))
5888 return PyBool_FromLong(1);
5890 /* Special case for empty strings */
5891 if (PyUnicode_GET_SIZE(self
) == 0)
5892 return PyBool_FromLong(0);
5894 e
= p
+ PyUnicode_GET_SIZE(self
);
5895 for (; p
< e
; p
++) {
5896 if (!Py_UNICODE_ISSPACE(*p
))
5897 return PyBool_FromLong(0);
5899 return PyBool_FromLong(1);
5902 PyDoc_STRVAR(isalpha__doc__
,
5903 "S.isalpha() -> bool\n\
5905 Return True if all characters in S are alphabetic\n\
5906 and there is at least one character in S, False otherwise.");
5909 unicode_isalpha(PyUnicodeObject
*self
)
5911 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5912 register const Py_UNICODE
*e
;
5914 /* Shortcut for single character strings */
5915 if (PyUnicode_GET_SIZE(self
) == 1 &&
5916 Py_UNICODE_ISALPHA(*p
))
5917 return PyBool_FromLong(1);
5919 /* Special case for empty strings */
5920 if (PyUnicode_GET_SIZE(self
) == 0)
5921 return PyBool_FromLong(0);
5923 e
= p
+ PyUnicode_GET_SIZE(self
);
5924 for (; p
< e
; p
++) {
5925 if (!Py_UNICODE_ISALPHA(*p
))
5926 return PyBool_FromLong(0);
5928 return PyBool_FromLong(1);
5931 PyDoc_STRVAR(isalnum__doc__
,
5932 "S.isalnum() -> bool\n\
5934 Return True if all characters in S are alphanumeric\n\
5935 and there is at least one character in S, False otherwise.");
5938 unicode_isalnum(PyUnicodeObject
*self
)
5940 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5941 register const Py_UNICODE
*e
;
5943 /* Shortcut for single character strings */
5944 if (PyUnicode_GET_SIZE(self
) == 1 &&
5945 Py_UNICODE_ISALNUM(*p
))
5946 return PyBool_FromLong(1);
5948 /* Special case for empty strings */
5949 if (PyUnicode_GET_SIZE(self
) == 0)
5950 return PyBool_FromLong(0);
5952 e
= p
+ PyUnicode_GET_SIZE(self
);
5953 for (; p
< e
; p
++) {
5954 if (!Py_UNICODE_ISALNUM(*p
))
5955 return PyBool_FromLong(0);
5957 return PyBool_FromLong(1);
5960 PyDoc_STRVAR(isdecimal__doc__
,
5961 "S.isdecimal() -> bool\n\
5963 Return True if there are only decimal characters in S,\n\
5967 unicode_isdecimal(PyUnicodeObject
*self
)
5969 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5970 register const Py_UNICODE
*e
;
5972 /* Shortcut for single character strings */
5973 if (PyUnicode_GET_SIZE(self
) == 1 &&
5974 Py_UNICODE_ISDECIMAL(*p
))
5975 return PyBool_FromLong(1);
5977 /* Special case for empty strings */
5978 if (PyUnicode_GET_SIZE(self
) == 0)
5979 return PyBool_FromLong(0);
5981 e
= p
+ PyUnicode_GET_SIZE(self
);
5982 for (; p
< e
; p
++) {
5983 if (!Py_UNICODE_ISDECIMAL(*p
))
5984 return PyBool_FromLong(0);
5986 return PyBool_FromLong(1);
5989 PyDoc_STRVAR(isdigit__doc__
,
5990 "S.isdigit() -> bool\n\
5992 Return True if all characters in S are digits\n\
5993 and there is at least one character in S, False otherwise.");
5996 unicode_isdigit(PyUnicodeObject
*self
)
5998 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5999 register const Py_UNICODE
*e
;
6001 /* Shortcut for single character strings */
6002 if (PyUnicode_GET_SIZE(self
) == 1 &&
6003 Py_UNICODE_ISDIGIT(*p
))
6004 return PyBool_FromLong(1);
6006 /* Special case for empty strings */
6007 if (PyUnicode_GET_SIZE(self
) == 0)
6008 return PyBool_FromLong(0);
6010 e
= p
+ PyUnicode_GET_SIZE(self
);
6011 for (; p
< e
; p
++) {
6012 if (!Py_UNICODE_ISDIGIT(*p
))
6013 return PyBool_FromLong(0);
6015 return PyBool_FromLong(1);
6018 PyDoc_STRVAR(isnumeric__doc__
,
6019 "S.isnumeric() -> bool\n\
6021 Return True if there are only numeric characters in S,\n\
6025 unicode_isnumeric(PyUnicodeObject
*self
)
6027 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6028 register const Py_UNICODE
*e
;
6030 /* Shortcut for single character strings */
6031 if (PyUnicode_GET_SIZE(self
) == 1 &&
6032 Py_UNICODE_ISNUMERIC(*p
))
6033 return PyBool_FromLong(1);
6035 /* Special case for empty strings */
6036 if (PyUnicode_GET_SIZE(self
) == 0)
6037 return PyBool_FromLong(0);
6039 e
= p
+ PyUnicode_GET_SIZE(self
);
6040 for (; p
< e
; p
++) {
6041 if (!Py_UNICODE_ISNUMERIC(*p
))
6042 return PyBool_FromLong(0);
6044 return PyBool_FromLong(1);
6047 PyDoc_STRVAR(join__doc__
,
6048 "S.join(sequence) -> unicode\n\
6050 Return a string which is the concatenation of the strings in the\n\
6051 sequence. The separator between elements is S.");
6054 unicode_join(PyObject
*self
, PyObject
*data
)
6056 return PyUnicode_Join(self
, data
);
6060 unicode_length(PyUnicodeObject
*self
)
6062 return self
->length
;
6065 PyDoc_STRVAR(ljust__doc__
,
6066 "S.ljust(width[, fillchar]) -> int\n\
6068 Return S left justified in a Unicode string of length width. Padding is\n\
6069 done using the specified fill character (default is a space).");
6072 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
6075 Py_UNICODE fillchar
= ' ';
6077 if (!PyArg_ParseTuple(args
, "n|O&:ljust", &width
, convert_uc
, &fillchar
))
6080 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6082 return (PyObject
*) self
;
6085 return (PyObject
*) pad(self
, 0, width
- self
->length
, fillchar
);
6088 PyDoc_STRVAR(lower__doc__
,
6089 "S.lower() -> unicode\n\
6091 Return a copy of the string S converted to lowercase.");
6094 unicode_lower(PyUnicodeObject
*self
)
6096 return fixup(self
, fixlower
);
/* Selector values for the strip family.  They double as indices into
   stripformat[] below, so the order must match that array. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for error messages: the format string with its leading
   "|O:" skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6108 /* externally visible for str.strip(unicode) */
6110 _PyUnicode_XStrip(PyUnicodeObject
*self
, int striptype
, PyObject
*sepobj
)
6112 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
6113 Py_ssize_t len
= PyUnicode_GET_SIZE(self
);
6114 Py_UNICODE
*sep
= PyUnicode_AS_UNICODE(sepobj
);
6115 Py_ssize_t seplen
= PyUnicode_GET_SIZE(sepobj
);
6118 BLOOM_MASK sepmask
= make_bloom_mask(sep
, seplen
);
6121 if (striptype
!= RIGHTSTRIP
) {
6122 while (i
< len
&& BLOOM_MEMBER(sepmask
, s
[i
], sep
, seplen
)) {
6128 if (striptype
!= LEFTSTRIP
) {
6131 } while (j
>= i
&& BLOOM_MEMBER(sepmask
, s
[j
], sep
, seplen
));
6135 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
6137 return (PyObject
*)self
;
6140 return PyUnicode_FromUnicode(s
+i
, j
-i
);
6145 do_strip(PyUnicodeObject
*self
, int striptype
)
6147 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
6148 Py_ssize_t len
= PyUnicode_GET_SIZE(self
), i
, j
;
6151 if (striptype
!= RIGHTSTRIP
) {
6152 while (i
< len
&& Py_UNICODE_ISSPACE(s
[i
])) {
6158 if (striptype
!= LEFTSTRIP
) {
6161 } while (j
>= i
&& Py_UNICODE_ISSPACE(s
[j
]));
6165 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
6167 return (PyObject
*)self
;
6170 return PyUnicode_FromUnicode(s
+i
, j
-i
);
6175 do_argstrip(PyUnicodeObject
*self
, int striptype
, PyObject
*args
)
6177 PyObject
*sep
= NULL
;
6179 if (!PyArg_ParseTuple(args
, (char *)stripformat
[striptype
], &sep
))
6182 if (sep
!= NULL
&& sep
!= Py_None
) {
6183 if (PyUnicode_Check(sep
))
6184 return _PyUnicode_XStrip(self
, striptype
, sep
);
6185 else if (PyString_Check(sep
)) {
6187 sep
= PyUnicode_FromObject(sep
);
6190 res
= _PyUnicode_XStrip(self
, striptype
, sep
);
6195 PyErr_Format(PyExc_TypeError
,
6196 "%s arg must be None, unicode or str",
6197 STRIPNAME(striptype
));
6202 return do_strip(self
, striptype
);
6206 PyDoc_STRVAR(strip__doc__
,
6207 "S.strip([chars]) -> unicode\n\
6209 Return a copy of the string S with leading and trailing\n\
6210 whitespace removed.\n\
6211 If chars is given and not None, remove characters in chars instead.\n\
6212 If chars is a str, it will be converted to unicode before stripping");
6215 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
6217 if (PyTuple_GET_SIZE(args
) == 0)
6218 return do_strip(self
, BOTHSTRIP
); /* Common case */
6220 return do_argstrip(self
, BOTHSTRIP
, args
);
6224 PyDoc_STRVAR(lstrip__doc__
,
6225 "S.lstrip([chars]) -> unicode\n\
6227 Return a copy of the string S with leading whitespace removed.\n\
6228 If chars is given and not None, remove characters in chars instead.\n\
6229 If chars is a str, it will be converted to unicode before stripping");
6232 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
6234 if (PyTuple_GET_SIZE(args
) == 0)
6235 return do_strip(self
, LEFTSTRIP
); /* Common case */
6237 return do_argstrip(self
, LEFTSTRIP
, args
);
6241 PyDoc_STRVAR(rstrip__doc__
,
6242 "S.rstrip([chars]) -> unicode\n\
6244 Return a copy of the string S with trailing whitespace removed.\n\
6245 If chars is given and not None, remove characters in chars instead.\n\
6246 If chars is a str, it will be converted to unicode before stripping");
6249 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
6251 if (PyTuple_GET_SIZE(args
) == 0)
6252 return do_strip(self
, RIGHTSTRIP
); /* Common case */
6254 return do_argstrip(self
, RIGHTSTRIP
, args
);
6259 unicode_repeat(PyUnicodeObject
*str
, Py_ssize_t len
)
6269 if (len
== 1 && PyUnicode_CheckExact(str
)) {
6270 /* no repeat, return original string */
6272 return (PyObject
*) str
;
6275 /* ensure # of chars needed doesn't overflow int and # of bytes
6276 * needed doesn't overflow size_t
6278 nchars
= len
* str
->length
;
6279 if (len
&& nchars
/ len
!= str
->length
) {
6280 PyErr_SetString(PyExc_OverflowError
,
6281 "repeated string is too long");
6284 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
6285 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
6286 PyErr_SetString(PyExc_OverflowError
,
6287 "repeated string is too long");
6290 u
= _PyUnicode_New(nchars
);
6296 if (str
->length
== 1 && len
> 0) {
6297 Py_UNICODE_FILL(p
, str
->str
[0], len
);
6299 Py_ssize_t done
= 0; /* number of characters copied this far */
6300 if (done
< nchars
) {
6301 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
6304 while (done
< nchars
) {
6305 int n
= (done
<= nchars
-done
) ? done
: nchars
-done
;
6306 Py_UNICODE_COPY(p
+done
, p
, n
);
6311 return (PyObject
*) u
;
6314 PyObject
*PyUnicode_Replace(PyObject
*obj
,
6317 Py_ssize_t maxcount
)
6324 self
= PyUnicode_FromObject(obj
);
6327 str1
= PyUnicode_FromObject(subobj
);
6332 str2
= PyUnicode_FromObject(replobj
);
6338 result
= replace((PyUnicodeObject
*)self
,
6339 (PyUnicodeObject
*)str1
,
6340 (PyUnicodeObject
*)str2
,
6348 PyDoc_STRVAR(replace__doc__
,
6349 "S.replace (old, new[, maxsplit]) -> unicode\n\
6351 Return a copy of S with all occurrences of substring\n\
6352 old replaced by new. If the optional argument maxsplit is\n\
6353 given, only the first maxsplit occurrences are replaced.");
6356 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
6358 PyUnicodeObject
*str1
;
6359 PyUnicodeObject
*str2
;
6360 Py_ssize_t maxcount
= -1;
6363 if (!PyArg_ParseTuple(args
, "OO|n:replace", &str1
, &str2
, &maxcount
))
6365 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
6368 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
6374 result
= replace(self
, str1
, str2
, maxcount
);
6382 PyObject
*unicode_repr(PyObject
*unicode
)
6384 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
6385 PyUnicode_GET_SIZE(unicode
),
6389 PyDoc_STRVAR(rfind__doc__
,
6390 "S.rfind(sub [,start [,end]]) -> int\n\
6392 Return the highest index in S where substring sub is found,\n\
6393 such that sub is contained within s[start,end]. Optional\n\
6394 arguments start and end are interpreted as in slice notation.\n\
6396 Return -1 on failure.");
6399 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
6401 PyObject
*substring
;
6402 Py_ssize_t start
= 0;
6403 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6406 if (!PyArg_ParseTuple(args
, "O|O&O&:rfind", &substring
,
6407 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6409 substring
= PyUnicode_FromObject(substring
);
6413 result
= stringlib_rfind_slice(
6414 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6415 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6419 Py_DECREF(substring
);
6421 return PyInt_FromSsize_t(result
);
6424 PyDoc_STRVAR(rindex__doc__
,
6425 "S.rindex(sub [,start [,end]]) -> int\n\
6427 Like S.rfind() but raise ValueError when the substring is not found.");
6430 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
6432 PyObject
*substring
;
6433 Py_ssize_t start
= 0;
6434 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6437 if (!PyArg_ParseTuple(args
, "O|O&O&:rindex", &substring
,
6438 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6440 substring
= PyUnicode_FromObject(substring
);
6444 result
= stringlib_rfind_slice(
6445 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6446 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6450 Py_DECREF(substring
);
6453 PyErr_SetString(PyExc_ValueError
, "substring not found");
6456 return PyInt_FromSsize_t(result
);
6459 PyDoc_STRVAR(rjust__doc__
,
6460 "S.rjust(width[, fillchar]) -> unicode\n\
6462 Return S right justified in a Unicode string of length width. Padding is\n\
6463 done using the specified fill character (default is a space).");
6466 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
6469 Py_UNICODE fillchar
= ' ';
6471 if (!PyArg_ParseTuple(args
, "n|O&:rjust", &width
, convert_uc
, &fillchar
))
6474 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6476 return (PyObject
*) self
;
6479 return (PyObject
*) pad(self
, width
- self
->length
, 0, fillchar
);
6483 unicode_slice(PyUnicodeObject
*self
, Py_ssize_t start
, Py_ssize_t end
)
6485 /* standard clamping */
6490 if (end
> self
->length
)
6492 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
6493 /* full slice, return original string */
6495 return (PyObject
*) self
;
6500 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
6504 PyObject
*PyUnicode_Split(PyObject
*s
,
6506 Py_ssize_t maxsplit
)
6510 s
= PyUnicode_FromObject(s
);
6514 sep
= PyUnicode_FromObject(sep
);
6521 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
6528 PyDoc_STRVAR(split__doc__
,
6529 "S.split([sep [,maxsplit]]) -> list of strings\n\
6531 Return a list of the words in S, using sep as the\n\
6532 delimiter string. If maxsplit is given, at most maxsplit\n\
6533 splits are done. If sep is not specified or is None,\n\
6534 any whitespace string is a separator.");
6537 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
6539 PyObject
*substring
= Py_None
;
6540 Py_ssize_t maxcount
= -1;
6542 if (!PyArg_ParseTuple(args
, "|On:split", &substring
, &maxcount
))
6545 if (substring
== Py_None
)
6546 return split(self
, NULL
, maxcount
);
6547 else if (PyUnicode_Check(substring
))
6548 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
6550 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
6554 PyUnicode_Partition(PyObject
*str_in
, PyObject
*sep_in
)
6560 str_obj
= PyUnicode_FromObject(str_in
);
6563 sep_obj
= PyUnicode_FromObject(sep_in
);
6569 out
= stringlib_partition(
6570 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
6571 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
6582 PyUnicode_RPartition(PyObject
*str_in
, PyObject
*sep_in
)
6588 str_obj
= PyUnicode_FromObject(str_in
);
6591 sep_obj
= PyUnicode_FromObject(sep_in
);
6597 out
= stringlib_rpartition(
6598 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
6599 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
6608 PyDoc_STRVAR(partition__doc__
,
6609 "S.partition(sep) -> (head, sep, tail)\n\
6611 Searches for the separator sep in S, and returns the part before it,\n\
6612 the separator itself, and the part after it. If the separator is not\n\
6613 found, returns S and two empty strings.");
6616 unicode_partition(PyUnicodeObject
*self
, PyObject
*separator
)
6618 return PyUnicode_Partition((PyObject
*)self
, separator
);
6621 PyDoc_STRVAR(rpartition__doc__
,
6622 "S.rpartition(sep) -> (head, sep, tail)\n\
6624 Searches for the separator sep in S, starting at the end of S, and returns\n\
6625 the part before it, the separator itself, and the part after it. If the\n\
6626 separator is not found, returns S and two empty strings.");
6629 unicode_rpartition(PyUnicodeObject
*self
, PyObject
*separator
)
6631 return PyUnicode_RPartition((PyObject
*)self
, separator
);
6634 PyObject
*PyUnicode_RSplit(PyObject
*s
,
6636 Py_ssize_t maxsplit
)
6640 s
= PyUnicode_FromObject(s
);
6644 sep
= PyUnicode_FromObject(sep
);
6651 result
= rsplit((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
6658 PyDoc_STRVAR(rsplit__doc__
,
6659 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6661 Return a list of the words in S, using sep as the\n\
6662 delimiter string, starting at the end of the string and\n\
6663 working to the front. If maxsplit is given, at most maxsplit\n\
6664 splits are done. If sep is not specified, any whitespace string\n\
6668 unicode_rsplit(PyUnicodeObject
*self
, PyObject
*args
)
6670 PyObject
*substring
= Py_None
;
6671 Py_ssize_t maxcount
= -1;
6673 if (!PyArg_ParseTuple(args
, "|On:rsplit", &substring
, &maxcount
))
6676 if (substring
== Py_None
)
6677 return rsplit(self
, NULL
, maxcount
);
6678 else if (PyUnicode_Check(substring
))
6679 return rsplit(self
, (PyUnicodeObject
*)substring
, maxcount
);
6681 return PyUnicode_RSplit((PyObject
*)self
, substring
, maxcount
);
6684 PyDoc_STRVAR(splitlines__doc__
,
6685 "S.splitlines([keepends]]) -> list of strings\n\
6687 Return a list of the lines in S, breaking at line boundaries.\n\
6688 Line breaks are not included in the resulting list unless keepends\n\
6689 is given and true.");
6692 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
6696 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
6699 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
6703 PyObject
*unicode_str(PyUnicodeObject
*self
)
6705 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
6708 PyDoc_STRVAR(swapcase__doc__
,
6709 "S.swapcase() -> unicode\n\
6711 Return a copy of S with uppercase characters converted to lowercase\n\
6715 unicode_swapcase(PyUnicodeObject
*self
)
6717 return fixup(self
, fixswapcase
);
6720 PyDoc_STRVAR(translate__doc__
,
6721 "S.translate(table) -> unicode\n\
6723 Return a copy of the string S, where all characters have been mapped\n\
6724 through the given translation table, which must be a mapping of\n\
6725 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6726 Unmapped characters are left untouched. Characters mapped to None\n\
6730 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
6732 return PyUnicode_TranslateCharmap(self
->str
,
6738 PyDoc_STRVAR(upper__doc__
,
6739 "S.upper() -> unicode\n\
6741 Return a copy of S converted to uppercase.");
6744 unicode_upper(PyUnicodeObject
*self
)
6746 return fixup(self
, fixupper
);
6749 PyDoc_STRVAR(zfill__doc__
,
6750 "S.zfill(width) -> unicode\n\
6752 Pad a numeric string x with zeros on the left, to fill a field\n\
6753 of the specified width. The string x is never truncated.");
6756 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
6762 if (!PyArg_ParseTuple(args
, "n:zfill", &width
))
6765 if (self
->length
>= width
) {
6766 if (PyUnicode_CheckExact(self
)) {
6768 return (PyObject
*) self
;
6771 return PyUnicode_FromUnicode(
6772 PyUnicode_AS_UNICODE(self
),
6773 PyUnicode_GET_SIZE(self
)
6777 fill
= width
- self
->length
;
6779 u
= pad(self
, fill
, 0, '0');
6784 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
6785 /* move sign to beginning of string */
6786 u
->str
[0] = u
->str
[fill
];
6790 return (PyObject
*) u
;
6795 unicode_freelistsize(PyUnicodeObject
*self
)
6797 return PyInt_FromLong(unicode_freelist_size
);
6801 PyDoc_STRVAR(startswith__doc__
,
6802 "S.startswith(prefix[, start[, end]]) -> bool\n\
6804 Return True if S starts with the specified prefix, False otherwise.\n\
6805 With optional start, test S beginning at that position.\n\
6806 With optional end, stop comparing S at that position.\n\
6807 prefix can also be a tuple of strings to try.");
6810 unicode_startswith(PyUnicodeObject
*self
,
6814 PyUnicodeObject
*substring
;
6815 Py_ssize_t start
= 0;
6816 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6819 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &subobj
,
6820 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6822 if (PyTuple_Check(subobj
)) {
6824 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
6825 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
6826 PyTuple_GET_ITEM(subobj
, i
));
6827 if (substring
== NULL
)
6829 result
= tailmatch(self
, substring
, start
, end
, -1);
6830 Py_DECREF(substring
);
6835 /* nothing matched */
6838 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
6839 if (substring
== NULL
)
6841 result
= tailmatch(self
, substring
, start
, end
, -1);
6842 Py_DECREF(substring
);
6843 return PyBool_FromLong(result
);
6847 PyDoc_STRVAR(endswith__doc__
,
6848 "S.endswith(suffix[, start[, end]]) -> bool\n\
6850 Return True if S ends with the specified suffix, False otherwise.\n\
6851 With optional start, test S beginning at that position.\n\
6852 With optional end, stop comparing S at that position.\n\
6853 suffix can also be a tuple of strings to try.");
6856 unicode_endswith(PyUnicodeObject
*self
,
6860 PyUnicodeObject
*substring
;
6861 Py_ssize_t start
= 0;
6862 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6865 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &subobj
,
6866 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6868 if (PyTuple_Check(subobj
)) {
6870 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
6871 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
6872 PyTuple_GET_ITEM(subobj
, i
));
6873 if (substring
== NULL
)
6875 result
= tailmatch(self
, substring
, start
, end
, +1);
6876 Py_DECREF(substring
);
6883 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
6884 if (substring
== NULL
)
6887 result
= tailmatch(self
, substring
, start
, end
, +1);
6888 Py_DECREF(substring
);
6889 return PyBool_FromLong(result
);
6895 unicode_getnewargs(PyUnicodeObject
*v
)
6897 return Py_BuildValue("(u#)", v
->str
, v
->length
);
6901 static PyMethodDef unicode_methods
[] = {
6903 /* Order is according to common usage: often used methods should
6904 appear first, since lookup is done sequentially. */
6906 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
, encode__doc__
},
6907 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
6908 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
6909 {"rsplit", (PyCFunction
) unicode_rsplit
, METH_VARARGS
, rsplit__doc__
},
6910 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
6911 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
6912 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
6913 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
6914 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
6915 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
6916 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
6917 {"partition", (PyCFunction
) unicode_partition
, METH_O
, partition__doc__
},
6918 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
6919 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
6920 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
6921 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_VARARGS
, lstrip__doc__
},
6922 {"decode", (PyCFunction
) unicode_decode
, METH_VARARGS
, decode__doc__
},
6923 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6924 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
6925 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
6926 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
6927 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_VARARGS
, rstrip__doc__
},
6928 {"rpartition", (PyCFunction
) unicode_rpartition
, METH_O
, rpartition__doc__
},
6929 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
6930 {"strip", (PyCFunction
) unicode_strip
, METH_VARARGS
, strip__doc__
},
6931 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
6932 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
6933 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
6934 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
6935 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
6936 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
6937 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
6938 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
6939 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
6940 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
6941 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
6942 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
6943 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
6944 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
6945 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
6947 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
6951 /* This one is just used for debugging the implementation. */
6952 {"freelistsize", (PyCFunction
) unicode_freelistsize
, METH_NOARGS
},
6955 {"__getnewargs__", (PyCFunction
)unicode_getnewargs
, METH_NOARGS
},
6960 unicode_mod(PyObject
*v
, PyObject
*w
)
6962 if (!PyUnicode_Check(v
)) {
6963 Py_INCREF(Py_NotImplemented
);
6964 return Py_NotImplemented
;
6966 return PyUnicode_Format(v
, w
);
6969 static PyNumberMethods unicode_as_number
= {
6974 unicode_mod
, /*nb_remainder*/
6977 static PySequenceMethods unicode_as_sequence
= {
6978 (lenfunc
) unicode_length
, /* sq_length */
6979 PyUnicode_Concat
, /* sq_concat */
6980 (ssizeargfunc
) unicode_repeat
, /* sq_repeat */
6981 (ssizeargfunc
) unicode_getitem
, /* sq_item */
6982 (ssizessizeargfunc
) unicode_slice
, /* sq_slice */
6983 0, /* sq_ass_item */
6984 0, /* sq_ass_slice */
6985 PyUnicode_Contains
, /* sq_contains */
6988 #define HASINDEX(o) PyType_HasFeature((o)->ob_type, Py_TPFLAGS_HAVE_INDEX)
6991 unicode_subscript(PyUnicodeObject
* self
, PyObject
* item
)
6993 PyNumberMethods
*nb
= item
->ob_type
->tp_as_number
;
6994 if (nb
!= NULL
&& HASINDEX(item
) && nb
->nb_index
!= NULL
) {
6995 Py_ssize_t i
= nb
->nb_index(item
);
6996 if (i
== -1 && PyErr_Occurred())
6999 i
+= PyUnicode_GET_SIZE(self
);
7000 return unicode_getitem(self
, i
);
7001 } else if (PySlice_Check(item
)) {
7002 Py_ssize_t start
, stop
, step
, slicelength
, cur
, i
;
7003 Py_UNICODE
* source_buf
;
7004 Py_UNICODE
* result_buf
;
7007 if (PySlice_GetIndicesEx((PySliceObject
*)item
, PyUnicode_GET_SIZE(self
),
7008 &start
, &stop
, &step
, &slicelength
) < 0) {
7012 if (slicelength
<= 0) {
7013 return PyUnicode_FromUnicode(NULL
, 0);
7015 source_buf
= PyUnicode_AS_UNICODE((PyObject
*)self
);
7016 result_buf
= (Py_UNICODE
*)PyMem_MALLOC(slicelength
*
7017 sizeof(Py_UNICODE
));
7019 if (result_buf
== NULL
)
7020 return PyErr_NoMemory();
7022 for (cur
= start
, i
= 0; i
< slicelength
; cur
+= step
, i
++) {
7023 result_buf
[i
] = source_buf
[cur
];
7026 result
= PyUnicode_FromUnicode(result_buf
, slicelength
);
7027 PyMem_FREE(result_buf
);
7031 PyErr_SetString(PyExc_TypeError
, "string indices must be integers");
7036 static PyMappingMethods unicode_as_mapping
= {
7037 (lenfunc
)unicode_length
, /* mp_length */
7038 (binaryfunc
)unicode_subscript
, /* mp_subscript */
7039 (objobjargproc
)0, /* mp_ass_subscript */
7043 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
7048 PyErr_SetString(PyExc_SystemError
,
7049 "accessing non-existent unicode segment");
7052 *ptr
= (void *) self
->str
;
7053 return PyUnicode_GET_DATA_SIZE(self
);
7057 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, Py_ssize_t index
,
7060 PyErr_SetString(PyExc_TypeError
,
7061 "cannot use unicode as modifiable buffer");
7066 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
7070 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
7075 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
7082 PyErr_SetString(PyExc_SystemError
,
7083 "accessing non-existent unicode segment");
7086 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
7089 *ptr
= (void *) PyString_AS_STRING(str
);
7090 return PyString_GET_SIZE(str
);
7093 /* Helpers for PyUnicode_Format() */
7096 getnextarg(PyObject
*args
, Py_ssize_t arglen
, Py_ssize_t
*p_argidx
)
7098 Py_ssize_t argidx
= *p_argidx
;
7099 if (argidx
< arglen
) {
7104 return PyTuple_GetItem(args
, argidx
);
7106 PyErr_SetString(PyExc_TypeError
,
7107 "not enough arguments for format string");
7111 #define F_LJUST (1<<0)
7112 #define F_SIGN (1<<1)
7113 #define F_BLANK (1<<2)
7114 #define F_ALT (1<<3)
7115 #define F_ZERO (1<<4)
7118 strtounicode(Py_UNICODE
*buffer
, const char *charbuffer
)
7120 register Py_ssize_t i
;
7121 Py_ssize_t len
= strlen(charbuffer
);
7122 for (i
= len
- 1; i
>= 0; i
--)
7123 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
7129 doubletounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, double x
)
7133 PyOS_ascii_formatd((char *)buffer
, len
, format
, x
);
7134 result
= strtounicode(buffer
, (char *)buffer
);
7135 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
7139 longtounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, long x
)
7143 PyOS_snprintf((char *)buffer
, len
, format
, x
);
7144 result
= strtounicode(buffer
, (char *)buffer
);
7145 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
7148 /* XXX To save some code duplication, formatfloat/long/int could have been
7149 shared with stringobject.c, converting from 8-bit to Unicode after the
7150 formatting is done. */
7153 formatfloat(Py_UNICODE
*buf
,
7160 /* fmt = '%#.' + `prec` + `type`
7161 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
7165 x
= PyFloat_AsDouble(v
);
7166 if (x
== -1.0 && PyErr_Occurred())
7170 if (type
== 'f' && (fabs(x
) / 1e25
) >= 1e25
)
7172 /* Worst case length calc to ensure no buffer overrun:
7176 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7177 for any double rep.)
7178 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7181 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7182 len = 1 + 50 + 1 + prec = 52 + prec
7184 If prec=0 the effective precision is 1 (the leading digit is
7185 always given), therefore increase the length by one.
7188 if ((type
== 'g' && buflen
<= (size_t)10 + (size_t)prec
) ||
7189 (type
== 'f' && buflen
<= (size_t)53 + (size_t)prec
)) {
7190 PyErr_SetString(PyExc_OverflowError
,
7191 "formatted float is too long (precision too large?)");
7194 PyOS_snprintf(fmt
, sizeof(fmt
), "%%%s.%d%c",
7195 (flags
&F_ALT
) ? "#" : "",
7197 return doubletounicode(buf
, buflen
, fmt
, x
);
7201 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
7205 PyObject
*str
; /* temporary string object. */
7206 PyUnicodeObject
*result
;
7208 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
7211 result
= _PyUnicode_New(len
);
7216 for (i
= 0; i
< len
; i
++)
7217 result
->str
[i
] = buf
[i
];
7218 result
->str
[len
] = 0;
7220 return (PyObject
*)result
;
7224 formatint(Py_UNICODE
*buf
,
7231 /* fmt = '%#.' + `prec` + 'l' + `type`
7232 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7236 char fmt
[64]; /* plenty big enough! */
7240 x
= PyInt_AsLong(v
);
7241 if (x
== -1 && PyErr_Occurred())
7243 if (x
< 0 && type
== 'u') {
7246 if (x
< 0 && (type
== 'x' || type
== 'X' || type
== 'o'))
7253 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7254 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
7256 if (buflen
<= 14 || buflen
<= (size_t)3 + (size_t)prec
) {
7257 PyErr_SetString(PyExc_OverflowError
,
7258 "formatted integer is too long (precision too large?)");
7262 if ((flags
& F_ALT
) &&
7263 (type
== 'x' || type
== 'X')) {
7264 /* When converting under %#x or %#X, there are a number
7265 * of issues that cause pain:
7266 * - when 0 is being converted, the C standard leaves off
7267 * the '0x' or '0X', which is inconsistent with other
7268 * %#x/%#X conversions and inconsistent with Python's
7270 * - there are platforms that violate the standard and
7271 * convert 0 with the '0x' or '0X'
7272 * (Metrowerks, Compaq Tru64)
7273 * - there are platforms that give '0x' when converting
7274 * under %#X, but convert 0 in accordance with the
7275 * standard (OS/2 EMX)
7277 * We can achieve the desired consistency by inserting our
7278 * own '0x' or '0X' prefix, and substituting %x/%X in place
7281 * Note that this is the same approach as used in
7282 * formatint() in stringobject.c
7284 PyOS_snprintf(fmt
, sizeof(fmt
), "%s0%c%%.%dl%c",
7285 sign
, type
, prec
, type
);
7288 PyOS_snprintf(fmt
, sizeof(fmt
), "%s%%%s.%dl%c",
7289 sign
, (flags
&F_ALT
) ? "#" : "",
7293 return longtounicode(buf
, buflen
, fmt
, -x
);
7295 return longtounicode(buf
, buflen
, fmt
, x
);
7299 formatchar(Py_UNICODE
*buf
,
7303 /* presume that the buffer is at least 2 characters long */
7304 if (PyUnicode_Check(v
)) {
7305 if (PyUnicode_GET_SIZE(v
) != 1)
7307 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
7310 else if (PyString_Check(v
)) {
7311 if (PyString_GET_SIZE(v
) != 1)
7313 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
7317 /* Integer input truncated to a character */
7319 x
= PyInt_AsLong(v
);
7320 if (x
== -1 && PyErr_Occurred())
7322 #ifdef Py_UNICODE_WIDE
7323 if (x
< 0 || x
> 0x10ffff) {
7324 PyErr_SetString(PyExc_OverflowError
,
7325 "%c arg not in range(0x110000) "
7326 "(wide Python build)");
7330 if (x
< 0 || x
> 0xffff) {
7331 PyErr_SetString(PyExc_OverflowError
,
7332 "%c arg not in range(0x10000) "
7333 "(narrow Python build)");
7337 buf
[0] = (Py_UNICODE
) x
;
7343 PyErr_SetString(PyExc_TypeError
,
7344 "%c requires int or char");
7348 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7350 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7351 chars are formatted. XXX This is a magic number. Each formatting
7352 routine does bounds checking to ensure no overflow, but a better
7353 solution may be to malloc a buffer of appropriate size for each
7354 format. For now, the current solution is sufficient.
7356 #define FORMATBUFLEN (size_t)120
7358 PyObject
*PyUnicode_Format(PyObject
*format
,
7361 Py_UNICODE
*fmt
, *res
;
7362 Py_ssize_t fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
7364 PyUnicodeObject
*result
= NULL
;
7365 PyObject
*dict
= NULL
;
7368 if (format
== NULL
|| args
== NULL
) {
7369 PyErr_BadInternalCall();
7372 uformat
= PyUnicode_FromObject(format
);
7373 if (uformat
== NULL
)
7375 fmt
= PyUnicode_AS_UNICODE(uformat
);
7376 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
7378 reslen
= rescnt
= fmtcnt
+ 100;
7379 result
= _PyUnicode_New(reslen
);
7382 res
= PyUnicode_AS_UNICODE(result
);
7384 if (PyTuple_Check(args
)) {
7385 arglen
= PyTuple_Size(args
);
7392 if (args
->ob_type
->tp_as_mapping
&& !PyTuple_Check(args
) &&
7393 !PyObject_TypeCheck(args
, &PyBaseString_Type
))
7396 while (--fmtcnt
>= 0) {
7399 rescnt
= fmtcnt
+ 100;
7401 if (_PyUnicode_Resize(&result
, reslen
) < 0)
7403 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
7409 /* Got a format specifier */
7411 Py_ssize_t width
= -1;
7413 Py_UNICODE c
= '\0';
7416 PyObject
*temp
= NULL
;
7420 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
7424 Py_UNICODE
*keystart
;
7430 PyErr_SetString(PyExc_TypeError
,
7431 "format requires a mapping");
7437 /* Skip over balanced parentheses */
7438 while (pcount
> 0 && --fmtcnt
>= 0) {
7441 else if (*fmt
== '(')
7445 keylen
= fmt
- keystart
- 1;
7446 if (fmtcnt
< 0 || pcount
> 0) {
7447 PyErr_SetString(PyExc_ValueError
,
7448 "incomplete format key");
7452 /* keys are converted to strings using UTF-8 and
7453 then looked up since Python uses strings to hold
7454 variables names etc. in its namespaces and we
7455 wouldn't want to break common idioms. */
7456 key
= PyUnicode_EncodeUTF8(keystart
,
7460 key
= PyUnicode_FromUnicode(keystart
, keylen
);
7468 args
= PyObject_GetItem(dict
, key
);
7477 while (--fmtcnt
>= 0) {
7478 switch (c
= *fmt
++) {
7479 case '-': flags
|= F_LJUST
; continue;
7480 case '+': flags
|= F_SIGN
; continue;
7481 case ' ': flags
|= F_BLANK
; continue;
7482 case '#': flags
|= F_ALT
; continue;
7483 case '0': flags
|= F_ZERO
; continue;
7488 v
= getnextarg(args
, arglen
, &argidx
);
7491 if (!PyInt_Check(v
)) {
7492 PyErr_SetString(PyExc_TypeError
,
7496 width
= PyInt_AsLong(v
);
7504 else if (c
>= '0' && c
<= '9') {
7506 while (--fmtcnt
>= 0) {
7508 if (c
< '0' || c
> '9')
7510 if ((width
*10) / 10 != width
) {
7511 PyErr_SetString(PyExc_ValueError
,
7515 width
= width
*10 + (c
- '0');
7523 v
= getnextarg(args
, arglen
, &argidx
);
7526 if (!PyInt_Check(v
)) {
7527 PyErr_SetString(PyExc_TypeError
,
7531 prec
= PyInt_AsLong(v
);
7537 else if (c
>= '0' && c
<= '9') {
7539 while (--fmtcnt
>= 0) {
7540 c
= Py_CHARMASK(*fmt
++);
7541 if (c
< '0' || c
> '9')
7543 if ((prec
*10) / 10 != prec
) {
7544 PyErr_SetString(PyExc_ValueError
,
7548 prec
= prec
*10 + (c
- '0');
7553 if (c
== 'h' || c
== 'l' || c
== 'L') {
7559 PyErr_SetString(PyExc_ValueError
,
7560 "incomplete format");
7564 v
= getnextarg(args
, arglen
, &argidx
);
7574 /* presume that buffer length is at least 1 */
7581 if (PyUnicode_Check(v
) && c
== 's') {
7588 temp
= PyObject_Unicode(v
);
7590 temp
= PyObject_Repr(v
);
7593 if (PyUnicode_Check(temp
))
7594 /* nothing to do */;
7595 else if (PyString_Check(temp
)) {
7596 /* convert to string to Unicode */
7597 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
7598 PyString_GET_SIZE(temp
),
7608 PyErr_SetString(PyExc_TypeError
,
7609 "%s argument has non-string str()");
7613 pbuf
= PyUnicode_AS_UNICODE(temp
);
7614 len
= PyUnicode_GET_SIZE(temp
);
7615 if (prec
>= 0 && len
> prec
)
7627 if (PyLong_Check(v
)) {
7628 temp
= formatlong(v
, flags
, prec
, c
);
7631 pbuf
= PyUnicode_AS_UNICODE(temp
);
7632 len
= PyUnicode_GET_SIZE(temp
);
7637 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
7656 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
7667 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
7673 PyErr_Format(PyExc_ValueError
,
7674 "unsupported format character '%c' (0x%x) "
7676 (31<=c
&& c
<=126) ? (char)c
: '?',
7678 (int)(fmt
-1 - PyUnicode_AS_UNICODE(uformat
)));
7682 if (*pbuf
== '-' || *pbuf
== '+') {
7686 else if (flags
& F_SIGN
)
7688 else if (flags
& F_BLANK
)
7695 if (rescnt
- (sign
!= 0) < width
) {
7697 rescnt
= width
+ fmtcnt
+ 100;
7704 if (_PyUnicode_Resize(&result
, reslen
) < 0) {
7708 res
= PyUnicode_AS_UNICODE(result
)
7718 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
7719 assert(pbuf
[0] == '0');
7720 assert(pbuf
[1] == c
);
7731 if (width
> len
&& !(flags
& F_LJUST
)) {
7735 } while (--width
> len
);
7740 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
7741 assert(pbuf
[0] == '0');
7742 assert(pbuf
[1] == c
);
7747 Py_UNICODE_COPY(res
, pbuf
, len
);
7750 while (--width
>= len
) {
7754 if (dict
&& (argidx
< arglen
) && c
!= '%') {
7755 PyErr_SetString(PyExc_TypeError
,
7756 "not all arguments converted during string formatting");
7763 if (argidx
< arglen
&& !dict
) {
7764 PyErr_SetString(PyExc_TypeError
,
7765 "not all arguments converted during string formatting");
7769 if (_PyUnicode_Resize(&result
, reslen
- rescnt
) < 0)
7775 return (PyObject
*)result
;
7786 static PyBufferProcs unicode_as_buffer
= {
7787 (readbufferproc
) unicode_buffer_getreadbuf
,
7788 (writebufferproc
) unicode_buffer_getwritebuf
,
7789 (segcountproc
) unicode_buffer_getsegcount
,
7790 (charbufferproc
) unicode_buffer_getcharbuf
,
7794 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
7797 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
7800 static char *kwlist
[] = {"string", "encoding", "errors", 0};
7801 char *encoding
= NULL
;
7802 char *errors
= NULL
;
7804 if (type
!= &PyUnicode_Type
)
7805 return unicode_subtype_new(type
, args
, kwds
);
7806 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
7807 kwlist
, &x
, &encoding
, &errors
))
7810 return (PyObject
*)_PyUnicode_New(0);
7811 if (encoding
== NULL
&& errors
== NULL
)
7812 return PyObject_Unicode(x
);
7814 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
7818 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
7820 PyUnicodeObject
*tmp
, *pnew
;
7823 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
7824 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
7827 assert(PyUnicode_Check(tmp
));
7828 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
7833 pnew
->str
= PyMem_NEW(Py_UNICODE
, n
+1);
7834 if (pnew
->str
== NULL
) {
7835 _Py_ForgetReference((PyObject
*)pnew
);
7838 return PyErr_NoMemory();
7840 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
7842 pnew
->hash
= tmp
->hash
;
7844 return (PyObject
*)pnew
;
7847 PyDoc_STRVAR(unicode_doc
,
7848 "unicode(string [, encoding[, errors]]) -> object\n\
7850 Create a new Unicode object from the given encoded string.\n\
7851 encoding defaults to the current default string encoding.\n\
7852 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
/* Type object for "unicode".

   Instances are created through unicode_new (tp_new), destroyed through
   unicode_dealloc (tp_dealloc) and released with PyObject_Del (tp_free).
   The base type is PyBaseString_Type, and Py_TPFLAGS_BASETYPE is set so
   the type can itself be subclassed (see unicode_subtype_new).

   NOTE(review): several slot lines are absent from this excerpt, so the
   slot order shown here cannot be checked against the PyTypeObject
   layout -- confirm against the complete file. */
7854 PyTypeObject PyUnicode_Type
= {
7855 PyObject_HEAD_INIT(&PyType_Type
)
7857 "unicode", /* tp_name */
7858 sizeof(PyUnicodeObject
), /* tp_basicsize */
7859 0, /* tp_itemsize */
7861 (destructor
)unicode_dealloc
, /* tp_dealloc */
7865 (cmpfunc
) unicode_compare
, /* tp_compare */
7866 unicode_repr
, /* tp_repr */
7867 &unicode_as_number
, /* tp_as_number */
7868 &unicode_as_sequence
, /* tp_as_sequence */
7869 &unicode_as_mapping
, /* tp_as_mapping */
7870 (hashfunc
) unicode_hash
, /* tp_hash */
7872 (reprfunc
) unicode_str
, /* tp_str */
7873 PyObject_GenericGetAttr
, /* tp_getattro */
7874 0, /* tp_setattro */
7875 &unicode_as_buffer
, /* tp_as_buffer */
7876 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_CHECKTYPES
|
7877 Py_TPFLAGS_BASETYPE
, /* tp_flags */
7878 unicode_doc
, /* tp_doc */
7879 0, /* tp_traverse */
7881 0, /* tp_richcompare */
7882 0, /* tp_weaklistoffset */
7884 0, /* tp_iternext */
7885 unicode_methods
, /* tp_methods */
7888 &PyBaseString_Type
, /* tp_base */
7890 0, /* tp_descr_get */
7891 0, /* tp_descr_set */
7892 0, /* tp_dictoffset */
7895 unicode_new
, /* tp_new */
7896 PyObject_Del
, /* tp_free */
7899 /* Initialize the Unicode implementation */
/* Resets the module-level state (free list, empty-string object,
   default encoding, unicode_latin1 table), readies the unicode type
   object, and builds the line-break bloom mask.

   Aborts via Py_FatalError() if PyType_Ready(&PyUnicode_Type) fails.

   NOTE(review): the declaration of the loop index i is not visible in
   this excerpt. */
7901 void _PyUnicode_Init(void)
7905 /* XXX - move this array to unicodectype.c ? */
/* Code points passed to make_bloom_mask() below. */
7906 Py_UNICODE linebreak
[] = {
7907 0x000A, /* LINE FEED */
7908 0x000D, /* CARRIAGE RETURN */
7909 0x001C, /* FILE SEPARATOR */
7910 0x001D, /* GROUP SEPARATOR */
7911 0x001E, /* RECORD SEPARATOR */
7912 0x0085, /* NEXT LINE */
7913 0x2028, /* LINE SEPARATOR */
7914 0x2029, /* PARAGRAPH SEPARATOR */
7917 /* Init the implementation */
7918 unicode_freelist
= NULL
;
7919 unicode_freelist_size
= 0;
/* Create the zero-length unicode object kept in unicode_empty. */
7920 unicode_empty
= _PyUnicode_New(0);
/* The default encoding starts out as "ascii". */
7921 strcpy(unicode_default_encoding
, "ascii");
/* Clear all 256 slots of the unicode_latin1 table. */
7922 for (i
= 0; i
< 256; i
++)
7923 unicode_latin1
[i
] = NULL
;
7924 if (PyType_Ready(&PyUnicode_Type
) < 0)
7925 Py_FatalError("Can't initialize 'unicode'");
7927 /* initialize the linebreak bloom filter */
/* The element count is derived from the array itself with sizeof. */
7928 bloom_linebreak
= make_bloom_mask(
7929 linebreak
, sizeof(linebreak
) / sizeof(linebreak
[0])
/* NOTE(review): unlike the PyType_Ready() call above, the return value
   of this call is not checked in the visible code. */
7932 PyType_Ready(&EncodingMapType
);
7935 /* Finalize the Unicode implementation */
/* Drops the references held in unicode_empty and in the unicode_latin1
   table, walks the free list, and leaves the free-list head and size
   reset to NULL / 0.

   NOTE(review): the declarations of i and u, and the statements that
   release each free-list node's own storage, are not visible in this
   excerpt -- confirm against the complete file. */
7938 _PyUnicode_Fini(void)
/* Py_XDECREF tolerates unicode_empty being NULL. */
7943 Py_XDECREF(unicode_empty
);
7944 unicode_empty
= NULL
;
/* Release every populated slot of the 256-entry table and clear it. */
7946 for (i
= 0; i
< 256; i
++) {
7947 if (unicode_latin1
[i
]) {
7948 Py_DECREF(unicode_latin1
[i
]);
7949 unicode_latin1
[i
] = NULL
;
/* Walk the free list. The loop header has no increment expression;
   u is advanced inside the body. */
7953 for (u
= unicode_freelist
; u
!= NULL
;) {
7954 PyUnicodeObject
*v
= u
;
/* The link to the next node is read from the first pointer-sized word
   of the current node. It is fetched before v is processed. */
7955 u
= *(PyUnicodeObject
**)u
;
/* Drop the node's defenc reference, if it has one. */
7958 Py_XDECREF(v
->defenc
);
7961 unicode_freelist
= NULL
;
7962 unicode_freelist_size
= 0;
7973 indent-tabs-mode: nil