3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
45 #include "unicodeobject.h"
52 /* Limit for the Unicode object free list */
54 #define MAX_UNICODE_FREELIST_SIZE 1024
56 /* Limit for the Unicode object free list stay alive optimization.
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
62 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
66 Setting the limit to 0 effectively turns the feature off.
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
80 # define BYTEORDER_IS_LITTLE_ENDIAN
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
95 /* Free list for Unicode objects */
96 static PyUnicodeObject
*unicode_freelist
;
97 static int unicode_freelist_size
;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject
*unicode_empty
;
102 /* Single character Unicode strings in the Latin-1 range are being
104 static PyUnicodeObject
*unicode_latin1
[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding
[100];
116 PyUnicode_GetMax(void)
118 #ifdef Py_UNICODE_WIDE
121 /* This is actually an illegal character, so it should
122 not be passed to unichr. */
127 /* --- Bloom Filters ----------------------------------------------------- */
129 /* stuff to implement simple "bloom filters" for Unicode characters.
130 to keep things simple, we use a single bitmask, using the least 5
131 bits from each unicode characters as the bit index. */
133 /* the linebreak mask is set up by Unicode_Init below */
/* Integer type that holds the 32 bloom bits. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low five bits of ch is set in mask.
   The shifted constant is unsigned long: shifting a plain int 1 by 31
   overflows a signed int, which is undefined behavior. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Cheap bloom pre-check first, exact line break test second. */
#define BLOOM_LINEBREAK(ch)\
    (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
144 Py_LOCAL_INLINE(BLOOM_MASK
) make_bloom_mask(Py_UNICODE
* ptr
, Py_ssize_t len
)
146 /* calculate simple bloom-style bitmask for a given unicode string */
152 for (i
= 0; i
< len
; i
++)
153 mask
|= (1 << (ptr
[i
] & 0x1F));
158 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr
, Py_UNICODE
* set
, Py_ssize_t setlen
)
162 for (i
= 0; i
< setlen
; i
++)
/* Bloom pre-check followed by an exact membership test.  The expansion is
   parenthesized so it remains a single operand when a caller combines it
   with !, || or ?:. */
#define BLOOM_MEMBER(mask, chr, set, setlen)\
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
172 /* --- Unicode Object ----------------------------------------------------- */
175 int unicode_resize(register PyUnicodeObject
*unicode
,
180 /* Shortcut if there's nothing much to do. */
181 if (unicode
->length
== length
)
184 /* Resizing shared object (unicode_empty or single character
185 objects) in-place is not allowed. Use PyUnicode_Resize()
188 if (unicode
== unicode_empty
||
189 (unicode
->length
== 1 &&
190 unicode
->str
[0] < 256U &&
191 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
192 PyErr_SetString(PyExc_SystemError
,
193 "can't resize shared unicode objects");
197 /* We allocate one more byte to make sure the string is Ux0000 terminated.
198 The overallocation is also used by fastsearch, which assumes that it's
199 safe to look at str[length] (without making any assumptions about what
202 oldstr
= unicode
->str
;
203 PyMem_RESIZE(unicode
->str
, Py_UNICODE
, length
+ 1);
205 unicode
->str
= (Py_UNICODE
*)oldstr
;
209 unicode
->str
[length
] = 0;
210 unicode
->length
= length
;
213 /* Reset the object caches */
214 if (unicode
->defenc
) {
215 Py_DECREF(unicode
->defenc
);
216 unicode
->defenc
= NULL
;
223 /* We allocate one more byte to make sure the string is
224 Ux0000 terminated -- XXX is this needed ?
226 XXX This allocator could further be enhanced by assuring that the
227 free list never reduces its size below 1.
232 PyUnicodeObject
*_PyUnicode_New(Py_ssize_t length
)
234 register PyUnicodeObject
*unicode
;
236 /* Optimization for empty strings */
237 if (length
== 0 && unicode_empty
!= NULL
) {
238 Py_INCREF(unicode_empty
);
239 return unicode_empty
;
242 /* Unicode freelist & memory allocation */
243 if (unicode_freelist
) {
244 unicode
= unicode_freelist
;
245 unicode_freelist
= *(PyUnicodeObject
**)unicode
;
246 unicode_freelist_size
--;
248 /* Keep-Alive optimization: we only upsize the buffer,
249 never downsize it. */
250 if ((unicode
->length
< length
) &&
251 unicode_resize(unicode
, length
) < 0) {
252 PyMem_DEL(unicode
->str
);
257 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
259 PyObject_INIT(unicode
, &PyUnicode_Type
);
262 unicode
= PyObject_New(PyUnicodeObject
, &PyUnicode_Type
);
265 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
272 /* Initialize the first element to guard against cases where
273 * the caller fails before initializing str -- unicode_resize()
274 * reads str[0], and the Keep-Alive optimization can keep memory
275 * allocated for str alive across a call to unicode_dealloc(unicode).
276 * We don't want unicode_resize to read uninitialized memory in
280 unicode
->str
[length
] = 0;
281 unicode
->length
= length
;
283 unicode
->defenc
= NULL
;
287 _Py_ForgetReference((PyObject
*)unicode
);
288 PyObject_Del(unicode
);
293 void unicode_dealloc(register PyUnicodeObject
*unicode
)
295 if (PyUnicode_CheckExact(unicode
) &&
296 unicode_freelist_size
< MAX_UNICODE_FREELIST_SIZE
) {
297 /* Keep-Alive optimization */
298 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
299 PyMem_DEL(unicode
->str
);
303 if (unicode
->defenc
) {
304 Py_DECREF(unicode
->defenc
);
305 unicode
->defenc
= NULL
;
307 /* Add to free list */
308 *(PyUnicodeObject
**)unicode
= unicode_freelist
;
309 unicode_freelist
= unicode
;
310 unicode_freelist_size
++;
313 PyMem_DEL(unicode
->str
);
314 Py_XDECREF(unicode
->defenc
);
315 Py_Type(unicode
)->tp_free((PyObject
*)unicode
);
319 int PyUnicode_Resize(PyObject
**unicode
, Py_ssize_t length
)
321 register PyUnicodeObject
*v
;
323 /* Argument checks */
324 if (unicode
== NULL
) {
325 PyErr_BadInternalCall();
328 v
= (PyUnicodeObject
*)*unicode
;
329 if (v
== NULL
|| !PyUnicode_Check(v
) || Py_Refcnt(v
) != 1 || length
< 0) {
330 PyErr_BadInternalCall();
334 /* Resizing unicode_empty and single character objects is not
335 possible since these are being shared. We simply return a fresh
336 copy with the same Unicode content. */
337 if (v
->length
!= length
&&
338 (v
== unicode_empty
|| v
->length
== 1)) {
339 PyUnicodeObject
*w
= _PyUnicode_New(length
);
342 Py_UNICODE_COPY(w
->str
, v
->str
,
343 length
< v
->length
? length
: v
->length
);
345 *unicode
= (PyObject
*)w
;
349 /* Note that we don't have to modify *unicode for unshared Unicode
350 objects, since we can modify them in-place. */
351 return unicode_resize(v
, length
);
354 /* Internal API for use in unicodeobject.c only ! */
355 #define _PyUnicode_Resize(unicodevar, length) \
356 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
358 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
361 PyUnicodeObject
*unicode
;
363 /* If the Unicode data is known at construction time, we can apply
364 some optimizations which share commonly used objects. */
367 /* Optimization for empty strings */
368 if (size
== 0 && unicode_empty
!= NULL
) {
369 Py_INCREF(unicode_empty
);
370 return (PyObject
*)unicode_empty
;
373 /* Single character Unicode objects in the Latin-1 range are
374 shared when using this constructor */
375 if (size
== 1 && *u
< 256) {
376 unicode
= unicode_latin1
[*u
];
378 unicode
= _PyUnicode_New(1);
381 unicode
->str
[0] = *u
;
382 unicode_latin1
[*u
] = unicode
;
385 return (PyObject
*)unicode
;
389 unicode
= _PyUnicode_New(size
);
393 /* Copy the Unicode data into the new object */
395 Py_UNICODE_COPY(unicode
->str
, u
, size
);
397 return (PyObject
*)unicode
;
402 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
405 PyUnicodeObject
*unicode
;
408 PyErr_BadInternalCall();
412 unicode
= _PyUnicode_New(size
);
416 /* Copy the wchar_t data into the new object */
417 #ifdef HAVE_USABLE_WCHAR_T
418 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
421 register Py_UNICODE
*u
;
422 register Py_ssize_t i
;
423 u
= PyUnicode_AS_UNICODE(unicode
);
424 for (i
= size
; i
> 0; i
--)
429 return (PyObject
*)unicode
;
432 Py_ssize_t
PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
436 if (unicode
== NULL
) {
437 PyErr_BadInternalCall();
441 /* If possible, try to copy the 0-termination as well */
442 if (size
> PyUnicode_GET_SIZE(unicode
))
443 size
= PyUnicode_GET_SIZE(unicode
) + 1;
445 #ifdef HAVE_USABLE_WCHAR_T
446 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
449 register Py_UNICODE
*u
;
450 register Py_ssize_t i
;
451 u
= PyUnicode_AS_UNICODE(unicode
);
452 for (i
= size
; i
> 0; i
--)
457 if (size
> PyUnicode_GET_SIZE(unicode
))
458 return PyUnicode_GET_SIZE(unicode
);
465 PyObject
*PyUnicode_FromOrdinal(int ordinal
)
469 #ifdef Py_UNICODE_WIDE
470 if (ordinal
< 0 || ordinal
> 0x10ffff) {
471 PyErr_SetString(PyExc_ValueError
,
472 "unichr() arg not in range(0x110000) "
473 "(wide Python build)");
477 if (ordinal
< 0 || ordinal
> 0xffff) {
478 PyErr_SetString(PyExc_ValueError
,
479 "unichr() arg not in range(0x10000) "
480 "(narrow Python build)");
485 s
[0] = (Py_UNICODE
)ordinal
;
486 return PyUnicode_FromUnicode(s
, 1);
489 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
491 /* XXX Perhaps we should make this API an alias of
492 PyObject_Unicode() instead ?! */
493 if (PyUnicode_CheckExact(obj
)) {
497 if (PyUnicode_Check(obj
)) {
498 /* For a Unicode subtype that's not a Unicode object,
499 return a true Unicode object with the same data. */
500 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
501 PyUnicode_GET_SIZE(obj
));
503 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
506 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
507 const char *encoding
,
510 const char *s
= NULL
;
515 PyErr_BadInternalCall();
520 /* For b/w compatibility we also accept Unicode objects provided
521 that no encodings is given and then redirect to
522 PyObject_Unicode() which then applies the additional logic for
525 NOTE: This API should really only be used for object which
526 represent *encoded* Unicode !
529 if (PyUnicode_Check(obj
)) {
531 PyErr_SetString(PyExc_TypeError
,
532 "decoding Unicode is not supported");
535 return PyObject_Unicode(obj
);
538 if (PyUnicode_Check(obj
)) {
539 PyErr_SetString(PyExc_TypeError
,
540 "decoding Unicode is not supported");
546 if (PyString_Check(obj
)) {
547 s
= PyString_AS_STRING(obj
);
548 len
= PyString_GET_SIZE(obj
);
550 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
551 /* Overwrite the error message with something more useful in
552 case of a TypeError. */
553 if (PyErr_ExceptionMatches(PyExc_TypeError
))
554 PyErr_Format(PyExc_TypeError
,
555 "coercing to Unicode: need string or buffer, "
557 Py_Type(obj
)->tp_name
);
561 /* Convert to Unicode */
563 Py_INCREF(unicode_empty
);
564 v
= (PyObject
*)unicode_empty
;
567 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
575 PyObject
*PyUnicode_Decode(const char *s
,
577 const char *encoding
,
580 PyObject
*buffer
= NULL
, *unicode
;
582 if (encoding
== NULL
)
583 encoding
= PyUnicode_GetDefaultEncoding();
585 /* Shortcuts for common default encodings */
586 if (strcmp(encoding
, "utf-8") == 0)
587 return PyUnicode_DecodeUTF8(s
, size
, errors
);
588 else if (strcmp(encoding
, "latin-1") == 0)
589 return PyUnicode_DecodeLatin1(s
, size
, errors
);
590 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
591 else if (strcmp(encoding
, "mbcs") == 0)
592 return PyUnicode_DecodeMBCS(s
, size
, errors
);
594 else if (strcmp(encoding
, "ascii") == 0)
595 return PyUnicode_DecodeASCII(s
, size
, errors
);
597 /* Decode via the codec registry */
598 buffer
= PyBuffer_FromMemory((void *)s
, size
);
601 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
604 if (!PyUnicode_Check(unicode
)) {
605 PyErr_Format(PyExc_TypeError
,
606 "decoder did not return an unicode object (type=%.400s)",
607 Py_Type(unicode
)->tp_name
);
619 PyObject
*PyUnicode_AsDecodedObject(PyObject
*unicode
,
620 const char *encoding
,
625 if (!PyUnicode_Check(unicode
)) {
630 if (encoding
== NULL
)
631 encoding
= PyUnicode_GetDefaultEncoding();
633 /* Decode via the codec registry */
634 v
= PyCodec_Decode(unicode
, encoding
, errors
);
643 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
645 const char *encoding
,
648 PyObject
*v
, *unicode
;
650 unicode
= PyUnicode_FromUnicode(s
, size
);
653 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
658 PyObject
*PyUnicode_AsEncodedObject(PyObject
*unicode
,
659 const char *encoding
,
664 if (!PyUnicode_Check(unicode
)) {
669 if (encoding
== NULL
)
670 encoding
= PyUnicode_GetDefaultEncoding();
672 /* Encode via the codec registry */
673 v
= PyCodec_Encode(unicode
, encoding
, errors
);
682 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
683 const char *encoding
,
688 if (!PyUnicode_Check(unicode
)) {
693 if (encoding
== NULL
)
694 encoding
= PyUnicode_GetDefaultEncoding();
696 /* Shortcuts for common default encodings */
697 if (errors
== NULL
) {
698 if (strcmp(encoding
, "utf-8") == 0)
699 return PyUnicode_AsUTF8String(unicode
);
700 else if (strcmp(encoding
, "latin-1") == 0)
701 return PyUnicode_AsLatin1String(unicode
);
702 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
703 else if (strcmp(encoding
, "mbcs") == 0)
704 return PyUnicode_AsMBCSString(unicode
);
706 else if (strcmp(encoding
, "ascii") == 0)
707 return PyUnicode_AsASCIIString(unicode
);
710 /* Encode via the codec registry */
711 v
= PyCodec_Encode(unicode
, encoding
, errors
);
714 if (!PyString_Check(v
)) {
715 PyErr_Format(PyExc_TypeError
,
716 "encoder did not return a string object (type=%.400s)",
717 Py_Type(v
)->tp_name
);
727 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
730 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
734 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
735 if (v
&& errors
== NULL
)
736 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
740 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
742 if (!PyUnicode_Check(unicode
)) {
746 return PyUnicode_AS_UNICODE(unicode
);
752 Py_ssize_t
PyUnicode_GetSize(PyObject
*unicode
)
754 if (!PyUnicode_Check(unicode
)) {
758 return PyUnicode_GET_SIZE(unicode
);
764 const char *PyUnicode_GetDefaultEncoding(void)
766 return unicode_default_encoding
;
769 int PyUnicode_SetDefaultEncoding(const char *encoding
)
773 /* Make sure the encoding is valid. As side effect, this also
774 loads the encoding into the codec registry cache. */
775 v
= _PyCodec_Lookup(encoding
);
779 strncpy(unicode_default_encoding
,
781 sizeof(unicode_default_encoding
));
788 /* error handling callback helper:
789 build arguments, call the callback and check the arguments,
790 if no exception occurred, copy the replacement to the output
791 and adjust various state variables.
792 return 0 on success, -1 on error
796 int unicode_decode_call_errorhandler(const char *errors
, PyObject
**errorHandler
,
797 const char *encoding
, const char *reason
,
798 const char *input
, Py_ssize_t insize
, Py_ssize_t
*startinpos
,
799 Py_ssize_t
*endinpos
, PyObject
**exceptionObject
, const char **inptr
,
800 PyObject
**output
, Py_ssize_t
*outpos
, Py_UNICODE
**outptr
)
802 static char *argparse
= "O!n;decoding error handler must return (unicode, int) tuple";
804 PyObject
*restuple
= NULL
;
805 PyObject
*repunicode
= NULL
;
806 Py_ssize_t outsize
= PyUnicode_GET_SIZE(*output
);
807 Py_ssize_t requiredsize
;
813 if (*errorHandler
== NULL
) {
814 *errorHandler
= PyCodec_LookupError(errors
);
815 if (*errorHandler
== NULL
)
819 if (*exceptionObject
== NULL
) {
820 *exceptionObject
= PyUnicodeDecodeError_Create(
821 encoding
, input
, insize
, *startinpos
, *endinpos
, reason
);
822 if (*exceptionObject
== NULL
)
826 if (PyUnicodeDecodeError_SetStart(*exceptionObject
, *startinpos
))
828 if (PyUnicodeDecodeError_SetEnd(*exceptionObject
, *endinpos
))
830 if (PyUnicodeDecodeError_SetReason(*exceptionObject
, reason
))
834 restuple
= PyObject_CallFunctionObjArgs(*errorHandler
, *exceptionObject
, NULL
);
835 if (restuple
== NULL
)
837 if (!PyTuple_Check(restuple
)) {
838 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
841 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
, &repunicode
, &newpos
))
844 newpos
= insize
+newpos
;
845 if (newpos
<0 || newpos
>insize
) {
846 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", newpos
);
850 /* need more space? (at least enough for what we
851 have+the replacement+the rest of the string (starting
852 at the new input position), so we won't have to check space
853 when there are no errors in the rest of the string) */
854 repptr
= PyUnicode_AS_UNICODE(repunicode
);
855 repsize
= PyUnicode_GET_SIZE(repunicode
);
856 requiredsize
= *outpos
+ repsize
+ insize
-newpos
;
857 if (requiredsize
> outsize
) {
858 if (requiredsize
<2*outsize
)
859 requiredsize
= 2*outsize
;
860 if (PyUnicode_Resize(output
, requiredsize
) < 0)
862 *outptr
= PyUnicode_AS_UNICODE(*output
) + *outpos
;
865 *inptr
= input
+ newpos
;
866 Py_UNICODE_COPY(*outptr
, repptr
, repsize
);
873 Py_XDECREF(restuple
);
877 /* --- UTF-7 Codec -------------------------------------------------------- */
879 /* see RFC2152 for details */
/* Classification of the 128 ASCII code points for UTF-7.  The table is
   only ever read, so it is const-qualified. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};

/* True when c must be base64-encoded.  encodeO and encodeWS select whether
   the optional Set O and whitespace classes count as special.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))
/* Base64 digit for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when c is a base64 digit.  Explicit ASCII range checks are used
   instead of isalnum(): the argument is a Py_UNICODE, which may lie outside
   the unsigned char range isalnum() accepts, and the answer must not depend
   on the current locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')

/* Six-bit value of the base64 digit c (c must satisfy B64CHAR). */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 digits to out while at least six bits are pending in ch;
   bits is updated in place.  Wrapped in do/while(0) so that the macro
   behaves as a single statement. */
#define ENCODE(out, ch, bits)                           \
    do {                                                \
        while ((bits) >= 6) {                           \
            *(out)++ = B64((ch) >> ((bits)-6));         \
            (bits) -= 6;                                \
        }                                               \
    } while (0)

/* Emit 16-bit units to out while at least sixteen bits are pending in ch.
   Jumps to the caller's utf7Error label (setting the caller's errmsg) when
   a unit in the range DC00..DFFF is found. */
#define DECODE(out, ch, bits, surrogate)                                \
    do {                                                                \
        while ((bits) >= 16) {                                          \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16;                                               \
            if (surrogate) {                                            \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0;                                        \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {            \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */                          \
                (surrogate) = 1;                                        \
                errmsg = "code pairs are not supported";                \
                goto utf7Error;                                         \
            } else {                                                    \
                *(out)++ = outCh;                                       \
            }                                                           \
        }                                                               \
    } while (0)
/* Decode UTF-7 encoded bytes starting at s into a Unicode object.

   As far as the visible lines show: the result is pre-allocated with
   _PyUnicode_New(size) and trimmed at the end with _PyUnicode_Resize() to
   the number of characters actually written.  Inside a shift sequence each
   base64 digit adds six bits to charsleft, and DECODE() moves completed
   16-bit units to the output.  Errors ("partial character in shift
   sequence", "non-zero padding bits in shift sequence", "unexpected
   special character", "unterminated shift sequence") are routed through
   unicode_decode_call_errorhandler(), which may redirect s and grow the
   output; a non-zero result from it aborts decoding.

   NOTE(review): the remaining parameters (size and errors are used below)
   and several statements are on lines that are not part of this listing --
   confirm against the complete file before relying on details. */
943 PyObject
*PyUnicode_DecodeUTF7(const char *s
,
947 const char *starts
= s
;
948 Py_ssize_t startinpos
;
952 PyUnicodeObject
*unicode
;
954 const char *errmsg
= "";
956 unsigned int bitsleft
= 0;
957 unsigned long charsleft
= 0;
959 PyObject
*errorHandler
= NULL
;
960 PyObject
*exc
= NULL
;
962 unicode
= _PyUnicode_New(size
);
966 return (PyObject
*)unicode
;
977 if ((ch
== '-') || !B64CHAR(ch
)) {
981 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
983 /* The shift sequence has a partial character in it. If
984 bitsleft < 6 then we could just classify it as padding
985 but that is not the case here */
987 errmsg
= "partial character in shift sequence";
990 /* According to RFC2152 the remaining bits should be zero. We
991 choose to signal an error/insert a replacement character
992 here so indicate the potential of a misencoded character. */
994 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
995 if (bitsleft
&& charsleft
<< (sizeof(charsleft
) * 8 - bitsleft
)) {
996 errmsg
= "non-zero padding bits in shift sequence";
1001 if ((s
< e
) && (*(s
) == '-')) {
1005 } else if (SPECIAL(ch
,0,0)) {
1006 errmsg
= "unexpected special character";
1012 charsleft
= (charsleft
<< 6) | UB64(ch
);
1015 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
1018 else if ( ch
== '+' ) {
1019 startinpos
= s
-starts
;
1021 if (s
< e
&& *s
== '-') {
1030 else if (SPECIAL(ch
,0,0)) {
1031 startinpos
= s
-starts
;
1032 errmsg
= "unexpected special character";
1042 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1043 endinpos
= s
-starts
;
1044 if (unicode_decode_call_errorhandler(
1045 errors
, &errorHandler
,
1047 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1048 (PyObject
**)&unicode
, &outpos
, &p
))
1053 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1055 if (unicode_decode_call_errorhandler(
1056 errors
, &errorHandler
,
1057 "utf7", "unterminated shift sequence",
1058 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1059 (PyObject
**)&unicode
, &outpos
, &p
))
1065 if (_PyUnicode_Resize(&unicode
, p
- PyUnicode_AS_UNICODE(unicode
)) < 0)
1068 Py_XDECREF(errorHandler
);
1070 return (PyObject
*)unicode
;
1073 Py_XDECREF(errorHandler
);
1080 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
1083 int encodeWhiteSpace
,
1087 /* It might be possible to tighten this worst case */
1088 Py_ssize_t cbAllocated
= 5 * size
;
1091 unsigned int bitsleft
= 0;
1092 unsigned long charsleft
= 0;
1097 return PyString_FromStringAndSize(NULL
, 0);
1099 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
1103 start
= out
= PyString_AS_STRING(v
);
1104 for (;i
< size
; ++i
) {
1105 Py_UNICODE ch
= s
[i
];
1111 } else if (SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
1115 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
1116 inShift
= bitsleft
> 0;
1121 if (!SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
1122 *out
++ = B64(charsleft
<< (6-bitsleft
));
1125 /* Characters not in the BASE64 set implicitly unshift the sequence
1126 so no '-' is required, except if the character is itself a '-' */
1127 if (B64CHAR(ch
) || ch
== '-') {
1134 charsleft
= (charsleft
<< 16) | ch
;
1135 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
1137 /* If the next character is special then we dont' need to terminate
1138 the shift sequence. If the next character is not a BASE64 character
1139 or '-' then the shift sequence will be terminated implicitly and we
1140 don't have to insert a '-'. */
1142 if (bitsleft
== 0) {
1144 Py_UNICODE ch2
= s
[i
+1];
1146 if (SPECIAL(ch2
, encodeSetO
, encodeWhiteSpace
)) {
1148 } else if (B64CHAR(ch2
) || ch2
== '-') {
1165 *out
++= B64(charsleft
<< (6-bitsleft
) );
1169 _PyString_Resize(&v
, out
- start
);
1180 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Sequence length for every possible UTF-8 lead byte.  The table is only
   ever read, so it is const-qualified. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1204 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1208 return PyUnicode_DecodeUTF8Stateful(s
, size
, errors
, NULL
);
/* Decode UTF-8 encoded bytes starting at s into a Unicode object.

   As far as the visible lines show: the result is pre-allocated with
   _PyUnicode_New(size) and trimmed at the end with _PyUnicode_Resize() to
   the number of characters written.  The sequence length n of each
   character comes from utf8_code_length[] indexed by the lead byte.  For
   2-, 3- and 4-byte sequences every continuation byte must match
   (byte & 0xc0) == 0x80, otherwise "invalid data" is reported.  Four-byte
   results outside 0x10000..0x10ffff are reported as "illegal encoding"; on
   builds without Py_UNICODE_WIDE an accepted value is written as a
   surrogate pair (0xD800 + high ten bits, 0xDC00 + low ten bits).  Longer
   sequences are reported as "unsupported Unicode code range".  Errors are
   routed through unicode_decode_call_errorhandler(), which may redirect s
   and grow the output.  *consumed receives s - starts.

   NOTE(review): the size and errors parameters and several statements are
   on lines that are not part of this listing -- confirm against the
   complete file before relying on details. */
1211 PyObject
*PyUnicode_DecodeUTF8Stateful(const char *s
,
1214 Py_ssize_t
*consumed
)
1216 const char *starts
= s
;
1218 Py_ssize_t startinpos
;
1219 Py_ssize_t endinpos
;
1222 PyUnicodeObject
*unicode
;
1224 const char *errmsg
= "";
1225 PyObject
*errorHandler
= NULL
;
1226 PyObject
*exc
= NULL
;
1228 /* Note: size will always be longer than the resulting Unicode
1230 unicode
= _PyUnicode_New(size
);
1236 return (PyObject
*)unicode
;
1239 /* Unpack UTF-8 encoded data */
1244 Py_UCS4 ch
= (unsigned char)*s
;
1247 *p
++ = (Py_UNICODE
)ch
;
1252 n
= utf8_code_length
[ch
];
1258 errmsg
= "unexpected end of data";
1259 startinpos
= s
-starts
;
1268 errmsg
= "unexpected code byte";
1269 startinpos
= s
-starts
;
1270 endinpos
= startinpos
+1;
1274 errmsg
= "internal error";
1275 startinpos
= s
-starts
;
1276 endinpos
= startinpos
+1;
1280 if ((s
[1] & 0xc0) != 0x80) {
1281 errmsg
= "invalid data";
1282 startinpos
= s
-starts
;
1283 endinpos
= startinpos
+2;
1286 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
1288 startinpos
= s
-starts
;
1289 endinpos
= startinpos
+2;
1290 errmsg
= "illegal encoding";
1294 *p
++ = (Py_UNICODE
)ch
;
1298 if ((s
[1] & 0xc0) != 0x80 ||
1299 (s
[2] & 0xc0) != 0x80) {
1300 errmsg
= "invalid data";
1301 startinpos
= s
-starts
;
1302 endinpos
= startinpos
+3;
1305 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
1307 /* Note: UTF-8 encodings of surrogates are considered
1308 legal UTF-8 sequences;
1310 XXX For wide builds (UCS-4) we should probably try
1311 to recombine the surrogates into a single code
1314 errmsg
= "illegal encoding";
1315 startinpos
= s
-starts
;
1316 endinpos
= startinpos
+3;
1320 *p
++ = (Py_UNICODE
)ch
;
1324 if ((s
[1] & 0xc0) != 0x80 ||
1325 (s
[2] & 0xc0) != 0x80 ||
1326 (s
[3] & 0xc0) != 0x80) {
1327 errmsg
= "invalid data";
1328 startinpos
= s
-starts
;
1329 endinpos
= startinpos
+4;
1332 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
1333 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
1334 /* validate and convert to UTF-16 */
1335 if ((ch
< 0x10000) /* minimum value allowed for 4
1337 || (ch
> 0x10ffff)) /* maximum value allowed for
1340 errmsg
= "illegal encoding";
1341 startinpos
= s
-starts
;
1342 endinpos
= startinpos
+4;
1345 #ifdef Py_UNICODE_WIDE
1346 *p
++ = (Py_UNICODE
)ch
;
1348 /* compute and append the two surrogates: */
1350 /* translate from 10000..10FFFF to 0..FFFF */
1353 /* high surrogate = top 10 bits added to D800 */
1354 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
1356 /* low surrogate = bottom 10 bits added to DC00 */
1357 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
1362 /* Other sizes are only needed for UCS-4 */
1363 errmsg
= "unsupported Unicode code range";
1364 startinpos
= s
-starts
;
1365 endinpos
= startinpos
+n
;
1372 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1373 if (unicode_decode_call_errorhandler(
1374 errors
, &errorHandler
,
1376 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1377 (PyObject
**)&unicode
, &outpos
, &p
))
1381 *consumed
= s
-starts
;
1384 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
1387 Py_XDECREF(errorHandler
);
1389 return (PyObject
*)unicode
;
1392 Py_XDECREF(errorHandler
);
1398 /* Allocation strategy: if the string is short, convert into a stack buffer
1399 and allocate exactly as much space needed at the end. Else allocate the
1400 maximum possible needed (4 result bytes per Unicode character), and return
1401 the excess memory at the end.
1404 PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
1408 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1410 Py_ssize_t i
; /* index into s of next input byte */
1411 PyObject
*v
; /* result string object */
1412 char *p
; /* next free byte in output buffer */
1413 Py_ssize_t nallocated
; /* number of result bytes allocated */
1414 Py_ssize_t nneeded
; /* number of result bytes needed */
1415 char stackbuf
[MAX_SHORT_UNICHARS
* 4];
1420 if (size
<= MAX_SHORT_UNICHARS
) {
1421 /* Write into the stack buffer; nallocated can't overflow.
1422 * At the end, we'll allocate exactly as much heap space as it
1423 * turns out we need.
1425 nallocated
= Py_SAFE_DOWNCAST(sizeof(stackbuf
), size_t, int);
1426 v
= NULL
; /* will allocate after we're done */
1430 /* Overallocate on the heap, and give the excess back at the end. */
1431 nallocated
= size
* 4;
1432 if (nallocated
/ 4 != size
) /* overflow! */
1433 return PyErr_NoMemory();
1434 v
= PyString_FromStringAndSize(NULL
, nallocated
);
1437 p
= PyString_AS_STRING(v
);
1440 for (i
= 0; i
< size
;) {
1441 Py_UCS4 ch
= s
[i
++];
1447 else if (ch
< 0x0800) {
1448 /* Encode Latin-1 */
1449 *p
++ = (char)(0xc0 | (ch
>> 6));
1450 *p
++ = (char)(0x80 | (ch
& 0x3f));
1453 /* Encode UCS2 Unicode ordinals */
1455 /* Special case: check for high surrogate */
1456 if (0xD800 <= ch
&& ch
<= 0xDBFF && i
!= size
) {
1458 /* Check for low surrogate and combine the two to
1459 form a UCS4 value */
1460 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1461 ch
= ((ch
- 0xD800) << 10 | (ch2
- 0xDC00)) + 0x10000;
1465 /* Fall through: handles isolated high surrogates */
1467 *p
++ = (char)(0xe0 | (ch
>> 12));
1468 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
1469 *p
++ = (char)(0x80 | (ch
& 0x3f));
1473 /* Encode UCS4 Unicode ordinals */
1474 *p
++ = (char)(0xf0 | (ch
>> 18));
1475 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
1476 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
1477 *p
++ = (char)(0x80 | (ch
& 0x3f));
1482 /* This was stack allocated. */
1483 nneeded
= p
- stackbuf
;
1484 assert(nneeded
<= nallocated
);
1485 v
= PyString_FromStringAndSize(stackbuf
, nneeded
);
1488 /* Cut back to size actually needed. */
1489 nneeded
= p
- PyString_AS_STRING(v
);
1490 assert(nneeded
<= nallocated
);
1491 _PyString_Resize(&v
, nneeded
);
1495 #undef MAX_SHORT_UNICHARS
1498 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
1500 if (!PyUnicode_Check(unicode
)) {
1501 PyErr_BadArgument();
1504 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
1505 PyUnicode_GET_SIZE(unicode
),
1509 /* --- UTF-32 Codec ------------------------------------------------------- */
1512 PyUnicode_DecodeUTF32(const char *s
,
1517 return PyUnicode_DecodeUTF32Stateful(s
, size
, errors
, byteorder
, NULL
);
1521 PyUnicode_DecodeUTF32Stateful(const char *s
,
1525 Py_ssize_t
*consumed
)
1527 const char *starts
= s
;
1528 Py_ssize_t startinpos
;
1529 Py_ssize_t endinpos
;
1531 PyUnicodeObject
*unicode
;
1533 #ifndef Py_UNICODE_WIDE
1536 const int pairs
= 0;
1538 const unsigned char *q
, *e
;
1539 int bo
= 0; /* assume native ordering by default */
1540 const char *errmsg
= "";
1541 /* Offsets from q for retrieving bytes in the right order. */
1542 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1543 int iorder
[] = {0, 1, 2, 3};
1545 int iorder
[] = {3, 2, 1, 0};
1547 PyObject
*errorHandler
= NULL
;
1548 PyObject
*exc
= NULL
;
1549 /* On narrow builds we split characters outside the BMP into two
1550 codepoints => count how much extra space we need. */
1551 #ifndef Py_UNICODE_WIDE
1552 for (i
= pairs
= 0; i
< size
/4; i
++)
1553 if (((Py_UCS4
*)s
)[i
] >= 0x10000)
1557 /* This might be one too many, because of a BOM */
1558 unicode
= _PyUnicode_New((size
+3)/4+pairs
);
1562 return (PyObject
*)unicode
;
1564 /* Unpack UTF-32 encoded data */
1566 q
= (unsigned char *)s
;
1572 /* Check for BOM marks (U+FEFF) in the input and adjust current
1573 byte order setting accordingly. In native mode, the leading BOM
1574 mark is skipped, in all other modes, it is copied to the output
1575 stream as-is (giving a ZWNBSP character). */
1578 const Py_UCS4 bom
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
1579 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
1580 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1581 if (bom
== 0x0000FEFF) {
1585 else if (bom
== 0xFFFE0000) {
1590 if (bom
== 0x0000FEFF) {
1594 else if (bom
== 0xFFFE0000) {
1619 /* remaining bytes at the end? (size should be divisible by 4) */
1623 errmsg
= "truncated data";
1624 startinpos
= ((const char *)q
)-starts
;
1625 endinpos
= ((const char *)e
)-starts
;
1627 /* The remaining input chars are ignored if the callback
1628 chooses to skip the input */
1630 ch
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
1631 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
1635 errmsg
= "codepoint not in range(0x110000)";
1636 startinpos
= ((const char *)q
)-starts
;
1637 endinpos
= startinpos
+4;
1640 #ifndef Py_UNICODE_WIDE
1643 *p
++ = 0xD800 | ((ch
-0x10000) >> 10);
1644 *p
++ = 0xDC00 | ((ch
-0x10000) & 0x3FF);
1652 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1653 if (unicode_decode_call_errorhandler(
1654 errors
, &errorHandler
,
1656 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1657 (PyObject
**)&unicode
, &outpos
, &p
))
1665 *consumed
= (const char *)q
-starts
;
1668 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
1671 Py_XDECREF(errorHandler
);
1673 return (PyObject
*)unicode
;
1677 Py_XDECREF(errorHandler
);
1683 PyUnicode_EncodeUTF32(const Py_UNICODE
*s
,
1690 #ifndef Py_UNICODE_WIDE
1693 const int pairs
= 0;
1695 /* Offsets from p for storing the four bytes in the right order. */
1696 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1697 int iorder
[] = {0, 1, 2, 3};
1699 int iorder
[] = {3, 2, 1, 0};
1702 #define STORECHAR(CH) \
1704 p[iorder[3]] = ((CH) >> 24) & 0xff; \
1705 p[iorder[2]] = ((CH) >> 16) & 0xff; \
1706 p[iorder[1]] = ((CH) >> 8) & 0xff; \
1707 p[iorder[0]] = (CH) & 0xff; \
1711 /* In narrow builds we can output surrogate pairs as one codepoint,
1712 so we need less space. */
1713 #ifndef Py_UNICODE_WIDE
1714 for (i
= pairs
= 0; i
< size
-1; i
++)
1715 if (0xD800 <= s
[i
] && s
[i
] <= 0xDBFF &&
1716 0xDC00 <= s
[i
+1] && s
[i
+1] <= 0xDFFF)
1719 v
= PyString_FromStringAndSize(NULL
,
1720 4 * (size
- pairs
+ (byteorder
== 0)));
1724 p
= (unsigned char *)PyString_AS_STRING(v
);
1730 if (byteorder
== -1) {
1737 else if (byteorder
== 1) {
1745 while (size
-- > 0) {
1747 #ifndef Py_UNICODE_WIDE
1748 if (0xD800 <= ch
&& ch
<= 0xDBFF && size
> 0) {
1750 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1751 ch
= (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
1763 PyObject
*PyUnicode_AsUTF32String(PyObject
*unicode
)
1765 if (!PyUnicode_Check(unicode
)) {
1766 PyErr_BadArgument();
1769 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode
),
1770 PyUnicode_GET_SIZE(unicode
),
1775 /* --- UTF-16 Codec ------------------------------------------------------- */
1778 PyUnicode_DecodeUTF16(const char *s
,
1783 return PyUnicode_DecodeUTF16Stateful(s
, size
, errors
, byteorder
, NULL
);
1787 PyUnicode_DecodeUTF16Stateful(const char *s
,
1791 Py_ssize_t
*consumed
)
1793 const char *starts
= s
;
1794 Py_ssize_t startinpos
;
1795 Py_ssize_t endinpos
;
1797 PyUnicodeObject
*unicode
;
1799 const unsigned char *q
, *e
;
1800 int bo
= 0; /* assume native ordering by default */
1801 const char *errmsg
= "";
1802 /* Offsets from q for retrieving byte pairs in the right order. */
1803 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1804 int ihi
= 1, ilo
= 0;
1806 int ihi
= 0, ilo
= 1;
1808 PyObject
*errorHandler
= NULL
;
1809 PyObject
*exc
= NULL
;
1811 /* Note: size will always be longer than the resulting Unicode
1813 unicode
= _PyUnicode_New(size
);
1817 return (PyObject
*)unicode
;
1819 /* Unpack UTF-16 encoded data */
1821 q
= (unsigned char *)s
;
1827 /* Check for BOM marks (U+FEFF) in the input and adjust current
1828 byte order setting accordingly. In native mode, the leading BOM
1829 mark is skipped, in all other modes, it is copied to the output
1830 stream as-is (giving a ZWNBSP character). */
1833 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
1834 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1835 if (bom
== 0xFEFF) {
1839 else if (bom
== 0xFFFE) {
1844 if (bom
== 0xFEFF) {
1848 else if (bom
== 0xFFFE) {
1869 /* remaining bytes at the end? (size should be even) */
1873 errmsg
= "truncated data";
1874 startinpos
= ((const char *)q
)-starts
;
1875 endinpos
= ((const char *)e
)-starts
;
1877 /* The remaining input chars are ignored if the callback
1878 chooses to skip the input */
1880 ch
= (q
[ihi
] << 8) | q
[ilo
];
1884 if (ch
< 0xD800 || ch
> 0xDFFF) {
1889 /* UTF-16 code pair: */
1891 errmsg
= "unexpected end of data";
1892 startinpos
= (((const char *)q
)-2)-starts
;
1893 endinpos
= ((const char *)e
)-starts
;
1896 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
1897 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
1899 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1900 #ifndef Py_UNICODE_WIDE
1904 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
1909 errmsg
= "illegal UTF-16 surrogate";
1910 startinpos
= (((const char *)q
)-4)-starts
;
1911 endinpos
= startinpos
+2;
1916 errmsg
= "illegal encoding";
1917 startinpos
= (((const char *)q
)-2)-starts
;
1918 endinpos
= startinpos
+2;
1919 /* Fall through to report the error */
1922 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1923 if (unicode_decode_call_errorhandler(
1924 errors
, &errorHandler
,
1926 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
1927 (PyObject
**)&unicode
, &outpos
, &p
))
1935 *consumed
= (const char *)q
-starts
;
1938 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
1941 Py_XDECREF(errorHandler
);
1943 return (PyObject
*)unicode
;
1947 Py_XDECREF(errorHandler
);
1953 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
1960 #ifdef Py_UNICODE_WIDE
1963 const int pairs
= 0;
1965 /* Offsets from p for storing byte pairs in the right order. */
1966 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1967 int ihi
= 1, ilo
= 0;
1969 int ihi
= 0, ilo
= 1;
1972 #define STORECHAR(CH) \
1974 p[ihi] = ((CH) >> 8) & 0xff; \
1975 p[ilo] = (CH) & 0xff; \
1979 #ifdef Py_UNICODE_WIDE
1980 for (i
= pairs
= 0; i
< size
; i
++)
1981 if (s
[i
] >= 0x10000)
1984 v
= PyString_FromStringAndSize(NULL
,
1985 2 * (size
+ pairs
+ (byteorder
== 0)));
1989 p
= (unsigned char *)PyString_AS_STRING(v
);
1995 if (byteorder
== -1) {
2000 else if (byteorder
== 1) {
2006 while (size
-- > 0) {
2007 Py_UNICODE ch
= *s
++;
2009 #ifdef Py_UNICODE_WIDE
2010 if (ch
>= 0x10000) {
2011 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
2012 ch
= 0xD800 | ((ch
-0x10000) >> 10);
2023 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
2025 if (!PyUnicode_Check(unicode
)) {
2026 PyErr_BadArgument();
2029 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
2030 PyUnicode_GET_SIZE(unicode
),
2035 /* --- Unicode Escape Codec ----------------------------------------------- */
2037 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
2039 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
2043 const char *starts
= s
;
2044 Py_ssize_t startinpos
;
2045 Py_ssize_t endinpos
;
2052 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
2053 PyObject
*errorHandler
= NULL
;
2054 PyObject
*exc
= NULL
;
2056 /* Escaped strings will always be longer than the resulting
2057 Unicode string, so we start with size here and then reduce the
2058 length after conversion to the true value.
2059 (but if the error callback returns a long replacement string
2060 we'll have to allocate more space) */
2061 v
= _PyUnicode_New(size
);
2065 return (PyObject
*)v
;
2067 p
= PyUnicode_AS_UNICODE(v
);
2075 /* Non-escape characters are interpreted as Unicode ordinals */
2077 *p
++ = (unsigned char) *s
++;
2081 startinpos
= s
-starts
;
2088 case '\\': *p
++ = '\\'; break;
2089 case '\'': *p
++ = '\''; break;
2090 case '\"': *p
++ = '\"'; break;
2091 case 'b': *p
++ = '\b'; break;
2092 case 'f': *p
++ = '\014'; break; /* FF */
2093 case 't': *p
++ = '\t'; break;
2094 case 'n': *p
++ = '\n'; break;
2095 case 'r': *p
++ = '\r'; break;
2096 case 'v': *p
++ = '\013'; break; /* VT */
2097 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
2099 /* \OOO (octal) escapes */
2100 case '0': case '1': case '2': case '3':
2101 case '4': case '5': case '6': case '7':
2103 if ('0' <= *s
&& *s
<= '7') {
2104 x
= (x
<<3) + *s
++ - '0';
2105 if ('0' <= *s
&& *s
<= '7')
2106 x
= (x
<<3) + *s
++ - '0';
2115 message
= "truncated \\xXX escape";
2121 message
= "truncated \\uXXXX escape";
2127 message
= "truncated \\UXXXXXXXX escape";
2130 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2133 if (unicode_decode_call_errorhandler(
2134 errors
, &errorHandler
,
2135 "unicodeescape", "end of string in escape sequence",
2136 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2137 (PyObject
**)&v
, &outpos
, &p
))
2141 for (i
= 0; i
< digits
; ++i
) {
2142 c
= (unsigned char) s
[i
];
2144 endinpos
= (s
+i
+1)-starts
;
2145 if (unicode_decode_call_errorhandler(
2146 errors
, &errorHandler
,
2147 "unicodeescape", message
,
2148 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2149 (PyObject
**)&v
, &outpos
, &p
))
2153 chr
= (chr
<<4) & ~0xF;
2154 if (c
>= '0' && c
<= '9')
2156 else if (c
>= 'a' && c
<= 'f')
2157 chr
+= 10 + c
- 'a';
2159 chr
+= 10 + c
- 'A';
2162 if (chr
== 0xffffffff && PyErr_Occurred())
2163 /* _decoding_error will have already written into the
2167 /* when we get here, chr is a 32-bit unicode character */
2169 /* UCS-2 character */
2170 *p
++ = (Py_UNICODE
) chr
;
2171 else if (chr
<= 0x10ffff) {
2172 /* UCS-4 character. Either store directly, or as
2174 #ifdef Py_UNICODE_WIDE
2178 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
2179 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
2182 endinpos
= s
-starts
;
2183 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2184 if (unicode_decode_call_errorhandler(
2185 errors
, &errorHandler
,
2186 "unicodeescape", "illegal Unicode character",
2187 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2188 (PyObject
**)&v
, &outpos
, &p
))
2195 message
= "malformed \\N character escape";
2196 if (ucnhash_CAPI
== NULL
) {
2197 /* load the unicode data module */
2199 m
= PyImport_ImportModule("unicodedata");
2202 api
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
2206 ucnhash_CAPI
= (_PyUnicode_Name_CAPI
*)PyCObject_AsVoidPtr(api
);
2208 if (ucnhash_CAPI
== NULL
)
2212 const char *start
= s
+1;
2213 /* look for the closing brace */
2214 while (*s
!= '}' && s
< end
)
2216 if (s
> start
&& s
< end
&& *s
== '}') {
2217 /* found a name. look it up in the unicode database */
2218 message
= "unknown Unicode character name";
2220 if (ucnhash_CAPI
->getcode(NULL
, start
, (int)(s
-start
-1), &chr
))
2224 endinpos
= s
-starts
;
2225 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2226 if (unicode_decode_call_errorhandler(
2227 errors
, &errorHandler
,
2228 "unicodeescape", message
,
2229 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2230 (PyObject
**)&v
, &outpos
, &p
))
2236 message
= "\\ at end of string";
2238 endinpos
= s
-starts
;
2239 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2240 if (unicode_decode_call_errorhandler(
2241 errors
, &errorHandler
,
2242 "unicodeescape", message
,
2243 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2244 (PyObject
**)&v
, &outpos
, &p
))
2249 *p
++ = (unsigned char)s
[-1];
2256 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2258 Py_XDECREF(errorHandler
);
2260 return (PyObject
*)v
;
2265 "\\N escapes not supported (can't load unicodedata module)"
2268 Py_XDECREF(errorHandler
);
2274 Py_XDECREF(errorHandler
);
2279 /* Return a Unicode-Escape string version of the Unicode object.
2281 If quotes is true, the string is enclosed in u"" or u'' quotes as
2286 Py_LOCAL_INLINE(const Py_UNICODE
*) findchar(const Py_UNICODE
*s
,
2290 /* like wcschr, but doesn't stop at NUL characters */
2292 while (size
-- > 0) {
2302 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
2309 static const char *hexdigit
= "0123456789abcdef";
2311 /* XXX(nnorwitz): rather than over-allocating, it would be
2312 better to choose a different scheme. Perhaps scan the
2313 first N-chars of the string and allocate based on that size.
2315 /* Initial allocation is based on the longest-possible unichr
2318 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2319 unichr, so in this case it's the longest unichr escape. In
2320 narrow (UTF-16) builds this is five chars per source unichr
2321 since there are two unichrs in the surrogate pair, so in narrow
2322 (UTF-16) builds it's not the longest unichr escape.
2324 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2325 so in the narrow (UTF-16) build case it's the longest unichr
2329 repr
= PyString_FromStringAndSize(NULL
,
2331 #ifdef Py_UNICODE_WIDE
2340 p
= PyString_AS_STRING(repr
);
2344 *p
++ = (findchar(s
, size
, '\'') &&
2345 !findchar(s
, size
, '"')) ? '"' : '\'';
2347 while (size
-- > 0) {
2348 Py_UNICODE ch
= *s
++;
2350 /* Escape quotes and backslashes */
2352 ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1]) || ch
== '\\') {
2358 #ifdef Py_UNICODE_WIDE
2359 /* Map 21-bit characters to '\U00xxxxxx' */
2360 else if (ch
>= 0x10000) {
2363 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
2364 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
2365 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
2366 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
2367 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
2368 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
2369 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
2370 *p
++ = hexdigit
[ch
& 0x0000000F];
2374 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2375 else if (ch
>= 0xD800 && ch
< 0xDC00) {
2381 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
2382 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
2385 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
2386 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
2387 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
2388 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
2389 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
2390 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
2391 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
2392 *p
++ = hexdigit
[ucs
& 0x0000000F];
2395 /* Fall through: isolated surrogates are copied as-is */
2401 /* Map 16-bit characters to '\uxxxx' */
2405 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
2406 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
2407 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
2408 *p
++ = hexdigit
[ch
& 0x000F];
2411 /* Map special whitespace to '\t', '\n', '\r' */
2412 else if (ch
== '\t') {
2416 else if (ch
== '\n') {
2420 else if (ch
== '\r') {
2425 /* Map non-printable US ASCII to '\xhh' */
2426 else if (ch
< ' ' || ch
>= 0x7F) {
2429 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
2430 *p
++ = hexdigit
[ch
& 0x000F];
2433 /* Copy everything else as-is */
2438 *p
++ = PyString_AS_STRING(repr
)[1];
2441 _PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
));
2445 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
2448 return unicodeescape_string(s
, size
, 0);
2451 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
2453 if (!PyUnicode_Check(unicode
)) {
2454 PyErr_BadArgument();
2457 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
2458 PyUnicode_GET_SIZE(unicode
));
2461 /* --- Raw Unicode Escape Codec ------------------------------------------- */
2463 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
2467 const char *starts
= s
;
2468 Py_ssize_t startinpos
;
2469 Py_ssize_t endinpos
;
2475 PyObject
*errorHandler
= NULL
;
2476 PyObject
*exc
= NULL
;
2478 /* Escaped strings will always be longer than the resulting
2479 Unicode string, so we start with size here and then reduce the
2480 length after conversion to the true value. (But decoding error
2481 handler might have to resize the string) */
2482 v
= _PyUnicode_New(size
);
2486 return (PyObject
*)v
;
2487 p
= PyUnicode_AS_UNICODE(v
);
2495 /* Non-escape characters are interpreted as Unicode ordinals */
2497 *p
++ = (unsigned char)*s
++;
2500 startinpos
= s
-starts
;
2502 /* \u-escapes are interpreted only if the number of leading
2503 backslashes is odd */
2508 *p
++ = (unsigned char)*s
++;
2510 if (((s
- bs
) & 1) == 0 ||
2512 (*s
!= 'u' && *s
!= 'U')) {
2516 count
= *s
=='u' ? 4 : 8;
2519 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2520 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2521 for (x
= 0, i
= 0; i
< count
; ++i
, ++s
) {
2522 c
= (unsigned char)*s
;
2524 endinpos
= s
-starts
;
2525 if (unicode_decode_call_errorhandler(
2526 errors
, &errorHandler
,
2527 "rawunicodeescape", "truncated \\uXXXX",
2528 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2529 (PyObject
**)&v
, &outpos
, &p
))
2534 if (c
>= '0' && c
<= '9')
2536 else if (c
>= 'a' && c
<= 'f')
2541 #ifndef Py_UNICODE_WIDE
2543 if (unicode_decode_call_errorhandler(
2544 errors
, &errorHandler
,
2545 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2546 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2547 (PyObject
**)&v
, &outpos
, &p
))
2555 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2557 Py_XDECREF(errorHandler
);
2559 return (PyObject
*)v
;
2563 Py_XDECREF(errorHandler
);
2568 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
2575 static const char *hexdigit
= "0123456789abcdef";
2577 #ifdef Py_UNICODE_WIDE
2578 repr
= PyString_FromStringAndSize(NULL
, 10 * size
);
2580 repr
= PyString_FromStringAndSize(NULL
, 6 * size
);
2587 p
= q
= PyString_AS_STRING(repr
);
2588 while (size
-- > 0) {
2589 Py_UNICODE ch
= *s
++;
2590 #ifdef Py_UNICODE_WIDE
2591 /* Map 32-bit characters to '\Uxxxxxxxx' */
2592 if (ch
>= 0x10000) {
2595 *p
++ = hexdigit
[(ch
>> 28) & 0xf];
2596 *p
++ = hexdigit
[(ch
>> 24) & 0xf];
2597 *p
++ = hexdigit
[(ch
>> 20) & 0xf];
2598 *p
++ = hexdigit
[(ch
>> 16) & 0xf];
2599 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
2600 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
2601 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
2602 *p
++ = hexdigit
[ch
& 15];
2606 /* Map 16-bit characters to '\uxxxx' */
2610 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
2611 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
2612 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
2613 *p
++ = hexdigit
[ch
& 15];
2615 /* Copy everything else as-is */
2620 _PyString_Resize(&repr
, p
- q
);
2624 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
2626 if (!PyUnicode_Check(unicode
)) {
2627 PyErr_BadArgument();
2630 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
2631 PyUnicode_GET_SIZE(unicode
));
2634 /* --- Unicode Internal Codec ------------------------------------------- */
2636 PyObject
*_PyUnicode_DecodeUnicodeInternal(const char *s
,
2640 const char *starts
= s
;
2641 Py_ssize_t startinpos
;
2642 Py_ssize_t endinpos
;
2648 PyObject
*errorHandler
= NULL
;
2649 PyObject
*exc
= NULL
;
2651 #ifdef Py_UNICODE_WIDE
2652 Py_UNICODE unimax
= PyUnicode_GetMax();
2655 /* XXX overflow detection missing */
2656 v
= _PyUnicode_New((size
+Py_UNICODE_SIZE
-1)/ Py_UNICODE_SIZE
);
2659 if (PyUnicode_GetSize((PyObject
*)v
) == 0)
2660 return (PyObject
*)v
;
2661 p
= PyUnicode_AS_UNICODE(v
);
2665 memcpy(p
, s
, sizeof(Py_UNICODE
));
2666 /* We have to sanity check the raw data, otherwise doom looms for
2667 some malformed UCS-4 data. */
2669 #ifdef Py_UNICODE_WIDE
2670 *p
> unimax
|| *p
< 0 ||
2672 end
-s
< Py_UNICODE_SIZE
2675 startinpos
= s
- starts
;
2676 if (end
-s
< Py_UNICODE_SIZE
) {
2677 endinpos
= end
-starts
;
2678 reason
= "truncated input";
2681 endinpos
= s
- starts
+ Py_UNICODE_SIZE
;
2682 reason
= "illegal code point (> 0x10FFFF)";
2684 outpos
= p
- PyUnicode_AS_UNICODE(v
);
2685 if (unicode_decode_call_errorhandler(
2686 errors
, &errorHandler
,
2687 "unicode_internal", reason
,
2688 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2689 (PyObject
**)&v
, &outpos
, &p
)) {
2695 s
+= Py_UNICODE_SIZE
;
2699 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2701 Py_XDECREF(errorHandler
);
2703 return (PyObject
*)v
;
2707 Py_XDECREF(errorHandler
);
2712 /* --- Latin-1 Codec ------------------------------------------------------ */
2714 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
2721 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2723 Py_UNICODE r
= *(unsigned char*)s
;
2724 return PyUnicode_FromUnicode(&r
, 1);
2727 v
= _PyUnicode_New(size
);
2731 return (PyObject
*)v
;
2732 p
= PyUnicode_AS_UNICODE(v
);
2734 *p
++ = (unsigned char)*s
++;
2735 return (PyObject
*)v
;
2742 /* create or adjust a UnicodeEncodeError */
2743 static void make_encode_exception(PyObject
**exceptionObject
,
2744 const char *encoding
,
2745 const Py_UNICODE
*unicode
, Py_ssize_t size
,
2746 Py_ssize_t startpos
, Py_ssize_t endpos
,
2749 if (*exceptionObject
== NULL
) {
2750 *exceptionObject
= PyUnicodeEncodeError_Create(
2751 encoding
, unicode
, size
, startpos
, endpos
, reason
);
2754 if (PyUnicodeEncodeError_SetStart(*exceptionObject
, startpos
))
2756 if (PyUnicodeEncodeError_SetEnd(*exceptionObject
, endpos
))
2758 if (PyUnicodeEncodeError_SetReason(*exceptionObject
, reason
))
2762 Py_DECREF(*exceptionObject
);
2763 *exceptionObject
= NULL
;
2767 /* raises a UnicodeEncodeError */
2768 static void raise_encode_exception(PyObject
**exceptionObject
,
2769 const char *encoding
,
2770 const Py_UNICODE
*unicode
, Py_ssize_t size
,
2771 Py_ssize_t startpos
, Py_ssize_t endpos
,
2774 make_encode_exception(exceptionObject
,
2775 encoding
, unicode
, size
, startpos
, endpos
, reason
);
2776 if (*exceptionObject
!= NULL
)
2777 PyCodec_StrictErrors(*exceptionObject
);
2780 /* error handling callback helper:
2781 build arguments, call the callback and check the arguments,
2782 put the result into newpos and return the replacement string, which
2783 has to be freed by the caller */
2784 static PyObject
*unicode_encode_call_errorhandler(const char *errors
,
2785 PyObject
**errorHandler
,
2786 const char *encoding
, const char *reason
,
2787 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
2788 Py_ssize_t startpos
, Py_ssize_t endpos
,
2791 static char *argparse
= "O!n;encoding error handler must return (unicode, int) tuple";
2794 PyObject
*resunicode
;
2796 if (*errorHandler
== NULL
) {
2797 *errorHandler
= PyCodec_LookupError(errors
);
2798 if (*errorHandler
== NULL
)
2802 make_encode_exception(exceptionObject
,
2803 encoding
, unicode
, size
, startpos
, endpos
, reason
);
2804 if (*exceptionObject
== NULL
)
2807 restuple
= PyObject_CallFunctionObjArgs(
2808 *errorHandler
, *exceptionObject
, NULL
);
2809 if (restuple
== NULL
)
2811 if (!PyTuple_Check(restuple
)) {
2812 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
2813 Py_DECREF(restuple
);
2816 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
2817 &resunicode
, newpos
)) {
2818 Py_DECREF(restuple
);
2822 *newpos
= size
+*newpos
;
2823 if (*newpos
<0 || *newpos
>size
) {
2824 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
2825 Py_DECREF(restuple
);
2828 Py_INCREF(resunicode
);
2829 Py_DECREF(restuple
);
2833 static PyObject
*unicode_encode_ucs1(const Py_UNICODE
*p
,
2840 /* pointers to the beginning and end+1 of input */
2841 const Py_UNICODE
*startp
= p
;
2842 const Py_UNICODE
*endp
= p
+ size
;
2843 /* pointer to the beginning of the unencodable characters */
2844 /* const Py_UNICODE *badp = NULL; */
2845 /* pointer into the output */
2847 /* current output position */
2848 Py_ssize_t respos
= 0;
2850 const char *encoding
= (limit
== 256) ? "latin-1" : "ascii";
2851 const char *reason
= (limit
== 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2852 PyObject
*errorHandler
= NULL
;
2853 PyObject
*exc
= NULL
;
2854 /* the following variable is used for caching string comparisons
2855 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2856 int known_errorHandler
= -1;
2858 /* allocate enough for a simple encoding without
2859 replacements, if we need more, we'll resize */
2860 res
= PyString_FromStringAndSize(NULL
, size
);
2865 str
= PyString_AS_STRING(res
);
2871 /* can we encode this? */
2873 /* no overflow check, because we know that the space is enough */
2878 Py_ssize_t unicodepos
= p
-startp
;
2879 Py_ssize_t requiredsize
;
2880 PyObject
*repunicode
;
2885 /* startpos for collecting unencodable chars */
2886 const Py_UNICODE
*collstart
= p
;
2887 const Py_UNICODE
*collend
= p
;
2888 /* find all unencodable characters */
2889 while ((collend
< endp
) && ((*collend
)>=limit
))
2891 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2892 if (known_errorHandler
==-1) {
2893 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
2894 known_errorHandler
= 1;
2895 else if (!strcmp(errors
, "replace"))
2896 known_errorHandler
= 2;
2897 else if (!strcmp(errors
, "ignore"))
2898 known_errorHandler
= 3;
2899 else if (!strcmp(errors
, "xmlcharrefreplace"))
2900 known_errorHandler
= 4;
2902 known_errorHandler
= 0;
2904 switch (known_errorHandler
) {
2905 case 1: /* strict */
2906 raise_encode_exception(&exc
, encoding
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
2908 case 2: /* replace */
2909 while (collstart
++<collend
)
2910 *str
++ = '?'; /* fall through */
2911 case 3: /* ignore */
2914 case 4: /* xmlcharrefreplace */
2915 respos
= str
-PyString_AS_STRING(res
);
2916 /* determine replacement size (temporarily (mis)uses p) */
2917 for (p
= collstart
, repsize
= 0; p
< collend
; ++p
) {
2926 #ifndef Py_UNICODE_WIDE
2932 else if (*p
<1000000)
2938 requiredsize
= respos
+repsize
+(endp
-collend
);
2939 if (requiredsize
> ressize
) {
2940 if (requiredsize
<2*ressize
)
2941 requiredsize
= 2*ressize
;
2942 if (_PyString_Resize(&res
, requiredsize
))
2944 str
= PyString_AS_STRING(res
) + respos
;
2945 ressize
= requiredsize
;
2947 /* generate replacement (temporarily (mis)uses p) */
2948 for (p
= collstart
; p
< collend
; ++p
) {
2949 str
+= sprintf(str
, "&#%d;", (int)*p
);
2954 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
2955 encoding
, reason
, startp
, size
, &exc
,
2956 collstart
-startp
, collend
-startp
, &newpos
);
2957 if (repunicode
== NULL
)
2959 /* need more space? (at least enough for what we
2960 have+the replacement+the rest of the string, so
2961 we won't have to check space for encodable characters) */
2962 respos
= str
-PyString_AS_STRING(res
);
2963 repsize
= PyUnicode_GET_SIZE(repunicode
);
2964 requiredsize
= respos
+repsize
+(endp
-collend
);
2965 if (requiredsize
> ressize
) {
2966 if (requiredsize
<2*ressize
)
2967 requiredsize
= 2*ressize
;
2968 if (_PyString_Resize(&res
, requiredsize
)) {
2969 Py_DECREF(repunicode
);
2972 str
= PyString_AS_STRING(res
) + respos
;
2973 ressize
= requiredsize
;
2975 /* check if there is anything unencodable in the replacement
2976 and copy it to the output */
2977 for (uni2
= PyUnicode_AS_UNICODE(repunicode
);repsize
-->0; ++uni2
, ++str
) {
2980 raise_encode_exception(&exc
, encoding
, startp
, size
,
2981 unicodepos
, unicodepos
+1, reason
);
2982 Py_DECREF(repunicode
);
2987 p
= startp
+ newpos
;
2988 Py_DECREF(repunicode
);
2992 /* Resize if we allocated too much */
2993 respos
= str
-PyString_AS_STRING(res
);
2995 /* If this fails, res will be NULL */
2996 _PyString_Resize(&res
, respos
);
2997 Py_XDECREF(errorHandler
);
3003 Py_XDECREF(errorHandler
);
3008 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
3012 return unicode_encode_ucs1(p
, size
, errors
, 256);
3015 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
3017 if (!PyUnicode_Check(unicode
)) {
3018 PyErr_BadArgument();
3021 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
3022 PyUnicode_GET_SIZE(unicode
),
3026 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3028 PyObject
*PyUnicode_DecodeASCII(const char *s
,
3032 const char *starts
= s
;
3035 Py_ssize_t startinpos
;
3036 Py_ssize_t endinpos
;
3039 PyObject
*errorHandler
= NULL
;
3040 PyObject
*exc
= NULL
;
3042 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3043 if (size
== 1 && *(unsigned char*)s
< 128) {
3044 Py_UNICODE r
= *(unsigned char*)s
;
3045 return PyUnicode_FromUnicode(&r
, 1);
3048 v
= _PyUnicode_New(size
);
3052 return (PyObject
*)v
;
3053 p
= PyUnicode_AS_UNICODE(v
);
3056 register unsigned char c
= (unsigned char)*s
;
3062 startinpos
= s
-starts
;
3063 endinpos
= startinpos
+ 1;
3064 outpos
= p
- (Py_UNICODE
*)PyUnicode_AS_UNICODE(v
);
3065 if (unicode_decode_call_errorhandler(
3066 errors
, &errorHandler
,
3067 "ascii", "ordinal not in range(128)",
3068 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3069 (PyObject
**)&v
, &outpos
, &p
))
3073 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
3074 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3076 Py_XDECREF(errorHandler
);
3078 return (PyObject
*)v
;
3082 Py_XDECREF(errorHandler
);
3087 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
3091 return unicode_encode_ucs1(p
, size
, errors
, 128);
3094 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
3096 if (!PyUnicode_Check(unicode
)) {
3097 PyErr_BadArgument();
3100 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
3101 PyUnicode_GET_SIZE(unicode
),
3105 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3107 /* --- MBCS codecs for Windows -------------------------------------------- */
3109 #if SIZEOF_INT < SIZEOF_SSIZE_T
3113 /* XXX This code is limited to "true" double-byte encodings, as
3114 a) it assumes an incomplete character consists of a single byte, and
3115 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3116 encodings, see IsDBCSLeadByteEx documentation. */
3118 static int is_dbcs_lead_byte(const char *s
, int offset
)
3120 const char *curr
= s
+ offset
;
3122 if (IsDBCSLeadByte(*curr
)) {
3123 const char *prev
= CharPrev(s
, curr
);
3124 return (prev
== curr
) || !IsDBCSLeadByte(*prev
) || (curr
- prev
== 2);
3130 * Decode an MBCS string into a unicode object. If 'final' is set, converts
3131 * a trailing lead-byte too. Returns the consumed size on success, -1 otherwise.
3133 static int decode_mbcs(PyUnicodeObject
**v
,
3134 const char *s
, /* MBCS string */
3135 int size
, /* sizeof MBCS string */
3144 /* Skip trailing lead-byte unless 'final' is set */
3145 if (!final
&& size
>= 1 && is_dbcs_lead_byte(s
, size
- 1))
3148 /* First get the size of the result */
3150 usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
3152 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3158 /* Create unicode object */
3159 *v
= _PyUnicode_New(usize
);
3164 /* Extend unicode object */
3165 n
= PyUnicode_GET_SIZE(*v
);
3166 if (_PyUnicode_Resize(v
, n
+ usize
) < 0)
3170 /* Do the conversion */
3172 p
= PyUnicode_AS_UNICODE(*v
) + n
;
3173 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
3174 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3182 PyObject
*PyUnicode_DecodeMBCSStateful(const char *s
,
3185 Py_ssize_t
*consumed
)
3187 PyUnicodeObject
*v
= NULL
;
3196 done
= decode_mbcs(&v
, s
, INT_MAX
, 0);
3199 done
= decode_mbcs(&v
, s
, (int)size
, !consumed
);
3210 if (size
> INT_MAX
) {
3217 return (PyObject
*)v
;
3220 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
3224 return PyUnicode_DecodeMBCSStateful(s
, size
, errors
, NULL
);
3228 * Convert unicode into string object (MBCS).
3229 * Returns 0 on success, -1 otherwise.
3231 static int encode_mbcs(PyObject
**repr
,
3232 const Py_UNICODE
*p
, /* unicode */
3233 int size
) /* size of unicode */
3240 /* First get the size of the result */
3242 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
3243 if (mbcssize
== 0) {
3244 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3249 if (*repr
== NULL
) {
3250 /* Create string object */
3251 *repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
3256 /* Extend string object */
3257 n
= PyString_Size(*repr
);
3258 if (_PyString_Resize(repr
, n
+ mbcssize
) < 0)
3262 /* Do the conversion */
3264 char *s
= PyString_AS_STRING(*repr
) + n
;
3265 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
3266 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3274 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
3278 PyObject
*repr
= NULL
;
3284 ret
= encode_mbcs(&repr
, p
, INT_MAX
);
3287 ret
= encode_mbcs(&repr
, p
, (int)size
);
3295 if (size
> INT_MAX
) {
3305 PyObject
*PyUnicode_AsMBCSString(PyObject
*unicode
)
3307 if (!PyUnicode_Check(unicode
)) {
3308 PyErr_BadArgument();
3311 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode
),
3312 PyUnicode_GET_SIZE(unicode
),
3318 #endif /* MS_WINDOWS */
3320 /* --- Character Mapping Codec -------------------------------------------- */
3322 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
3327 const char *starts
= s
;
3328 Py_ssize_t startinpos
;
3329 Py_ssize_t endinpos
;
3334 Py_ssize_t extrachars
= 0;
3335 PyObject
*errorHandler
= NULL
;
3336 PyObject
*exc
= NULL
;
3337 Py_UNICODE
*mapstring
= NULL
;
3338 Py_ssize_t maplen
= 0;
3340 /* Default to Latin-1 */
3341 if (mapping
== NULL
)
3342 return PyUnicode_DecodeLatin1(s
, size
, errors
);
3344 v
= _PyUnicode_New(size
);
3348 return (PyObject
*)v
;
3349 p
= PyUnicode_AS_UNICODE(v
);
3351 if (PyUnicode_CheckExact(mapping
)) {
3352 mapstring
= PyUnicode_AS_UNICODE(mapping
);
3353 maplen
= PyUnicode_GET_SIZE(mapping
);
3355 unsigned char ch
= *s
;
3356 Py_UNICODE x
= 0xfffe; /* illegal value */
3362 /* undefined mapping */
3363 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3364 startinpos
= s
-starts
;
3365 endinpos
= startinpos
+1;
3366 if (unicode_decode_call_errorhandler(
3367 errors
, &errorHandler
,
3368 "charmap", "character maps to <undefined>",
3369 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3370 (PyObject
**)&v
, &outpos
, &p
)) {
3381 unsigned char ch
= *s
;
3384 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3385 w
= PyInt_FromLong((long)ch
);
3388 x
= PyObject_GetItem(mapping
, w
);
3391 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
3392 /* No mapping found means: mapping is undefined. */
3401 if (PyInt_Check(x
)) {
3402 long value
= PyInt_AS_LONG(x
);
3403 if (value
< 0 || value
> 65535) {
3404 PyErr_SetString(PyExc_TypeError
,
3405 "character mapping must be in range(65536)");
3409 *p
++ = (Py_UNICODE
)value
;
3411 else if (x
== Py_None
) {
3412 /* undefined mapping */
3413 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3414 startinpos
= s
-starts
;
3415 endinpos
= startinpos
+1;
3416 if (unicode_decode_call_errorhandler(
3417 errors
, &errorHandler
,
3418 "charmap", "character maps to <undefined>",
3419 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3420 (PyObject
**)&v
, &outpos
, &p
)) {
3427 else if (PyUnicode_Check(x
)) {
3428 Py_ssize_t targetsize
= PyUnicode_GET_SIZE(x
);
3430 if (targetsize
== 1)
3432 *p
++ = *PyUnicode_AS_UNICODE(x
);
3434 else if (targetsize
> 1) {
3436 if (targetsize
> extrachars
) {
3438 Py_ssize_t oldpos
= p
- PyUnicode_AS_UNICODE(v
);
3439 Py_ssize_t needed
= (targetsize
- extrachars
) + \
3441 extrachars
+= needed
;
3442 /* XXX overflow detection missing */
3443 if (_PyUnicode_Resize(&v
,
3444 PyUnicode_GET_SIZE(v
) + needed
) < 0) {
3448 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
3451 PyUnicode_AS_UNICODE(x
),
3454 extrachars
-= targetsize
;
3456 /* 1-0 mapping: skip the character */
3459 /* wrong return value */
3460 PyErr_SetString(PyExc_TypeError
,
3461 "character mapping must return integer, None or unicode");
3469 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
3470 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3472 Py_XDECREF(errorHandler
);
3474 return (PyObject
*)v
;
3477 Py_XDECREF(errorHandler
);
3483 /* Charmap encoding: the lookup table */
3485 struct encoding_map
{
3487 unsigned char level1
[32];
3489 unsigned char level23
[1];
3493 encoding_map_size(PyObject
*obj
, PyObject
* args
)
3495 struct encoding_map
*map
= (struct encoding_map
*)obj
;
3496 return PyInt_FromLong(sizeof(*map
) - 1 + 16*map
->count2
+
3500 static PyMethodDef encoding_map_methods
[] = {
3501 {"size", encoding_map_size
, METH_NOARGS
,
3502 PyDoc_STR("Return the size (in bytes) of this object") },
3507 encoding_map_dealloc(PyObject
* o
)
3512 static PyTypeObject EncodingMapType
= {
3513 PyVarObject_HEAD_INIT(NULL
, 0)
3514 "EncodingMap", /*tp_name*/
3515 sizeof(struct encoding_map
), /*tp_basicsize*/
3518 encoding_map_dealloc
, /*tp_dealloc*/
3525 0, /*tp_as_sequence*/
3526 0, /*tp_as_mapping*/
3533 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
3537 0, /*tp_richcompare*/
3538 0, /*tp_weaklistoffset*/
3541 encoding_map_methods
, /*tp_methods*/
3548 0, /*tp_dictoffset*/
3557 PyUnicode_BuildEncodingMap(PyObject
* string
)
3561 struct encoding_map
*mresult
;
3564 unsigned char level1
[32];
3565 unsigned char level2
[512];
3566 unsigned char *mlevel1
, *mlevel2
, *mlevel3
;
3567 int count2
= 0, count3
= 0;
3569 if (!PyUnicode_Check(string
) || PyUnicode_GetSize(string
) != 256) {
3570 PyErr_BadArgument();
3573 decode
= PyUnicode_AS_UNICODE(string
);
3574 memset(level1
, 0xFF, sizeof level1
);
3575 memset(level2
, 0xFF, sizeof level2
);
3577 /* If there isn't a one-to-one mapping of NULL to \0,
3578 or if there are non-BMP characters, we need to use
3579 a mapping dictionary. */
3582 for (i
= 1; i
< 256; i
++) {
3585 #ifdef Py_UNICODE_WIDE
3586 || decode
[i
] > 0xFFFF
3592 if (decode
[i
] == 0xFFFE)
3593 /* unmapped character */
3595 l1
= decode
[i
] >> 11;
3596 l2
= decode
[i
] >> 7;
3597 if (level1
[l1
] == 0xFF)
3598 level1
[l1
] = count2
++;
3599 if (level2
[l2
] == 0xFF)
3600 level2
[l2
] = count3
++;
3603 if (count2
>= 0xFF || count3
>= 0xFF)
3607 PyObject
*result
= PyDict_New();
3608 PyObject
*key
, *value
;
3611 for (i
= 0; i
< 256; i
++) {
3613 key
= PyInt_FromLong(decode
[i
]);
3614 value
= PyInt_FromLong(i
);
3617 if (PyDict_SetItem(result
, key
, value
) == -1)
3630 /* Create a three-level trie */
3631 result
= PyObject_MALLOC(sizeof(struct encoding_map
) +
3632 16*count2
+ 128*count3
- 1);
3634 return PyErr_NoMemory();
3635 PyObject_Init(result
, &EncodingMapType
);
3636 mresult
= (struct encoding_map
*)result
;
3637 mresult
->count2
= count2
;
3638 mresult
->count3
= count3
;
3639 mlevel1
= mresult
->level1
;
3640 mlevel2
= mresult
->level23
;
3641 mlevel3
= mresult
->level23
+ 16*count2
;
3642 memcpy(mlevel1
, level1
, 32);
3643 memset(mlevel2
, 0xFF, 16*count2
);
3644 memset(mlevel3
, 0, 128*count3
);
3646 for (i
= 1; i
< 256; i
++) {
3647 int o1
, o2
, o3
, i2
, i3
;
3648 if (decode
[i
] == 0xFFFE)
3649 /* unmapped character */
3652 o2
= (decode
[i
]>>7) & 0xF;
3653 i2
= 16*mlevel1
[o1
] + o2
;
3654 if (mlevel2
[i2
] == 0xFF)
3655 mlevel2
[i2
] = count3
++;
3656 o3
= decode
[i
] & 0x7F;
3657 i3
= 128*mlevel2
[i2
] + o3
;
3664 encoding_map_lookup(Py_UNICODE c
, PyObject
*mapping
)
3666 struct encoding_map
*map
= (struct encoding_map
*)mapping
;
3668 int l2
= (c
>>7) & 0xF;
3672 #ifdef Py_UNICODE_WIDE
3680 i
= map
->level1
[l1
];
3685 i
= map
->level23
[16*i
+l2
];
3690 i
= map
->level23
[16*map
->count2
+ 128*i
+ l3
];
3697 /* Lookup the character ch in the mapping. If the character
3698 can't be found, Py_None is returned (or NULL, if another
3700 static PyObject
*charmapencode_lookup(Py_UNICODE c
, PyObject
*mapping
)
3702 PyObject
*w
= PyInt_FromLong((long)c
);
3707 x
= PyObject_GetItem(mapping
, w
);
3710 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
3711 /* No mapping found means: mapping is undefined. */
3719 else if (x
== Py_None
)
3721 else if (PyInt_Check(x
)) {
3722 long value
= PyInt_AS_LONG(x
);
3723 if (value
< 0 || value
> 255) {
3724 PyErr_SetString(PyExc_TypeError
,
3725 "character mapping must be in range(256)");
3731 else if (PyString_Check(x
))
3734 /* wrong return value */
3735 PyErr_SetString(PyExc_TypeError
,
3736 "character mapping must return integer, None or str");
3743 charmapencode_resize(PyObject
**outobj
, Py_ssize_t
*outpos
, Py_ssize_t requiredsize
)
3745 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
3746 /* exponentially overallocate to minimize reallocations */
3747 if (requiredsize
< 2*outsize
)
3748 requiredsize
= 2*outsize
;
3749 if (_PyString_Resize(outobj
, requiredsize
)) {
3755 typedef enum charmapencode_result
{
3756 enc_SUCCESS
, enc_FAILED
, enc_EXCEPTION
3757 }charmapencode_result
;
3758 /* lookup the character, put the result in the output string and adjust
3759 various state variables. Reallocate the output string if not enough
3760 space is available. Return enc_SUCCESS if the character was written,
3761 enc_FAILED if the mapping was undefined (in which case no character
3762 was written) or enc_EXCEPTION if an error occurred (e.g. the
3763 reallocation failed). */
3765 charmapencode_result
charmapencode_output(Py_UNICODE c
, PyObject
*mapping
,
3766 PyObject
**outobj
, Py_ssize_t
*outpos
)
3770 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
3772 if (Py_Type(mapping
) == &EncodingMapType
) {
3773 int res
= encoding_map_lookup(c
, mapping
);
3774 Py_ssize_t requiredsize
= *outpos
+1;
3777 if (outsize
<requiredsize
)
3778 if (!charmapencode_resize(outobj
, outpos
, requiredsize
))
3779 return enc_EXCEPTION
;
3780 outstart
= PyString_AS_STRING(*outobj
);
3781 outstart
[(*outpos
)++] = (char)res
;
3785 rep
= charmapencode_lookup(c
, mapping
);
3787 return enc_EXCEPTION
;
3788 else if (rep
==Py_None
) {
3792 if (PyInt_Check(rep
)) {
3793 Py_ssize_t requiredsize
= *outpos
+1;
3794 if (outsize
<requiredsize
)
3795 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
3797 return enc_EXCEPTION
;
3799 outstart
= PyString_AS_STRING(*outobj
);
3800 outstart
[(*outpos
)++] = (char)PyInt_AS_LONG(rep
);
3803 const char *repchars
= PyString_AS_STRING(rep
);
3804 Py_ssize_t repsize
= PyString_GET_SIZE(rep
);
3805 Py_ssize_t requiredsize
= *outpos
+repsize
;
3806 if (outsize
<requiredsize
)
3807 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
3809 return enc_EXCEPTION
;
3811 outstart
= PyString_AS_STRING(*outobj
);
3812 memcpy(outstart
+ *outpos
, repchars
, repsize
);
3820 /* handle an error in PyUnicode_EncodeCharmap
3821 Return 0 on success, -1 on error */
3823 int charmap_encoding_error(
3824 const Py_UNICODE
*p
, Py_ssize_t size
, Py_ssize_t
*inpos
, PyObject
*mapping
,
3825 PyObject
**exceptionObject
,
3826 int *known_errorHandler
, PyObject
**errorHandler
, const char *errors
,
3827 PyObject
**res
, Py_ssize_t
*respos
)
3829 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
3833 /* startpos for collecting unencodable chars */
3834 Py_ssize_t collstartpos
= *inpos
;
3835 Py_ssize_t collendpos
= *inpos
+1;
3837 char *encoding
= "charmap";
3838 char *reason
= "character maps to <undefined>";
3839 charmapencode_result x
;
3841 /* find all unencodable characters */
3842 while (collendpos
< size
) {
3844 if (Py_Type(mapping
) == &EncodingMapType
) {
3845 int res
= encoding_map_lookup(p
[collendpos
], mapping
);
3852 rep
= charmapencode_lookup(p
[collendpos
], mapping
);
3855 else if (rep
!=Py_None
) {
3862 /* cache callback name lookup
3863 * (if not done yet, i.e. it's the first error) */
3864 if (*known_errorHandler
==-1) {
3865 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3866 *known_errorHandler
= 1;
3867 else if (!strcmp(errors
, "replace"))
3868 *known_errorHandler
= 2;
3869 else if (!strcmp(errors
, "ignore"))
3870 *known_errorHandler
= 3;
3871 else if (!strcmp(errors
, "xmlcharrefreplace"))
3872 *known_errorHandler
= 4;
3874 *known_errorHandler
= 0;
3876 switch (*known_errorHandler
) {
3877 case 1: /* strict */
3878 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
3880 case 2: /* replace */
3881 for (collpos
= collstartpos
; collpos
<collendpos
; ++collpos
) {
3882 x
= charmapencode_output('?', mapping
, res
, respos
);
3883 if (x
==enc_EXCEPTION
) {
3886 else if (x
==enc_FAILED
) {
3887 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
3892 case 3: /* ignore */
3893 *inpos
= collendpos
;
3895 case 4: /* xmlcharrefreplace */
3896 /* generate replacement (temporarily (mis)uses p) */
3897 for (collpos
= collstartpos
; collpos
< collendpos
; ++collpos
) {
3898 char buffer
[2+29+1+1];
3900 sprintf(buffer
, "&#%d;", (int)p
[collpos
]);
3901 for (cp
= buffer
; *cp
; ++cp
) {
3902 x
= charmapencode_output(*cp
, mapping
, res
, respos
);
3903 if (x
==enc_EXCEPTION
)
3905 else if (x
==enc_FAILED
) {
3906 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
3911 *inpos
= collendpos
;
3914 repunicode
= unicode_encode_call_errorhandler(errors
, errorHandler
,
3915 encoding
, reason
, p
, size
, exceptionObject
,
3916 collstartpos
, collendpos
, &newpos
);
3917 if (repunicode
== NULL
)
3919 /* generate replacement */
3920 repsize
= PyUnicode_GET_SIZE(repunicode
);
3921 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
3922 x
= charmapencode_output(*uni2
, mapping
, res
, respos
);
3923 if (x
==enc_EXCEPTION
) {
3926 else if (x
==enc_FAILED
) {
3927 Py_DECREF(repunicode
);
3928 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
3933 Py_DECREF(repunicode
);
3938 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
3944 PyObject
*res
= NULL
;
3945 /* current input position */
3946 Py_ssize_t inpos
= 0;
3947 /* current output position */
3948 Py_ssize_t respos
= 0;
3949 PyObject
*errorHandler
= NULL
;
3950 PyObject
*exc
= NULL
;
3951 /* the following variable is used for caching string comparisons
3952 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3953 * 3=ignore, 4=xmlcharrefreplace */
3954 int known_errorHandler
= -1;
3956 /* Default to Latin-1 */
3957 if (mapping
== NULL
)
3958 return PyUnicode_EncodeLatin1(p
, size
, errors
);
3960 /* allocate enough for a simple encoding without
3961 replacements, if we need more, we'll resize */
3962 res
= PyString_FromStringAndSize(NULL
, size
);
3968 while (inpos
<size
) {
3969 /* try to encode it */
3970 charmapencode_result x
= charmapencode_output(p
[inpos
], mapping
, &res
, &respos
);
3971 if (x
==enc_EXCEPTION
) /* error */
3973 if (x
==enc_FAILED
) { /* unencodable character */
3974 if (charmap_encoding_error(p
, size
, &inpos
, mapping
,
3976 &known_errorHandler
, &errorHandler
, errors
,
3982 /* done with this character => adjust input position */
3986 /* Resize if we allocated to much */
3987 if (respos
<PyString_GET_SIZE(res
)) {
3988 if (_PyString_Resize(&res
, respos
))
3992 Py_XDECREF(errorHandler
);
3998 Py_XDECREF(errorHandler
);
4002 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
4005 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
4006 PyErr_BadArgument();
4009 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
4010 PyUnicode_GET_SIZE(unicode
),
4015 /* create or adjust a UnicodeTranslateError */
4016 static void make_translate_exception(PyObject
**exceptionObject
,
4017 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4018 Py_ssize_t startpos
, Py_ssize_t endpos
,
4021 if (*exceptionObject
== NULL
) {
4022 *exceptionObject
= PyUnicodeTranslateError_Create(
4023 unicode
, size
, startpos
, endpos
, reason
);
4026 if (PyUnicodeTranslateError_SetStart(*exceptionObject
, startpos
))
4028 if (PyUnicodeTranslateError_SetEnd(*exceptionObject
, endpos
))
4030 if (PyUnicodeTranslateError_SetReason(*exceptionObject
, reason
))
4034 Py_DECREF(*exceptionObject
);
4035 *exceptionObject
= NULL
;
4039 /* raises a UnicodeTranslateError */
4040 static void raise_translate_exception(PyObject
**exceptionObject
,
4041 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4042 Py_ssize_t startpos
, Py_ssize_t endpos
,
4045 make_translate_exception(exceptionObject
,
4046 unicode
, size
, startpos
, endpos
, reason
);
4047 if (*exceptionObject
!= NULL
)
4048 PyCodec_StrictErrors(*exceptionObject
);
4051 /* error handling callback helper:
4052 build arguments, call the callback and check the arguments,
4053 put the result into newpos and return the replacement string, which
4054 has to be freed by the caller */
4055 static PyObject
*unicode_translate_call_errorhandler(const char *errors
,
4056 PyObject
**errorHandler
,
4058 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
4059 Py_ssize_t startpos
, Py_ssize_t endpos
,
4062 static char *argparse
= "O!n;translating error handler must return (unicode, int) tuple";
4064 Py_ssize_t i_newpos
;
4066 PyObject
*resunicode
;
4068 if (*errorHandler
== NULL
) {
4069 *errorHandler
= PyCodec_LookupError(errors
);
4070 if (*errorHandler
== NULL
)
4074 make_translate_exception(exceptionObject
,
4075 unicode
, size
, startpos
, endpos
, reason
);
4076 if (*exceptionObject
== NULL
)
4079 restuple
= PyObject_CallFunctionObjArgs(
4080 *errorHandler
, *exceptionObject
, NULL
);
4081 if (restuple
== NULL
)
4083 if (!PyTuple_Check(restuple
)) {
4084 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
4085 Py_DECREF(restuple
);
4088 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
4089 &resunicode
, &i_newpos
)) {
4090 Py_DECREF(restuple
);
4094 *newpos
= size
+i_newpos
;
4097 if (*newpos
<0 || *newpos
>size
) {
4098 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
4099 Py_DECREF(restuple
);
4102 Py_INCREF(resunicode
);
4103 Py_DECREF(restuple
);
4107 /* Lookup the character ch in the mapping and put the result in result,
4108 which must be decrefed by the caller.
4109 Return 0 on success, -1 on error */
4111 int charmaptranslate_lookup(Py_UNICODE c
, PyObject
*mapping
, PyObject
**result
)
4113 PyObject
*w
= PyInt_FromLong((long)c
);
4118 x
= PyObject_GetItem(mapping
, w
);
4121 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4122 /* No mapping found means: use 1:1 mapping. */
4129 else if (x
== Py_None
) {
4133 else if (PyInt_Check(x
)) {
4134 long value
= PyInt_AS_LONG(x
);
4135 long max
= PyUnicode_GetMax();
4136 if (value
< 0 || value
> max
) {
4137 PyErr_Format(PyExc_TypeError
,
4138 "character mapping must be in range(0x%lx)", max
+1);
4145 else if (PyUnicode_Check(x
)) {
4150 /* wrong return value */
4151 PyErr_SetString(PyExc_TypeError
,
4152 "character mapping must return integer, None or unicode");
4157 /* ensure that *outobj is at least requiredsize characters long,
4158 if not reallocate and adjust various state variables.
4159 Return 0 on success, -1 on error */
4161 int charmaptranslate_makespace(PyObject
**outobj
, Py_UNICODE
**outp
,
4162 Py_ssize_t requiredsize
)
4164 Py_ssize_t oldsize
= PyUnicode_GET_SIZE(*outobj
);
4165 if (requiredsize
> oldsize
) {
4166 /* remember old output position */
4167 Py_ssize_t outpos
= *outp
-PyUnicode_AS_UNICODE(*outobj
);
4168 /* exponentially overallocate to minimize reallocations */
4169 if (requiredsize
< 2 * oldsize
)
4170 requiredsize
= 2 * oldsize
;
4171 if (_PyUnicode_Resize(outobj
, requiredsize
) < 0)
4173 *outp
= PyUnicode_AS_UNICODE(*outobj
) + outpos
;
4177 /* lookup the character, put the result in the output string and adjust
4178 various state variables. Return a new reference to the object that
4179 was put in the output buffer in *result, or Py_None, if the mapping was
4180 undefined (in which case no character was written).
4181 The caller must decref result.
4182 Return 0 on success, -1 on error. */
4184 int charmaptranslate_output(const Py_UNICODE
*startinp
, const Py_UNICODE
*curinp
,
4185 Py_ssize_t insize
, PyObject
*mapping
, PyObject
**outobj
, Py_UNICODE
**outp
,
4188 if (charmaptranslate_lookup(*curinp
, mapping
, res
))
4191 /* not found => default to 1:1 mapping */
4192 *(*outp
)++ = *curinp
;
4194 else if (*res
==Py_None
)
4196 else if (PyInt_Check(*res
)) {
4197 /* no overflow check, because we know that the space is enough */
4198 *(*outp
)++ = (Py_UNICODE
)PyInt_AS_LONG(*res
);
4200 else if (PyUnicode_Check(*res
)) {
4201 Py_ssize_t repsize
= PyUnicode_GET_SIZE(*res
);
4203 /* no overflow check, because we know that the space is enough */
4204 *(*outp
)++ = *PyUnicode_AS_UNICODE(*res
);
4206 else if (repsize
!=0) {
4207 /* more than one character */
4208 Py_ssize_t requiredsize
= (*outp
-PyUnicode_AS_UNICODE(*outobj
)) +
4209 (insize
- (curinp
-startinp
)) +
4211 if (charmaptranslate_makespace(outobj
, outp
, requiredsize
))
4213 memcpy(*outp
, PyUnicode_AS_UNICODE(*res
), sizeof(Py_UNICODE
)*repsize
);
4222 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*p
,
4228 PyObject
*res
= NULL
;
4229 /* pointers to the beginning and end+1 of input */
4230 const Py_UNICODE
*startp
= p
;
4231 const Py_UNICODE
*endp
= p
+ size
;
4232 /* pointer into the output */
4234 /* current output position */
4235 Py_ssize_t respos
= 0;
4236 char *reason
= "character maps to <undefined>";
4237 PyObject
*errorHandler
= NULL
;
4238 PyObject
*exc
= NULL
;
4239 /* the following variable is used for caching string comparisons
4240 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4241 * 3=ignore, 4=xmlcharrefreplace */
4242 int known_errorHandler
= -1;
4244 if (mapping
== NULL
) {
4245 PyErr_BadArgument();
4249 /* allocate enough for a simple 1:1 translation without
4250 replacements, if we need more, we'll resize */
4251 res
= PyUnicode_FromUnicode(NULL
, size
);
4256 str
= PyUnicode_AS_UNICODE(res
);
4259 /* try to encode it */
4261 if (charmaptranslate_output(startp
, p
, size
, mapping
, &res
, &str
, &x
)) {
4266 if (x
!=Py_None
) /* it worked => adjust input pointer */
4268 else { /* untranslatable character */
4269 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
4273 /* startpos for collecting untranslatable chars */
4274 const Py_UNICODE
*collstart
= p
;
4275 const Py_UNICODE
*collend
= p
+1;
4276 const Py_UNICODE
*coll
;
4278 /* find all untranslatable characters */
4279 while (collend
< endp
) {
4280 if (charmaptranslate_lookup(*collend
, mapping
, &x
))
4287 /* cache callback name lookup
4288 * (if not done yet, i.e. it's the first error) */
4289 if (known_errorHandler
==-1) {
4290 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4291 known_errorHandler
= 1;
4292 else if (!strcmp(errors
, "replace"))
4293 known_errorHandler
= 2;
4294 else if (!strcmp(errors
, "ignore"))
4295 known_errorHandler
= 3;
4296 else if (!strcmp(errors
, "xmlcharrefreplace"))
4297 known_errorHandler
= 4;
4299 known_errorHandler
= 0;
4301 switch (known_errorHandler
) {
4302 case 1: /* strict */
4303 raise_translate_exception(&exc
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
4305 case 2: /* replace */
4306 /* No need to check for space, this is a 1:1 replacement */
4307 for (coll
= collstart
; coll
<collend
; ++coll
)
4310 case 3: /* ignore */
4313 case 4: /* xmlcharrefreplace */
4314 /* generate replacement (temporarily (mis)uses p) */
4315 for (p
= collstart
; p
< collend
; ++p
) {
4316 char buffer
[2+29+1+1];
4318 sprintf(buffer
, "&#%d;", (int)*p
);
4319 if (charmaptranslate_makespace(&res
, &str
,
4320 (str
-PyUnicode_AS_UNICODE(res
))+strlen(buffer
)+(endp
-collend
)))
4322 for (cp
= buffer
; *cp
; ++cp
)
4328 repunicode
= unicode_translate_call_errorhandler(errors
, &errorHandler
,
4329 reason
, startp
, size
, &exc
,
4330 collstart
-startp
, collend
-startp
, &newpos
);
4331 if (repunicode
== NULL
)
4333 /* generate replacement */
4334 repsize
= PyUnicode_GET_SIZE(repunicode
);
4335 if (charmaptranslate_makespace(&res
, &str
,
4336 (str
-PyUnicode_AS_UNICODE(res
))+repsize
+(endp
-collend
))) {
4337 Py_DECREF(repunicode
);
4340 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
)
4342 p
= startp
+ newpos
;
4343 Py_DECREF(repunicode
);
4347 /* Resize if we allocated to much */
4348 respos
= str
-PyUnicode_AS_UNICODE(res
);
4349 if (respos
<PyUnicode_GET_SIZE(res
)) {
4350 if (_PyUnicode_Resize(&res
, respos
) < 0)
4354 Py_XDECREF(errorHandler
);
4360 Py_XDECREF(errorHandler
);
4364 PyObject
*PyUnicode_Translate(PyObject
*str
,
4370 str
= PyUnicode_FromObject(str
);
4373 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
4374 PyUnicode_GET_SIZE(str
),
4385 /* --- Decimal Encoder ---------------------------------------------------- */
4387 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
4392 Py_UNICODE
*p
, *end
;
4393 PyObject
*errorHandler
= NULL
;
4394 PyObject
*exc
= NULL
;
4395 const char *encoding
= "decimal";
4396 const char *reason
= "invalid decimal Unicode string";
4397 /* the following variable is used for caching string comparisons
4398 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4399 int known_errorHandler
= -1;
4401 if (output
== NULL
) {
4402 PyErr_BadArgument();
4409 register Py_UNICODE ch
= *p
;
4411 PyObject
*repunicode
;
4415 Py_UNICODE
*collstart
;
4416 Py_UNICODE
*collend
;
4418 if (Py_UNICODE_ISSPACE(ch
)) {
4423 decimal
= Py_UNICODE_TODECIMAL(ch
);
4425 *output
++ = '0' + decimal
;
4429 if (0 < ch
&& ch
< 256) {
4430 *output
++ = (char)ch
;
4434 /* All other characters are considered unencodable */
4437 while (collend
< end
) {
4438 if ((0 < *collend
&& *collend
< 256) ||
4439 !Py_UNICODE_ISSPACE(*collend
) ||
4440 Py_UNICODE_TODECIMAL(*collend
))
4443 /* cache callback name lookup
4444 * (if not done yet, i.e. it's the first error) */
4445 if (known_errorHandler
==-1) {
4446 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4447 known_errorHandler
= 1;
4448 else if (!strcmp(errors
, "replace"))
4449 known_errorHandler
= 2;
4450 else if (!strcmp(errors
, "ignore"))
4451 known_errorHandler
= 3;
4452 else if (!strcmp(errors
, "xmlcharrefreplace"))
4453 known_errorHandler
= 4;
4455 known_errorHandler
= 0;
4457 switch (known_errorHandler
) {
4458 case 1: /* strict */
4459 raise_encode_exception(&exc
, encoding
, s
, length
, collstart
-s
, collend
-s
, reason
);
4461 case 2: /* replace */
4462 for (p
= collstart
; p
< collend
; ++p
)
4465 case 3: /* ignore */
4468 case 4: /* xmlcharrefreplace */
4469 /* generate replacement (temporarily (mis)uses p) */
4470 for (p
= collstart
; p
< collend
; ++p
)
4471 output
+= sprintf(output
, "&#%d;", (int)*p
);
4475 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
4476 encoding
, reason
, s
, length
, &exc
,
4477 collstart
-s
, collend
-s
, &newpos
);
4478 if (repunicode
== NULL
)
4480 /* generate replacement */
4481 repsize
= PyUnicode_GET_SIZE(repunicode
);
4482 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
4483 Py_UNICODE ch
= *uni2
;
4484 if (Py_UNICODE_ISSPACE(ch
))
4487 decimal
= Py_UNICODE_TODECIMAL(ch
);
4489 *output
++ = '0' + decimal
;
4490 else if (0 < ch
&& ch
< 256)
4491 *output
++ = (char)ch
;
4493 Py_DECREF(repunicode
);
4494 raise_encode_exception(&exc
, encoding
,
4495 s
, length
, collstart
-s
, collend
-s
, reason
);
4501 Py_DECREF(repunicode
);
4504 /* 0-terminate the output string */
4507 Py_XDECREF(errorHandler
);
4512 Py_XDECREF(errorHandler
);
4516 /* --- Helpers ------------------------------------------------------------ */
4518 #define STRINGLIB_CHAR Py_UNICODE
4520 #define STRINGLIB_LEN PyUnicode_GET_SIZE
4521 #define STRINGLIB_NEW PyUnicode_FromUnicode
4522 #define STRINGLIB_STR PyUnicode_AS_UNICODE
4524 Py_LOCAL_INLINE(int)
4525 STRINGLIB_CMP(const Py_UNICODE
* str
, const Py_UNICODE
* other
, Py_ssize_t len
)
4527 if (str
[0] != other
[0])
4529 return memcmp((void*) str
, (void*) other
, len
* sizeof(Py_UNICODE
));
4532 #define STRINGLIB_EMPTY unicode_empty
4534 #include "stringlib/fastsearch.h"
4536 #include "stringlib/count.h"
4537 #include "stringlib/find.h"
4538 #include "stringlib/partition.h"
4540 /* helper macro to fixup start/end slice values */
4541 #define FIX_START_END(obj) \
4543 start += (obj)->length; \
4546 if (end > (obj)->length) \
4547 end = (obj)->length; \
4549 end += (obj)->length; \
4553 Py_ssize_t
PyUnicode_Count(PyObject
*str
,
4559 PyUnicodeObject
* str_obj
;
4560 PyUnicodeObject
* sub_obj
;
4562 str_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(str
);
4565 sub_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(substr
);
4571 FIX_START_END(str_obj
);
4573 result
= stringlib_count(
4574 str_obj
->str
+ start
, end
- start
, sub_obj
->str
, sub_obj
->length
4583 Py_ssize_t
PyUnicode_Find(PyObject
*str
,
4591 str
= PyUnicode_FromObject(str
);
4594 sub
= PyUnicode_FromObject(sub
);
4601 result
= stringlib_find_slice(
4602 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
4603 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
4607 result
= stringlib_rfind_slice(
4608 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
4609 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
4620 int tailmatch(PyUnicodeObject
*self
,
4621 PyUnicodeObject
*substring
,
4626 if (substring
->length
== 0)
4629 FIX_START_END(self
);
4631 end
-= substring
->length
;
4635 if (direction
> 0) {
4636 if (Py_UNICODE_MATCH(self
, end
, substring
))
4639 if (Py_UNICODE_MATCH(self
, start
, substring
))
4646 Py_ssize_t
PyUnicode_Tailmatch(PyObject
*str
,
4654 str
= PyUnicode_FromObject(str
);
4657 substr
= PyUnicode_FromObject(substr
);
4658 if (substr
== NULL
) {
4663 result
= tailmatch((PyUnicodeObject
*)str
,
4664 (PyUnicodeObject
*)substr
,
4665 start
, end
, direction
);
4671 /* Apply fixfct filter to the Unicode object self and return a
4672 reference to the modified object */
4675 PyObject
*fixup(PyUnicodeObject
*self
,
4676 int (*fixfct
)(PyUnicodeObject
*s
))
4681 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
4685 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
4687 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
4688 /* fixfct should return TRUE if it modified the buffer. If
4689 FALSE, return a reference to the original buffer instead
4690 (to save space, not time) */
4693 return (PyObject
*) self
;
4695 return (PyObject
*) u
;
4699 int fixupper(PyUnicodeObject
*self
)
4701 Py_ssize_t len
= self
->length
;
4702 Py_UNICODE
*s
= self
->str
;
4706 register Py_UNICODE ch
;
4708 ch
= Py_UNICODE_TOUPPER(*s
);
4720 int fixlower(PyUnicodeObject
*self
)
4722 Py_ssize_t len
= self
->length
;
4723 Py_UNICODE
*s
= self
->str
;
4727 register Py_UNICODE ch
;
4729 ch
= Py_UNICODE_TOLOWER(*s
);
4741 int fixswapcase(PyUnicodeObject
*self
)
4743 Py_ssize_t len
= self
->length
;
4744 Py_UNICODE
*s
= self
->str
;
4748 if (Py_UNICODE_ISUPPER(*s
)) {
4749 *s
= Py_UNICODE_TOLOWER(*s
);
4751 } else if (Py_UNICODE_ISLOWER(*s
)) {
4752 *s
= Py_UNICODE_TOUPPER(*s
);
4762 int fixcapitalize(PyUnicodeObject
*self
)
4764 Py_ssize_t len
= self
->length
;
4765 Py_UNICODE
*s
= self
->str
;
4770 if (Py_UNICODE_ISLOWER(*s
)) {
4771 *s
= Py_UNICODE_TOUPPER(*s
);
4776 if (Py_UNICODE_ISUPPER(*s
)) {
4777 *s
= Py_UNICODE_TOLOWER(*s
);
4786 int fixtitle(PyUnicodeObject
*self
)
4788 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4789 register Py_UNICODE
*e
;
4790 int previous_is_cased
;
4792 /* Shortcut for single character strings */
4793 if (PyUnicode_GET_SIZE(self
) == 1) {
4794 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
4803 e
= p
+ PyUnicode_GET_SIZE(self
);
4804 previous_is_cased
= 0;
4805 for (; p
< e
; p
++) {
4806 register const Py_UNICODE ch
= *p
;
4808 if (previous_is_cased
)
4809 *p
= Py_UNICODE_TOLOWER(ch
);
4811 *p
= Py_UNICODE_TOTITLE(ch
);
4813 if (Py_UNICODE_ISLOWER(ch
) ||
4814 Py_UNICODE_ISUPPER(ch
) ||
4815 Py_UNICODE_ISTITLE(ch
))
4816 previous_is_cased
= 1;
4818 previous_is_cased
= 0;
4824 PyUnicode_Join(PyObject
*separator
, PyObject
*seq
)
4826 PyObject
*internal_separator
= NULL
;
4827 const Py_UNICODE blank
= ' ';
4828 const Py_UNICODE
*sep
= &blank
;
4829 Py_ssize_t seplen
= 1;
4830 PyUnicodeObject
*res
= NULL
; /* the result */
4831 Py_ssize_t res_alloc
= 100; /* # allocated bytes for string in res */
4832 Py_ssize_t res_used
; /* # used bytes */
4833 Py_UNICODE
*res_p
; /* pointer to free byte in res's string area */
4834 PyObject
*fseq
; /* PySequence_Fast(seq) */
4835 Py_ssize_t seqlen
; /* len(fseq) -- number of items in sequence */
4839 fseq
= PySequence_Fast(seq
, "");
4844 /* Grrrr. A codec may be invoked to convert str objects to
4845 * Unicode, and so it's possible to call back into Python code
4846 * during PyUnicode_FromObject(), and so it's possible for a sick
4847 * codec to change the size of fseq (if seq is a list). Therefore
4848 * we have to keep refetching the size -- can't assume seqlen
4851 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
4852 /* If empty sequence, return u"". */
4854 res
= _PyUnicode_New(0); /* empty sequence; return u"" */
4857 /* If singleton sequence with an exact Unicode, return that. */
4859 item
= PySequence_Fast_GET_ITEM(fseq
, 0);
4860 if (PyUnicode_CheckExact(item
)) {
4862 res
= (PyUnicodeObject
*)item
;
4867 /* At least two items to join, or one that isn't exact Unicode. */
4869 /* Set up sep and seplen -- they're needed. */
4870 if (separator
== NULL
) {
4875 internal_separator
= PyUnicode_FromObject(separator
);
4876 if (internal_separator
== NULL
)
4878 sep
= PyUnicode_AS_UNICODE(internal_separator
);
4879 seplen
= PyUnicode_GET_SIZE(internal_separator
);
4880 /* In case PyUnicode_FromObject() mutated seq. */
4881 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
4886 res
= _PyUnicode_New(res_alloc
);
4889 res_p
= PyUnicode_AS_UNICODE(res
);
4892 for (i
= 0; i
< seqlen
; ++i
) {
4894 Py_ssize_t new_res_used
;
4896 item
= PySequence_Fast_GET_ITEM(fseq
, i
);
4897 /* Convert item to Unicode. */
4898 if (! PyUnicode_Check(item
) && ! PyString_Check(item
)) {
4899 PyErr_Format(PyExc_TypeError
,
4900 "sequence item %zd: expected string or Unicode,"
4902 i
, Py_Type(item
)->tp_name
);
4905 item
= PyUnicode_FromObject(item
);
4908 /* We own a reference to item from here on. */
4910 /* In case PyUnicode_FromObject() mutated seq. */
4911 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
4913 /* Make sure we have enough space for the separator and the item. */
4914 itemlen
= PyUnicode_GET_SIZE(item
);
4915 new_res_used
= res_used
+ itemlen
;
4916 if (new_res_used
< 0)
4918 if (i
< seqlen
- 1) {
4919 new_res_used
+= seplen
;
4920 if (new_res_used
< 0)
4923 if (new_res_used
> res_alloc
) {
4924 /* double allocated size until it's big enough */
4926 res_alloc
+= res_alloc
;
4929 } while (new_res_used
> res_alloc
);
4930 if (_PyUnicode_Resize(&res
, res_alloc
) < 0) {
4934 res_p
= PyUnicode_AS_UNICODE(res
) + res_used
;
4937 /* Copy item, and maybe the separator. */
4938 Py_UNICODE_COPY(res_p
, PyUnicode_AS_UNICODE(item
), itemlen
);
4940 if (i
< seqlen
- 1) {
4941 Py_UNICODE_COPY(res_p
, sep
, seplen
);
4945 res_used
= new_res_used
;
4948 /* Shrink res to match the used area; this probably can't fail,
4949 * but it's cheap to check.
4951 if (_PyUnicode_Resize(&res
, res_used
) < 0)
4955 Py_XDECREF(internal_separator
);
4957 return (PyObject
*)res
;
4960 PyErr_SetString(PyExc_OverflowError
,
4961 "join() result is too long for a Python string");
4966 Py_XDECREF(internal_separator
);
4973 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
4985 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
4990 u
= _PyUnicode_New(left
+ self
->length
+ right
);
4993 Py_UNICODE_FILL(u
->str
, fill
, left
);
4994 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
4996 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
5002 #define SPLIT_APPEND(data, left, right) \
5003 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
5006 if (PyList_Append(list, str)) { \
5014 PyObject
*split_whitespace(PyUnicodeObject
*self
,
5016 Py_ssize_t maxcount
)
5018 register Py_ssize_t i
;
5019 register Py_ssize_t j
;
5020 Py_ssize_t len
= self
->length
;
5023 for (i
= j
= 0; i
< len
; ) {
5025 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
5028 while (i
< len
&& !Py_UNICODE_ISSPACE(self
->str
[i
]))
5031 if (maxcount
-- <= 0)
5033 SPLIT_APPEND(self
->str
, j
, i
);
5034 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
5040 SPLIT_APPEND(self
->str
, j
, len
);
5049 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
5052 register Py_ssize_t i
;
5053 register Py_ssize_t j
;
5059 string
= PyUnicode_FromObject(string
);
5062 data
= PyUnicode_AS_UNICODE(string
);
5063 len
= PyUnicode_GET_SIZE(string
);
5065 list
= PyList_New(0);
5069 for (i
= j
= 0; i
< len
; ) {
5072 /* Find a line and append it */
5073 while (i
< len
&& !BLOOM_LINEBREAK(data
[i
]))
5076 /* Skip the line break reading CRLF as one line break */
5079 if (data
[i
] == '\r' && i
+ 1 < len
&&
5087 SPLIT_APPEND(data
, j
, eol
);
5091 SPLIT_APPEND(data
, j
, len
);
5104 PyObject
*split_char(PyUnicodeObject
*self
,
5107 Py_ssize_t maxcount
)
5109 register Py_ssize_t i
;
5110 register Py_ssize_t j
;
5111 Py_ssize_t len
= self
->length
;
5114 for (i
= j
= 0; i
< len
; ) {
5115 if (self
->str
[i
] == ch
) {
5116 if (maxcount
-- <= 0)
5118 SPLIT_APPEND(self
->str
, j
, i
);
5124 SPLIT_APPEND(self
->str
, j
, len
);
5134 PyObject
*split_substring(PyUnicodeObject
*self
,
5136 PyUnicodeObject
*substring
,
5137 Py_ssize_t maxcount
)
5139 register Py_ssize_t i
;
5140 register Py_ssize_t j
;
5141 Py_ssize_t len
= self
->length
;
5142 Py_ssize_t sublen
= substring
->length
;
5145 for (i
= j
= 0; i
<= len
- sublen
; ) {
5146 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
5147 if (maxcount
-- <= 0)
5149 SPLIT_APPEND(self
->str
, j
, i
);
5155 SPLIT_APPEND(self
->str
, j
, len
);
5165 PyObject
*rsplit_whitespace(PyUnicodeObject
*self
,
5167 Py_ssize_t maxcount
)
5169 register Py_ssize_t i
;
5170 register Py_ssize_t j
;
5171 Py_ssize_t len
= self
->length
;
5174 for (i
= j
= len
- 1; i
>= 0; ) {
5176 while (i
>= 0 && Py_UNICODE_ISSPACE(self
->str
[i
]))
5179 while (i
>= 0 && !Py_UNICODE_ISSPACE(self
->str
[i
]))
5182 if (maxcount
-- <= 0)
5184 SPLIT_APPEND(self
->str
, i
+ 1, j
+ 1);
5185 while (i
>= 0 && Py_UNICODE_ISSPACE(self
->str
[i
]))
5191 SPLIT_APPEND(self
->str
, 0, j
+ 1);
5193 if (PyList_Reverse(list
) < 0)
5203 PyObject
*rsplit_char(PyUnicodeObject
*self
,
5206 Py_ssize_t maxcount
)
5208 register Py_ssize_t i
;
5209 register Py_ssize_t j
;
5210 Py_ssize_t len
= self
->length
;
5213 for (i
= j
= len
- 1; i
>= 0; ) {
5214 if (self
->str
[i
] == ch
) {
5215 if (maxcount
-- <= 0)
5217 SPLIT_APPEND(self
->str
, i
+ 1, j
+ 1);
5223 SPLIT_APPEND(self
->str
, 0, j
+ 1);
5225 if (PyList_Reverse(list
) < 0)
5235 PyObject
*rsplit_substring(PyUnicodeObject
*self
,
5237 PyUnicodeObject
*substring
,
5238 Py_ssize_t maxcount
)
5240 register Py_ssize_t i
;
5241 register Py_ssize_t j
;
5242 Py_ssize_t len
= self
->length
;
5243 Py_ssize_t sublen
= substring
->length
;
5246 for (i
= len
- sublen
, j
= len
; i
>= 0; ) {
5247 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
5248 if (maxcount
-- <= 0)
5250 SPLIT_APPEND(self
->str
, i
+ sublen
, j
);
5257 SPLIT_APPEND(self
->str
, 0, j
);
5259 if (PyList_Reverse(list
) < 0)
5271 PyObject
*split(PyUnicodeObject
*self
,
5272 PyUnicodeObject
*substring
,
5273 Py_ssize_t maxcount
)
5278 maxcount
= PY_SSIZE_T_MAX
;
5280 list
= PyList_New(0);
5284 if (substring
== NULL
)
5285 return split_whitespace(self
,list
,maxcount
);
5287 else if (substring
->length
== 1)
5288 return split_char(self
,list
,substring
->str
[0],maxcount
);
5290 else if (substring
->length
== 0) {
5292 PyErr_SetString(PyExc_ValueError
, "empty separator");
5296 return split_substring(self
,list
,substring
,maxcount
);
5300 PyObject
*rsplit(PyUnicodeObject
*self
,
5301 PyUnicodeObject
*substring
,
5302 Py_ssize_t maxcount
)
5307 maxcount
= PY_SSIZE_T_MAX
;
5309 list
= PyList_New(0);
5313 if (substring
== NULL
)
5314 return rsplit_whitespace(self
,list
,maxcount
);
5316 else if (substring
->length
== 1)
5317 return rsplit_char(self
,list
,substring
->str
[0],maxcount
);
5319 else if (substring
->length
== 0) {
5321 PyErr_SetString(PyExc_ValueError
, "empty separator");
5325 return rsplit_substring(self
,list
,substring
,maxcount
);
5329 PyObject
*replace(PyUnicodeObject
*self
,
5330 PyUnicodeObject
*str1
,
5331 PyUnicodeObject
*str2
,
5332 Py_ssize_t maxcount
)
5337 maxcount
= PY_SSIZE_T_MAX
;
5339 if (str1
->length
== str2
->length
) {
5342 if (str1
->length
== 1) {
5343 /* replace characters */
5345 if (!findchar(self
->str
, self
->length
, str1
->str
[0]))
5347 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5350 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5353 for (i
= 0; i
< u
->length
; i
++)
5354 if (u
->str
[i
] == u1
) {
5361 self
->str
, self
->length
, str1
->str
, str1
->length
, FAST_SEARCH
5365 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5368 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5369 while (i
<= self
->length
- str1
->length
)
5370 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
5373 Py_UNICODE_COPY(u
->str
+i
, str2
->str
, str2
->length
);
5380 Py_ssize_t n
, i
, j
, e
;
5381 Py_ssize_t product
, new_size
, delta
;
5384 /* replace strings */
5385 n
= stringlib_count(self
->str
, self
->length
, str1
->str
, str1
->length
);
5390 /* new_size = self->length + n * (str2->length - str1->length)); */
5391 delta
= (str2
->length
- str1
->length
);
5393 new_size
= self
->length
;
5395 product
= n
* (str2
->length
- str1
->length
);
5396 if ((product
/ (str2
->length
- str1
->length
)) != n
) {
5397 PyErr_SetString(PyExc_OverflowError
,
5398 "replace string is too long");
5401 new_size
= self
->length
+ product
;
5403 PyErr_SetString(PyExc_OverflowError
,
5404 "replace string is too long");
5408 u
= _PyUnicode_New(new_size
);
5413 e
= self
->length
- str1
->length
;
5414 if (str1
->length
> 0) {
5416 /* look for next match */
5419 if (Py_UNICODE_MATCH(self
, j
, str1
))
5426 /* copy unchanged part [i:j] */
5427 Py_UNICODE_COPY(p
, self
->str
+i
, j
-i
);
5430 /* copy substitution string */
5431 if (str2
->length
> 0) {
5432 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
5435 i
= j
+ str1
->length
;
5437 if (i
< self
->length
)
5438 /* copy tail [i:] */
5439 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
5443 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
5447 *p
++ = self
->str
[i
++];
5449 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
5452 return (PyObject
*) u
;
5455 /* nothing to replace; return original string (when possible) */
5456 if (PyUnicode_CheckExact(self
)) {
5458 return (PyObject
*) self
;
5460 return PyUnicode_FromUnicode(self
->str
, self
->length
);
5463 /* --- Unicode Object Methods --------------------------------------------- */
5465 PyDoc_STRVAR(title__doc__
,
5466 "S.title() -> unicode\n\
5468 Return a titlecased version of S, i.e. words start with title case\n\
5469 characters, all remaining cased characters have lower case.");
5472 unicode_title(PyUnicodeObject
*self
)
5474 return fixup(self
, fixtitle
);
5477 PyDoc_STRVAR(capitalize__doc__
,
5478 "S.capitalize() -> unicode\n\
5480 Return a capitalized version of S, i.e. make the first character\n\
5484 unicode_capitalize(PyUnicodeObject
*self
)
5486 return fixup(self
, fixcapitalize
);
5490 PyDoc_STRVAR(capwords__doc__
,
5491 "S.capwords() -> unicode\n\
5493 Apply .capitalize() to all words in S and return the result with\n\
5494 normalized whitespace (all whitespace strings are replaced by ' ').");
5497 unicode_capwords(PyUnicodeObject
*self
)
5503 /* Split into words */
5504 list
= split(self
, NULL
, -1);
5508 /* Capitalize each word */
5509 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
5510 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
5514 Py_DECREF(PyList_GET_ITEM(list
, i
));
5515 PyList_SET_ITEM(list
, i
, item
);
5518 /* Join the words to form a new string */
5519 item
= PyUnicode_Join(NULL
, list
);
5523 return (PyObject
*)item
;
5527 /* Argument converter. Coerces to a single unicode character */
5530 convert_uc(PyObject
*obj
, void *addr
)
5532 Py_UNICODE
*fillcharloc
= (Py_UNICODE
*)addr
;
5536 uniobj
= PyUnicode_FromObject(obj
);
5537 if (uniobj
== NULL
) {
5538 PyErr_SetString(PyExc_TypeError
,
5539 "The fill character cannot be converted to Unicode");
5542 if (PyUnicode_GET_SIZE(uniobj
) != 1) {
5543 PyErr_SetString(PyExc_TypeError
,
5544 "The fill character must be exactly one character long");
5548 unistr
= PyUnicode_AS_UNICODE(uniobj
);
5549 *fillcharloc
= unistr
[0];
5554 PyDoc_STRVAR(center__doc__
,
5555 "S.center(width[, fillchar]) -> unicode\n\
5557 Return S centered in a Unicode string of length width. Padding is\n\
5558 done using the specified fill character (default is a space)");
5561 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
5563 Py_ssize_t marg
, left
;
5565 Py_UNICODE fillchar
= ' ';
5567 if (!PyArg_ParseTuple(args
, "n|O&:center", &width
, convert_uc
, &fillchar
))
5570 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
5572 return (PyObject
*) self
;
5575 marg
= width
- self
->length
;
5576 left
= marg
/ 2 + (marg
& width
& 1);
5578 return (PyObject
*) pad(self
, left
, marg
- left
, fillchar
);
5583 /* This code should go into some future Unicode collation support
5584 module. The basic comparison should compare ordinals on a naive
5585 basis (this is what Java does and thus JPython too). */
5587 /* speedy UTF-16 code point order comparison */
5589 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
5591 static short utf16Fixup
[32] =
5593 0, 0, 0, 0, 0, 0, 0, 0,
5594 0, 0, 0, 0, 0, 0, 0, 0,
5595 0, 0, 0, 0, 0, 0, 0, 0,
5596 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
5600 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
5602 Py_ssize_t len1
, len2
;
5604 Py_UNICODE
*s1
= str1
->str
;
5605 Py_UNICODE
*s2
= str2
->str
;
5607 len1
= str1
->length
;
5608 len2
= str2
->length
;
5610 while (len1
> 0 && len2
> 0) {
5616 if (c1
> (1<<11) * 26)
5617 c1
+= utf16Fixup
[c1
>>11];
5618 if (c2
> (1<<11) * 26)
5619 c2
+= utf16Fixup
[c2
>>11];
5620 /* now c1 and c2 are in UTF-32-compatible order */
5623 return (c1
< c2
) ? -1 : 1;
5628 return (len1
< len2
) ? -1 : (len1
!= len2
);
5634 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
5636 register Py_ssize_t len1
, len2
;
5638 Py_UNICODE
*s1
= str1
->str
;
5639 Py_UNICODE
*s2
= str2
->str
;
5641 len1
= str1
->length
;
5642 len2
= str2
->length
;
5644 while (len1
> 0 && len2
> 0) {
5651 return (c1
< c2
) ? -1 : 1;
5656 return (len1
< len2
) ? -1 : (len1
!= len2
);
5661 int PyUnicode_Compare(PyObject
*left
,
5664 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
5667 /* Coerce the two arguments */
5668 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
5671 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
5675 /* Shortcut for empty or interned objects */
5682 result
= unicode_compare(u
, v
);
5694 PyObject
*PyUnicode_RichCompare(PyObject
*left
,
5700 result
= PyUnicode_Compare(left
, right
);
5701 if (result
== -1 && PyErr_Occurred())
5704 /* Convert the return value to a Boolean */
5707 result
= (result
== 0);
5710 result
= (result
!= 0);
5713 result
= (result
<= 0);
5716 result
= (result
>= 0);
5719 result
= (result
== -1);
5722 result
= (result
== 1);
5725 return PyBool_FromLong(result
);
5731 Type errors mean that PyUnicode_FromObject() could not convert
5732 one of the arguments (usually the right hand side) to Unicode,
5733 ie. we can't handle the comparison request. However, it is
5734 possible that the other object knows a comparison method, which
5735 is why we return Py_NotImplemented to give the other object a
5739 if (PyErr_ExceptionMatches(PyExc_TypeError
)) {
5741 Py_INCREF(Py_NotImplemented
);
5742 return Py_NotImplemented
;
5744 if (op
!= Py_EQ
&& op
!= Py_NE
)
5747 /* Equality comparison.
5749 This is a special case: we silence any PyExc_UnicodeDecodeError
5750 and instead turn it into a PyErr_UnicodeWarning.
5753 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError
))
5756 if (PyErr_Warn(PyExc_UnicodeWarning
,
5758 "Unicode equal comparison "
5759 "failed to convert both arguments to Unicode - "
5760 "interpreting them as being unequal" :
5761 "Unicode unequal comparison "
5762 "failed to convert both arguments to Unicode - "
5763 "interpreting them as being unequal"
5766 result
= (op
== Py_NE
);
5767 return PyBool_FromLong(result
);
5770 int PyUnicode_Contains(PyObject
*container
,
5773 PyObject
*str
, *sub
;
5776 /* Coerce the two arguments */
5777 sub
= PyUnicode_FromObject(element
);
5779 PyErr_SetString(PyExc_TypeError
,
5780 "'in <string>' requires string as left operand");
5784 str
= PyUnicode_FromObject(container
);
5790 result
= stringlib_contains_obj(str
, sub
);
5798 /* Concat to string or Unicode object giving a new Unicode object. */
5800 PyObject
*PyUnicode_Concat(PyObject
*left
,
5803 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
5805 /* Coerce the two arguments */
5806 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
5809 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
5814 if (v
== unicode_empty
) {
5816 return (PyObject
*)u
;
5818 if (u
== unicode_empty
) {
5820 return (PyObject
*)v
;
5823 /* Concat the two Unicode strings */
5824 w
= _PyUnicode_New(u
->length
+ v
->length
);
5827 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
5828 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
5832 return (PyObject
*)w
;
5840 PyDoc_STRVAR(count__doc__
,
5841 "S.count(sub[, start[, end]]) -> int\n\
5843 Return the number of non-overlapping occurrences of substring sub in\n\
5844 Unicode string S[start:end]. Optional arguments start and end are\n\
5845 interpreted as in slice notation.");
5848 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
5850 PyUnicodeObject
*substring
;
5851 Py_ssize_t start
= 0;
5852 Py_ssize_t end
= PY_SSIZE_T_MAX
;
5855 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
5856 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
5859 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
5860 (PyObject
*)substring
);
5861 if (substring
== NULL
)
5864 FIX_START_END(self
);
5866 result
= PyInt_FromSsize_t(
5867 stringlib_count(self
->str
+ start
, end
- start
,
5868 substring
->str
, substring
->length
)
5871 Py_DECREF(substring
);
5876 PyDoc_STRVAR(encode__doc__
,
5877 "S.encode([encoding[,errors]]) -> string or unicode\n\
5879 Encodes S using the codec registered for encoding. encoding defaults\n\
5880 to the default encoding. errors may be given to set a different error\n\
5881 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5882 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5883 'xmlcharrefreplace' as well as any other name registered with\n\
5884 codecs.register_error that can handle UnicodeEncodeErrors.");
5887 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
5889 char *encoding
= NULL
;
5890 char *errors
= NULL
;
5893 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
5895 v
= PyUnicode_AsEncodedObject((PyObject
*)self
, encoding
, errors
);
5898 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
5899 PyErr_Format(PyExc_TypeError
,
5900 "encoder did not return a string/unicode object "
5902 Py_Type(v
)->tp_name
);
5912 PyDoc_STRVAR(decode__doc__
,
5913 "S.decode([encoding[,errors]]) -> string or unicode\n\
5915 Decodes S using the codec registered for encoding. encoding defaults\n\
5916 to the default encoding. errors may be given to set a different error\n\
5917 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5918 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5919 as well as any other name registerd with codecs.register_error that is\n\
5920 able to handle UnicodeDecodeErrors.");
5923 unicode_decode(PyUnicodeObject
*self
, PyObject
*args
)
5925 char *encoding
= NULL
;
5926 char *errors
= NULL
;
5929 if (!PyArg_ParseTuple(args
, "|ss:decode", &encoding
, &errors
))
5931 v
= PyUnicode_AsDecodedObject((PyObject
*)self
, encoding
, errors
);
5934 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
5935 PyErr_Format(PyExc_TypeError
,
5936 "decoder did not return a string/unicode object "
5938 Py_Type(v
)->tp_name
);
5948 PyDoc_STRVAR(expandtabs__doc__
,
5949 "S.expandtabs([tabsize]) -> unicode\n\
5951 Return a copy of S where all tab characters are expanded using spaces.\n\
5952 If tabsize is not given, a tab size of 8 characters is assumed.");
5955 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
5960 Py_ssize_t i
, j
, old_j
;
5964 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
5967 /* First pass: determine size of output string */
5969 e
= self
->str
+ self
->length
;
5970 for (p
= self
->str
; p
< e
; p
++)
5973 j
+= tabsize
- (j
% tabsize
);
5975 PyErr_SetString(PyExc_OverflowError
,
5976 "new string is too long");
5984 if (*p
== '\n' || *p
== '\r') {
5988 PyErr_SetString(PyExc_OverflowError
,
5989 "new string is too long");
5996 PyErr_SetString(PyExc_OverflowError
, "new string is too long");
6000 /* Second pass: create output string and fill it */
6001 u
= _PyUnicode_New(i
+ j
);
6008 for (p
= self
->str
; p
< e
; p
++)
6011 i
= tabsize
- (j
% tabsize
);
6020 if (*p
== '\n' || *p
== '\r')
6024 return (PyObject
*) u
;
6027 PyDoc_STRVAR(find__doc__
,
6028 "S.find(sub [,start [,end]]) -> int\n\
6030 Return the lowest index in S where substring sub is found,\n\
6031 such that sub is contained within s[start:end]. Optional\n\
6032 arguments start and end are interpreted as in slice notation.\n\
6034 Return -1 on failure.");
6037 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
6039 PyObject
*substring
;
6040 Py_ssize_t start
= 0;
6041 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6044 if (!PyArg_ParseTuple(args
, "O|O&O&:find", &substring
,
6045 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6047 substring
= PyUnicode_FromObject(substring
);
6051 result
= stringlib_find_slice(
6052 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6053 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6057 Py_DECREF(substring
);
6059 return PyInt_FromSsize_t(result
);
6063 unicode_getitem(PyUnicodeObject
*self
, Py_ssize_t index
)
6065 if (index
< 0 || index
>= self
->length
) {
6066 PyErr_SetString(PyExc_IndexError
, "string index out of range");
6070 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
6074 unicode_hash(PyUnicodeObject
*self
)
6076 /* Since Unicode objects compare equal to their ASCII string
6077 counterparts, they should use the individual character values
6078 as basis for their hash value. This is needed to assure that
6079 strings and Unicode objects behave in the same way as
6082 register Py_ssize_t len
;
6083 register Py_UNICODE
*p
;
6086 if (self
->hash
!= -1)
6088 len
= PyUnicode_GET_SIZE(self
);
6089 p
= PyUnicode_AS_UNICODE(self
);
6092 x
= (1000003*x
) ^ *p
++;
6093 x
^= PyUnicode_GET_SIZE(self
);
6100 PyDoc_STRVAR(index__doc__
,
6101 "S.index(sub [,start [,end]]) -> int\n\
6103 Like S.find() but raise ValueError when the substring is not found.");
6106 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
6109 PyObject
*substring
;
6110 Py_ssize_t start
= 0;
6111 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6113 if (!PyArg_ParseTuple(args
, "O|O&O&:index", &substring
,
6114 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6116 substring
= PyUnicode_FromObject(substring
);
6120 result
= stringlib_find_slice(
6121 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6122 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6126 Py_DECREF(substring
);
6129 PyErr_SetString(PyExc_ValueError
, "substring not found");
6133 return PyInt_FromSsize_t(result
);
6136 PyDoc_STRVAR(islower__doc__
,
6137 "S.islower() -> bool\n\
6139 Return True if all cased characters in S are lowercase and there is\n\
6140 at least one cased character in S, False otherwise.");
6143 unicode_islower(PyUnicodeObject
*self
)
6145 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6146 register const Py_UNICODE
*e
;
6149 /* Shortcut for single character strings */
6150 if (PyUnicode_GET_SIZE(self
) == 1)
6151 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p
));
6153 /* Special case for empty strings */
6154 if (PyUnicode_GET_SIZE(self
) == 0)
6155 return PyBool_FromLong(0);
6157 e
= p
+ PyUnicode_GET_SIZE(self
);
6159 for (; p
< e
; p
++) {
6160 register const Py_UNICODE ch
= *p
;
6162 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
6163 return PyBool_FromLong(0);
6164 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
6167 return PyBool_FromLong(cased
);
6170 PyDoc_STRVAR(isupper__doc__
,
6171 "S.isupper() -> bool\n\
6173 Return True if all cased characters in S are uppercase and there is\n\
6174 at least one cased character in S, False otherwise.");
6177 unicode_isupper(PyUnicodeObject
*self
)
6179 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6180 register const Py_UNICODE
*e
;
6183 /* Shortcut for single character strings */
6184 if (PyUnicode_GET_SIZE(self
) == 1)
6185 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
6187 /* Special case for empty strings */
6188 if (PyUnicode_GET_SIZE(self
) == 0)
6189 return PyBool_FromLong(0);
6191 e
= p
+ PyUnicode_GET_SIZE(self
);
6193 for (; p
< e
; p
++) {
6194 register const Py_UNICODE ch
= *p
;
6196 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
6197 return PyBool_FromLong(0);
6198 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
6201 return PyBool_FromLong(cased
);
6204 PyDoc_STRVAR(istitle__doc__
,
6205 "S.istitle() -> bool\n\
6207 Return True if S is a titlecased string and there is at least one\n\
6208 character in S, i.e. upper- and titlecase characters may only\n\
6209 follow uncased characters and lowercase characters only cased ones.\n\
6210 Return False otherwise.");
6213 unicode_istitle(PyUnicodeObject
*self
)
6215 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6216 register const Py_UNICODE
*e
;
6217 int cased
, previous_is_cased
;
6219 /* Shortcut for single character strings */
6220 if (PyUnicode_GET_SIZE(self
) == 1)
6221 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
6222 (Py_UNICODE_ISUPPER(*p
) != 0));
6224 /* Special case for empty strings */
6225 if (PyUnicode_GET_SIZE(self
) == 0)
6226 return PyBool_FromLong(0);
6228 e
= p
+ PyUnicode_GET_SIZE(self
);
6230 previous_is_cased
= 0;
6231 for (; p
< e
; p
++) {
6232 register const Py_UNICODE ch
= *p
;
6234 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
6235 if (previous_is_cased
)
6236 return PyBool_FromLong(0);
6237 previous_is_cased
= 1;
6240 else if (Py_UNICODE_ISLOWER(ch
)) {
6241 if (!previous_is_cased
)
6242 return PyBool_FromLong(0);
6243 previous_is_cased
= 1;
6247 previous_is_cased
= 0;
6249 return PyBool_FromLong(cased
);
6252 PyDoc_STRVAR(isspace__doc__
,
6253 "S.isspace() -> bool\n\
6255 Return True if all characters in S are whitespace\n\
6256 and there is at least one character in S, False otherwise.");
6259 unicode_isspace(PyUnicodeObject
*self
)
6261 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6262 register const Py_UNICODE
*e
;
6264 /* Shortcut for single character strings */
6265 if (PyUnicode_GET_SIZE(self
) == 1 &&
6266 Py_UNICODE_ISSPACE(*p
))
6267 return PyBool_FromLong(1);
6269 /* Special case for empty strings */
6270 if (PyUnicode_GET_SIZE(self
) == 0)
6271 return PyBool_FromLong(0);
6273 e
= p
+ PyUnicode_GET_SIZE(self
);
6274 for (; p
< e
; p
++) {
6275 if (!Py_UNICODE_ISSPACE(*p
))
6276 return PyBool_FromLong(0);
6278 return PyBool_FromLong(1);
6281 PyDoc_STRVAR(isalpha__doc__
,
6282 "S.isalpha() -> bool\n\
6284 Return True if all characters in S are alphabetic\n\
6285 and there is at least one character in S, False otherwise.");
6288 unicode_isalpha(PyUnicodeObject
*self
)
6290 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6291 register const Py_UNICODE
*e
;
6293 /* Shortcut for single character strings */
6294 if (PyUnicode_GET_SIZE(self
) == 1 &&
6295 Py_UNICODE_ISALPHA(*p
))
6296 return PyBool_FromLong(1);
6298 /* Special case for empty strings */
6299 if (PyUnicode_GET_SIZE(self
) == 0)
6300 return PyBool_FromLong(0);
6302 e
= p
+ PyUnicode_GET_SIZE(self
);
6303 for (; p
< e
; p
++) {
6304 if (!Py_UNICODE_ISALPHA(*p
))
6305 return PyBool_FromLong(0);
6307 return PyBool_FromLong(1);
6310 PyDoc_STRVAR(isalnum__doc__
,
6311 "S.isalnum() -> bool\n\
6313 Return True if all characters in S are alphanumeric\n\
6314 and there is at least one character in S, False otherwise.");
6317 unicode_isalnum(PyUnicodeObject
*self
)
6319 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6320 register const Py_UNICODE
*e
;
6322 /* Shortcut for single character strings */
6323 if (PyUnicode_GET_SIZE(self
) == 1 &&
6324 Py_UNICODE_ISALNUM(*p
))
6325 return PyBool_FromLong(1);
6327 /* Special case for empty strings */
6328 if (PyUnicode_GET_SIZE(self
) == 0)
6329 return PyBool_FromLong(0);
6331 e
= p
+ PyUnicode_GET_SIZE(self
);
6332 for (; p
< e
; p
++) {
6333 if (!Py_UNICODE_ISALNUM(*p
))
6334 return PyBool_FromLong(0);
6336 return PyBool_FromLong(1);
6339 PyDoc_STRVAR(isdecimal__doc__
,
6340 "S.isdecimal() -> bool\n\
6342 Return True if there are only decimal characters in S,\n\
6346 unicode_isdecimal(PyUnicodeObject
*self
)
6348 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6349 register const Py_UNICODE
*e
;
6351 /* Shortcut for single character strings */
6352 if (PyUnicode_GET_SIZE(self
) == 1 &&
6353 Py_UNICODE_ISDECIMAL(*p
))
6354 return PyBool_FromLong(1);
6356 /* Special case for empty strings */
6357 if (PyUnicode_GET_SIZE(self
) == 0)
6358 return PyBool_FromLong(0);
6360 e
= p
+ PyUnicode_GET_SIZE(self
);
6361 for (; p
< e
; p
++) {
6362 if (!Py_UNICODE_ISDECIMAL(*p
))
6363 return PyBool_FromLong(0);
6365 return PyBool_FromLong(1);
6368 PyDoc_STRVAR(isdigit__doc__
,
6369 "S.isdigit() -> bool\n\
6371 Return True if all characters in S are digits\n\
6372 and there is at least one character in S, False otherwise.");
6375 unicode_isdigit(PyUnicodeObject
*self
)
6377 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6378 register const Py_UNICODE
*e
;
6380 /* Shortcut for single character strings */
6381 if (PyUnicode_GET_SIZE(self
) == 1 &&
6382 Py_UNICODE_ISDIGIT(*p
))
6383 return PyBool_FromLong(1);
6385 /* Special case for empty strings */
6386 if (PyUnicode_GET_SIZE(self
) == 0)
6387 return PyBool_FromLong(0);
6389 e
= p
+ PyUnicode_GET_SIZE(self
);
6390 for (; p
< e
; p
++) {
6391 if (!Py_UNICODE_ISDIGIT(*p
))
6392 return PyBool_FromLong(0);
6394 return PyBool_FromLong(1);
6397 PyDoc_STRVAR(isnumeric__doc__
,
6398 "S.isnumeric() -> bool\n\
6400 Return True if there are only numeric characters in S,\n\
6404 unicode_isnumeric(PyUnicodeObject
*self
)
6406 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6407 register const Py_UNICODE
*e
;
6409 /* Shortcut for single character strings */
6410 if (PyUnicode_GET_SIZE(self
) == 1 &&
6411 Py_UNICODE_ISNUMERIC(*p
))
6412 return PyBool_FromLong(1);
6414 /* Special case for empty strings */
6415 if (PyUnicode_GET_SIZE(self
) == 0)
6416 return PyBool_FromLong(0);
6418 e
= p
+ PyUnicode_GET_SIZE(self
);
6419 for (; p
< e
; p
++) {
6420 if (!Py_UNICODE_ISNUMERIC(*p
))
6421 return PyBool_FromLong(0);
6423 return PyBool_FromLong(1);
6426 PyDoc_STRVAR(join__doc__
,
6427 "S.join(sequence) -> unicode\n\
6429 Return a string which is the concatenation of the strings in the\n\
6430 sequence. The separator between elements is S.");
6433 unicode_join(PyObject
*self
, PyObject
*data
)
6435 return PyUnicode_Join(self
, data
);
6439 unicode_length(PyUnicodeObject
*self
)
6441 return self
->length
;
6444 PyDoc_STRVAR(ljust__doc__
,
6445 "S.ljust(width[, fillchar]) -> int\n\
6447 Return S left justified in a Unicode string of length width. Padding is\n\
6448 done using the specified fill character (default is a space).");
6451 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
6454 Py_UNICODE fillchar
= ' ';
6456 if (!PyArg_ParseTuple(args
, "n|O&:ljust", &width
, convert_uc
, &fillchar
))
6459 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6461 return (PyObject
*) self
;
6464 return (PyObject
*) pad(self
, 0, width
- self
->length
, fillchar
);
6467 PyDoc_STRVAR(lower__doc__
,
6468 "S.lower() -> unicode\n\
6470 Return a copy of the string S converted to lowercase.");
6473 unicode_lower(PyUnicodeObject
*self
)
6475 return fixup(self
, fixlower
);
/* Strip-type selector.  The strip helpers in this file also refer to
   LEFTSTRIP and BOTHSTRIP; their definitions are not visible in this
   extract. */
6479 #define RIGHTSTRIP 1
6482 /* Arrays indexed by above */
/* stripformat[striptype] is the PyArg_ParseTuple format string used by
   do_argstrip() for that strip type. */
6483 static const char *stripformat
[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
/* STRIPNAME(i) skips the three-character "|O:" prefix, leaving just the
   method name for use in error messages. */
6485 #define STRIPNAME(i) (stripformat[i]+3)
6487 /* externally visible for str.strip(unicode) */
6489 _PyUnicode_XStrip(PyUnicodeObject
*self
, int striptype
, PyObject
*sepobj
)
6491 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
6492 Py_ssize_t len
= PyUnicode_GET_SIZE(self
);
6493 Py_UNICODE
*sep
= PyUnicode_AS_UNICODE(sepobj
);
6494 Py_ssize_t seplen
= PyUnicode_GET_SIZE(sepobj
);
6497 BLOOM_MASK sepmask
= make_bloom_mask(sep
, seplen
);
6500 if (striptype
!= RIGHTSTRIP
) {
6501 while (i
< len
&& BLOOM_MEMBER(sepmask
, s
[i
], sep
, seplen
)) {
6507 if (striptype
!= LEFTSTRIP
) {
6510 } while (j
>= i
&& BLOOM_MEMBER(sepmask
, s
[j
], sep
, seplen
));
6514 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
6516 return (PyObject
*)self
;
6519 return PyUnicode_FromUnicode(s
+i
, j
-i
);
6524 do_strip(PyUnicodeObject
*self
, int striptype
)
6526 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
6527 Py_ssize_t len
= PyUnicode_GET_SIZE(self
), i
, j
;
6530 if (striptype
!= RIGHTSTRIP
) {
6531 while (i
< len
&& Py_UNICODE_ISSPACE(s
[i
])) {
6537 if (striptype
!= LEFTSTRIP
) {
6540 } while (j
>= i
&& Py_UNICODE_ISSPACE(s
[j
]));
6544 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
6546 return (PyObject
*)self
;
6549 return PyUnicode_FromUnicode(s
+i
, j
-i
);
6554 do_argstrip(PyUnicodeObject
*self
, int striptype
, PyObject
*args
)
6556 PyObject
*sep
= NULL
;
6558 if (!PyArg_ParseTuple(args
, (char *)stripformat
[striptype
], &sep
))
6561 if (sep
!= NULL
&& sep
!= Py_None
) {
6562 if (PyUnicode_Check(sep
))
6563 return _PyUnicode_XStrip(self
, striptype
, sep
);
6564 else if (PyString_Check(sep
)) {
6566 sep
= PyUnicode_FromObject(sep
);
6569 res
= _PyUnicode_XStrip(self
, striptype
, sep
);
6574 PyErr_Format(PyExc_TypeError
,
6575 "%s arg must be None, unicode or str",
6576 STRIPNAME(striptype
));
6581 return do_strip(self
, striptype
);
6585 PyDoc_STRVAR(strip__doc__
,
6586 "S.strip([chars]) -> unicode\n\
6588 Return a copy of the string S with leading and trailing\n\
6589 whitespace removed.\n\
6590 If chars is given and not None, remove characters in chars instead.\n\
6591 If chars is a str, it will be converted to unicode before stripping");
6594 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
6596 if (PyTuple_GET_SIZE(args
) == 0)
6597 return do_strip(self
, BOTHSTRIP
); /* Common case */
6599 return do_argstrip(self
, BOTHSTRIP
, args
);
6603 PyDoc_STRVAR(lstrip__doc__
,
6604 "S.lstrip([chars]) -> unicode\n\
6606 Return a copy of the string S with leading whitespace removed.\n\
6607 If chars is given and not None, remove characters in chars instead.\n\
6608 If chars is a str, it will be converted to unicode before stripping");
6611 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
6613 if (PyTuple_GET_SIZE(args
) == 0)
6614 return do_strip(self
, LEFTSTRIP
); /* Common case */
6616 return do_argstrip(self
, LEFTSTRIP
, args
);
6620 PyDoc_STRVAR(rstrip__doc__
,
6621 "S.rstrip([chars]) -> unicode\n\
6623 Return a copy of the string S with trailing whitespace removed.\n\
6624 If chars is given and not None, remove characters in chars instead.\n\
6625 If chars is a str, it will be converted to unicode before stripping");
6628 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
6630 if (PyTuple_GET_SIZE(args
) == 0)
6631 return do_strip(self
, RIGHTSTRIP
); /* Common case */
6633 return do_argstrip(self
, RIGHTSTRIP
, args
);
6638 unicode_repeat(PyUnicodeObject
*str
, Py_ssize_t len
)
6648 if (len
== 1 && PyUnicode_CheckExact(str
)) {
6649 /* no repeat, return original string */
6651 return (PyObject
*) str
;
6654 /* ensure # of chars needed doesn't overflow int and # of bytes
6655 * needed doesn't overflow size_t
6657 nchars
= len
* str
->length
;
6658 if (len
&& nchars
/ len
!= str
->length
) {
6659 PyErr_SetString(PyExc_OverflowError
,
6660 "repeated string is too long");
6663 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
6664 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
6665 PyErr_SetString(PyExc_OverflowError
,
6666 "repeated string is too long");
6669 u
= _PyUnicode_New(nchars
);
6675 if (str
->length
== 1 && len
> 0) {
6676 Py_UNICODE_FILL(p
, str
->str
[0], len
);
6678 Py_ssize_t done
= 0; /* number of characters copied this far */
6679 if (done
< nchars
) {
6680 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
6683 while (done
< nchars
) {
6684 int n
= (done
<= nchars
-done
) ? done
: nchars
-done
;
6685 Py_UNICODE_COPY(p
+done
, p
, n
);
6690 return (PyObject
*) u
;
6693 PyObject
*PyUnicode_Replace(PyObject
*obj
,
6696 Py_ssize_t maxcount
)
6703 self
= PyUnicode_FromObject(obj
);
6706 str1
= PyUnicode_FromObject(subobj
);
6711 str2
= PyUnicode_FromObject(replobj
);
6717 result
= replace((PyUnicodeObject
*)self
,
6718 (PyUnicodeObject
*)str1
,
6719 (PyUnicodeObject
*)str2
,
6727 PyDoc_STRVAR(replace__doc__
,
6728 "S.replace (old, new[, maxsplit]) -> unicode\n\
6730 Return a copy of S with all occurrences of substring\n\
6731 old replaced by new. If the optional argument maxsplit is\n\
6732 given, only the first maxsplit occurrences are replaced.");
6735 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
6737 PyUnicodeObject
*str1
;
6738 PyUnicodeObject
*str2
;
6739 Py_ssize_t maxcount
= -1;
6742 if (!PyArg_ParseTuple(args
, "OO|n:replace", &str1
, &str2
, &maxcount
))
6744 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
6747 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
6753 result
= replace(self
, str1
, str2
, maxcount
);
6761 PyObject
*unicode_repr(PyObject
*unicode
)
6763 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
6764 PyUnicode_GET_SIZE(unicode
),
6768 PyDoc_STRVAR(rfind__doc__
,
6769 "S.rfind(sub [,start [,end]]) -> int\n\
6771 Return the highest index in S where substring sub is found,\n\
6772 such that sub is contained within s[start:end]. Optional\n\
6773 arguments start and end are interpreted as in slice notation.\n\
6775 Return -1 on failure.");
6778 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
6780 PyObject
*substring
;
6781 Py_ssize_t start
= 0;
6782 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6785 if (!PyArg_ParseTuple(args
, "O|O&O&:rfind", &substring
,
6786 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6788 substring
= PyUnicode_FromObject(substring
);
6792 result
= stringlib_rfind_slice(
6793 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6794 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6798 Py_DECREF(substring
);
6800 return PyInt_FromSsize_t(result
);
6803 PyDoc_STRVAR(rindex__doc__
,
6804 "S.rindex(sub [,start [,end]]) -> int\n\
6806 Like S.rfind() but raise ValueError when the substring is not found.");
6809 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
6811 PyObject
*substring
;
6812 Py_ssize_t start
= 0;
6813 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6816 if (!PyArg_ParseTuple(args
, "O|O&O&:rindex", &substring
,
6817 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6819 substring
= PyUnicode_FromObject(substring
);
6823 result
= stringlib_rfind_slice(
6824 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6825 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6829 Py_DECREF(substring
);
6832 PyErr_SetString(PyExc_ValueError
, "substring not found");
6835 return PyInt_FromSsize_t(result
);
6838 PyDoc_STRVAR(rjust__doc__
,
6839 "S.rjust(width[, fillchar]) -> unicode\n\
6841 Return S right justified in a Unicode string of length width. Padding is\n\
6842 done using the specified fill character (default is a space).");
6845 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
6848 Py_UNICODE fillchar
= ' ';
6850 if (!PyArg_ParseTuple(args
, "n|O&:rjust", &width
, convert_uc
, &fillchar
))
6853 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6855 return (PyObject
*) self
;
6858 return (PyObject
*) pad(self
, width
- self
->length
, 0, fillchar
);
6862 unicode_slice(PyUnicodeObject
*self
, Py_ssize_t start
, Py_ssize_t end
)
6864 /* standard clamping */
6869 if (end
> self
->length
)
6871 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
6872 /* full slice, return original string */
6874 return (PyObject
*) self
;
6879 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
6883 PyObject
*PyUnicode_Split(PyObject
*s
,
6885 Py_ssize_t maxsplit
)
6889 s
= PyUnicode_FromObject(s
);
6893 sep
= PyUnicode_FromObject(sep
);
6900 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
6907 PyDoc_STRVAR(split__doc__
,
6908 "S.split([sep [,maxsplit]]) -> list of strings\n\
6910 Return a list of the words in S, using sep as the\n\
6911 delimiter string. If maxsplit is given, at most maxsplit\n\
6912 splits are done. If sep is not specified or is None,\n\
6913 any whitespace string is a separator.");
6916 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
6918 PyObject
*substring
= Py_None
;
6919 Py_ssize_t maxcount
= -1;
6921 if (!PyArg_ParseTuple(args
, "|On:split", &substring
, &maxcount
))
6924 if (substring
== Py_None
)
6925 return split(self
, NULL
, maxcount
);
6926 else if (PyUnicode_Check(substring
))
6927 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
6929 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
6933 PyUnicode_Partition(PyObject
*str_in
, PyObject
*sep_in
)
6939 str_obj
= PyUnicode_FromObject(str_in
);
6942 sep_obj
= PyUnicode_FromObject(sep_in
);
6948 out
= stringlib_partition(
6949 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
6950 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
6961 PyUnicode_RPartition(PyObject
*str_in
, PyObject
*sep_in
)
6967 str_obj
= PyUnicode_FromObject(str_in
);
6970 sep_obj
= PyUnicode_FromObject(sep_in
);
6976 out
= stringlib_rpartition(
6977 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
6978 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
6987 PyDoc_STRVAR(partition__doc__
,
6988 "S.partition(sep) -> (head, sep, tail)\n\
6990 Searches for the separator sep in S, and returns the part before it,\n\
6991 the separator itself, and the part after it. If the separator is not\n\
6992 found, returns S and two empty strings.");
6995 unicode_partition(PyUnicodeObject
*self
, PyObject
*separator
)
6997 return PyUnicode_Partition((PyObject
*)self
, separator
);
7000 PyDoc_STRVAR(rpartition__doc__
,
7001 "S.rpartition(sep) -> (tail, sep, head)\n\
7003 Searches for the separator sep in S, starting at the end of S, and returns\n\
7004 the part before it, the separator itself, and the part after it. If the\n\
7005 separator is not found, returns two empty strings and S.");
7008 unicode_rpartition(PyUnicodeObject
*self
, PyObject
*separator
)
7010 return PyUnicode_RPartition((PyObject
*)self
, separator
);
7013 PyObject
*PyUnicode_RSplit(PyObject
*s
,
7015 Py_ssize_t maxsplit
)
7019 s
= PyUnicode_FromObject(s
);
7023 sep
= PyUnicode_FromObject(sep
);
7030 result
= rsplit((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
7037 PyDoc_STRVAR(rsplit__doc__
,
7038 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7040 Return a list of the words in S, using sep as the\n\
7041 delimiter string, starting at the end of the string and\n\
7042 working to the front. If maxsplit is given, at most maxsplit\n\
7043 splits are done. If sep is not specified, any whitespace string\n\
7047 unicode_rsplit(PyUnicodeObject
*self
, PyObject
*args
)
7049 PyObject
*substring
= Py_None
;
7050 Py_ssize_t maxcount
= -1;
7052 if (!PyArg_ParseTuple(args
, "|On:rsplit", &substring
, &maxcount
))
7055 if (substring
== Py_None
)
7056 return rsplit(self
, NULL
, maxcount
);
7057 else if (PyUnicode_Check(substring
))
7058 return rsplit(self
, (PyUnicodeObject
*)substring
, maxcount
);
7060 return PyUnicode_RSplit((PyObject
*)self
, substring
, maxcount
);
7063 PyDoc_STRVAR(splitlines__doc__
,
7064 "S.splitlines([keepends]]) -> list of strings\n\
7066 Return a list of the lines in S, breaking at line boundaries.\n\
7067 Line breaks are not included in the resulting list unless keepends\n\
7068 is given and true.");
7071 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
7075 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
7078 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
7082 PyObject
*unicode_str(PyUnicodeObject
*self
)
7084 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
7087 PyDoc_STRVAR(swapcase__doc__
,
7088 "S.swapcase() -> unicode\n\
7090 Return a copy of S with uppercase characters converted to lowercase\n\
7094 unicode_swapcase(PyUnicodeObject
*self
)
7096 return fixup(self
, fixswapcase
);
7099 PyDoc_STRVAR(translate__doc__
,
7100 "S.translate(table) -> unicode\n\
7102 Return a copy of the string S, where all characters have been mapped\n\
7103 through the given translation table, which must be a mapping of\n\
7104 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7105 Unmapped characters are left untouched. Characters mapped to None\n\
7109 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
7111 return PyUnicode_TranslateCharmap(self
->str
,
7117 PyDoc_STRVAR(upper__doc__
,
7118 "S.upper() -> unicode\n\
7120 Return a copy of S converted to uppercase.");
7123 unicode_upper(PyUnicodeObject
*self
)
7125 return fixup(self
, fixupper
);
7128 PyDoc_STRVAR(zfill__doc__
,
7129 "S.zfill(width) -> unicode\n\
7131 Pad a numeric string x with zeros on the left, to fill a field\n\
7132 of the specified width. The string x is never truncated.");
7135 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
7141 if (!PyArg_ParseTuple(args
, "n:zfill", &width
))
7144 if (self
->length
>= width
) {
7145 if (PyUnicode_CheckExact(self
)) {
7147 return (PyObject
*) self
;
7150 return PyUnicode_FromUnicode(
7151 PyUnicode_AS_UNICODE(self
),
7152 PyUnicode_GET_SIZE(self
)
7156 fill
= width
- self
->length
;
7158 u
= pad(self
, fill
, 0, '0');
7163 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
7164 /* move sign to beginning of string */
7165 u
->str
[0] = u
->str
[fill
];
7169 return (PyObject
*) u
;
/* Returns the current value of unicode_freelist_size as a Python int.
   The method table describes this entry as used only for debugging the
   implementation. */
7174 unicode_freelistsize(PyUnicodeObject
*self
)
7176 return PyInt_FromLong(unicode_freelist_size
);
7180 PyDoc_STRVAR(startswith__doc__
,
7181 "S.startswith(prefix[, start[, end]]) -> bool\n\
7183 Return True if S starts with the specified prefix, False otherwise.\n\
7184 With optional start, test S beginning at that position.\n\
7185 With optional end, stop comparing S at that position.\n\
7186 prefix can also be a tuple of strings to try.");
7189 unicode_startswith(PyUnicodeObject
*self
,
7193 PyUnicodeObject
*substring
;
7194 Py_ssize_t start
= 0;
7195 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7198 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &subobj
,
7199 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
7201 if (PyTuple_Check(subobj
)) {
7203 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7204 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7205 PyTuple_GET_ITEM(subobj
, i
));
7206 if (substring
== NULL
)
7208 result
= tailmatch(self
, substring
, start
, end
, -1);
7209 Py_DECREF(substring
);
7214 /* nothing matched */
7217 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7218 if (substring
== NULL
)
7220 result
= tailmatch(self
, substring
, start
, end
, -1);
7221 Py_DECREF(substring
);
7222 return PyBool_FromLong(result
);
7226 PyDoc_STRVAR(endswith__doc__
,
7227 "S.endswith(suffix[, start[, end]]) -> bool\n\
7229 Return True if S ends with the specified suffix, False otherwise.\n\
7230 With optional start, test S beginning at that position.\n\
7231 With optional end, stop comparing S at that position.\n\
7232 suffix can also be a tuple of strings to try.");
7235 unicode_endswith(PyUnicodeObject
*self
,
7239 PyUnicodeObject
*substring
;
7240 Py_ssize_t start
= 0;
7241 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7244 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &subobj
,
7245 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
7247 if (PyTuple_Check(subobj
)) {
7249 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7250 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7251 PyTuple_GET_ITEM(subobj
, i
));
7252 if (substring
== NULL
)
7254 result
= tailmatch(self
, substring
, start
, end
, +1);
7255 Py_DECREF(substring
);
7262 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7263 if (substring
== NULL
)
7266 result
= tailmatch(self
, substring
, start
, end
, +1);
7267 Py_DECREF(substring
);
7268 return PyBool_FromLong(result
);
7274 unicode_getnewargs(PyUnicodeObject
*v
)
7276 return Py_BuildValue("(u#)", v
->str
, v
->length
);
7280 static PyMethodDef unicode_methods
[] = {
7282 /* Order is according to common usage: often used methods should
7283 appear first, since lookup is done sequentially. */
7285 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
, encode__doc__
},
7286 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
7287 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
7288 {"rsplit", (PyCFunction
) unicode_rsplit
, METH_VARARGS
, rsplit__doc__
},
7289 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
7290 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
7291 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
7292 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
7293 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
7294 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
7295 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
7296 {"partition", (PyCFunction
) unicode_partition
, METH_O
, partition__doc__
},
7297 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
7298 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
7299 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
7300 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_VARARGS
, lstrip__doc__
},
7301 {"decode", (PyCFunction
) unicode_decode
, METH_VARARGS
, decode__doc__
},
7302 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7303 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
7304 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
7305 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
7306 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_VARARGS
, rstrip__doc__
},
7307 {"rpartition", (PyCFunction
) unicode_rpartition
, METH_O
, rpartition__doc__
},
7308 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
7309 {"strip", (PyCFunction
) unicode_strip
, METH_VARARGS
, strip__doc__
},
7310 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
7311 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
7312 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
7313 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
7314 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
7315 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
7316 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
7317 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
7318 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
7319 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
7320 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
7321 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
7322 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
7323 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
7324 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
7326 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
7330 /* This one is just used for debugging the implementation. */
7331 {"freelistsize", (PyCFunction
) unicode_freelistsize
, METH_NOARGS
},
7334 {"__getnewargs__", (PyCFunction
)unicode_getnewargs
, METH_NOARGS
},
7339 unicode_mod(PyObject
*v
, PyObject
*w
)
7341 if (!PyUnicode_Check(v
)) {
7342 Py_INCREF(Py_NotImplemented
);
7343 return Py_NotImplemented
;
7345 return PyUnicode_Format(v
, w
);
7348 static PyNumberMethods unicode_as_number
= {
7353 unicode_mod
, /*nb_remainder*/
7356 static PySequenceMethods unicode_as_sequence
= {
7357 (lenfunc
) unicode_length
, /* sq_length */
7358 PyUnicode_Concat
, /* sq_concat */
7359 (ssizeargfunc
) unicode_repeat
, /* sq_repeat */
7360 (ssizeargfunc
) unicode_getitem
, /* sq_item */
7361 (ssizessizeargfunc
) unicode_slice
, /* sq_slice */
7362 0, /* sq_ass_item */
7363 0, /* sq_ass_slice */
7364 PyUnicode_Contains
, /* sq_contains */
7368 unicode_subscript(PyUnicodeObject
* self
, PyObject
* item
)
7370 if (PyIndex_Check(item
)) {
7371 Py_ssize_t i
= PyNumber_AsSsize_t(item
, PyExc_IndexError
);
7372 if (i
== -1 && PyErr_Occurred())
7375 i
+= PyUnicode_GET_SIZE(self
);
7376 return unicode_getitem(self
, i
);
7377 } else if (PySlice_Check(item
)) {
7378 Py_ssize_t start
, stop
, step
, slicelength
, cur
, i
;
7379 Py_UNICODE
* source_buf
;
7380 Py_UNICODE
* result_buf
;
7383 if (PySlice_GetIndicesEx((PySliceObject
*)item
, PyUnicode_GET_SIZE(self
),
7384 &start
, &stop
, &step
, &slicelength
) < 0) {
7388 if (slicelength
<= 0) {
7389 return PyUnicode_FromUnicode(NULL
, 0);
7390 } else if (start
== 0 && step
== 1 && slicelength
== self
->length
&&
7391 PyUnicode_CheckExact(self
)) {
7393 return (PyObject
*)self
;
7394 } else if (step
== 1) {
7395 return PyUnicode_FromUnicode(self
->str
+ start
, slicelength
);
7397 source_buf
= PyUnicode_AS_UNICODE((PyObject
*)self
);
7398 result_buf
= (Py_UNICODE
*)PyMem_MALLOC(slicelength
*
7399 sizeof(Py_UNICODE
));
7401 if (result_buf
== NULL
)
7402 return PyErr_NoMemory();
7404 for (cur
= start
, i
= 0; i
< slicelength
; cur
+= step
, i
++) {
7405 result_buf
[i
] = source_buf
[cur
];
7408 result
= PyUnicode_FromUnicode(result_buf
, slicelength
);
7409 PyMem_FREE(result_buf
);
7413 PyErr_SetString(PyExc_TypeError
, "string indices must be integers");
7418 static PyMappingMethods unicode_as_mapping
= {
7419 (lenfunc
)unicode_length
, /* mp_length */
7420 (binaryfunc
)unicode_subscript
, /* mp_subscript */
7421 (objobjargproc
)0, /* mp_ass_subscript */
/* Read-buffer accessor: stores the object's character buffer (self->str)
   in *ptr and returns PyUnicode_GET_DATA_SIZE(self).  The SystemError
   below reports access to a non-existent segment; the condition guarding
   it is not visible in this extract. */
7425 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
7430 PyErr_SetString(PyExc_SystemError
,
7431 "accessing non-existent unicode segment");
7434 *ptr
= (void *) self
->str
;
7435 return PyUnicode_GET_DATA_SIZE(self
);
/* Write-buffer accessor: sets a TypeError stating that unicode objects
   cannot be used as a modifiable buffer. */
7439 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, Py_ssize_t index
,
7442 PyErr_SetString(PyExc_TypeError
,
7443 "cannot use unicode as modifiable buffer");
/* Segment-count accessor: reports the buffer size in bytes,
   PyUnicode_GET_DATA_SIZE(self), through *lenp.  The return statement is
   not visible in this extract. */
7448 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
7452 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
/* Character-buffer accessor: obtains an 8-bit string for self from
   _PyUnicode_AsDefaultEncodedString(), stores its character data in *ptr
   and returns its size.  The SystemError below reports access to a
   non-existent segment; the condition guarding it is not visible in
   this extract. */
7457 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
7464 PyErr_SetString(PyExc_SystemError
,
7465 "accessing non-existent unicode segment");
7468 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
7471 *ptr
= (void *) PyString_AS_STRING(str
);
7472 return PyString_GET_SIZE(str
);
7475 /* Helpers for PyUnicode_Format() */
7478 getnextarg(PyObject
*args
, Py_ssize_t arglen
, Py_ssize_t
*p_argidx
)
7480 Py_ssize_t argidx
= *p_argidx
;
7481 if (argidx
< arglen
) {
7486 return PyTuple_GetItem(args
, argidx
);
7488 PyErr_SetString(PyExc_TypeError
,
7489 "not enough arguments for format string");
/* Flag bits for a format specifier.  PyUnicode_Format() ORs them into
   `flags` as it reads the '-', '+', ' ', '#' and '0' characters. */
7493 #define F_LJUST (1<<0)
7494 #define F_SIGN (1<<1)
7495 #define F_BLANK (1<<2)
7496 #define F_ALT (1<<3)
7497 #define F_ZERO (1<<4)
7500 strtounicode(Py_UNICODE
*buffer
, const char *charbuffer
)
7502 register Py_ssize_t i
;
7503 Py_ssize_t len
= strlen(charbuffer
);
7504 for (i
= len
- 1; i
>= 0; i
--)
7505 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
7511 doubletounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, double x
)
7515 PyOS_ascii_formatd((char *)buffer
, len
, format
, x
);
7516 result
= strtounicode(buffer
, (char *)buffer
);
7517 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
7521 longtounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, long x
)
7525 PyOS_snprintf((char *)buffer
, len
, format
, x
);
7526 result
= strtounicode(buffer
, (char *)buffer
);
7527 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
7530 /* XXX To save some code duplication, formatfloat/long/int could have been
7531 shared with stringobject.c, converting from 8-bit to Unicode after the
7532 formatting is done. */
7535 formatfloat(Py_UNICODE
*buf
,
7542 /* fmt = '%#.' + `prec` + `type`
7543 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
7547 x
= PyFloat_AsDouble(v
);
7548 if (x
== -1.0 && PyErr_Occurred())
7552 if (type
== 'f' && (fabs(x
) / 1e25
) >= 1e25
)
7554 /* Worst case length calc to ensure no buffer overrun:
7558 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
7559 for any double rep.)
7560 len = 1 + prec + 1 + 2 + 5 = 9 + prec
7563 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
7564 len = 1 + 50 + 1 + prec = 52 + prec
7566 If prec=0 the effective precision is 1 (the leading digit is
7567 always given), therefore increase the length by one.
7570 if (((type
== 'g' || type
== 'G') &&
7571 buflen
<= (size_t)10 + (size_t)prec
) ||
7572 (type
== 'f' && buflen
<= (size_t)53 + (size_t)prec
)) {
7573 PyErr_SetString(PyExc_OverflowError
,
7574 "formatted float is too long (precision too large?)");
7577 PyOS_snprintf(fmt
, sizeof(fmt
), "%%%s.%d%c",
7578 (flags
&F_ALT
) ? "#" : "",
7580 return doubletounicode(buf
, buflen
, fmt
, x
);
7584 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
7588 PyObject
*str
; /* temporary string object. */
7589 PyUnicodeObject
*result
;
7591 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
7594 result
= _PyUnicode_New(len
);
7599 for (i
= 0; i
< len
; i
++)
7600 result
->str
[i
] = buf
[i
];
7601 result
->str
[len
] = 0;
7603 return (PyObject
*)result
;
7607 formatint(Py_UNICODE
*buf
,
7614 /* fmt = '%#.' + `prec` + 'l' + `type`
7615 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
7619 char fmt
[64]; /* plenty big enough! */
7623 x
= PyInt_AsLong(v
);
7624 if (x
== -1 && PyErr_Occurred())
7626 if (x
< 0 && type
== 'u') {
7629 if (x
< 0 && (type
== 'x' || type
== 'X' || type
== 'o'))
7636 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
7637 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
7639 if (buflen
<= 14 || buflen
<= (size_t)3 + (size_t)prec
) {
7640 PyErr_SetString(PyExc_OverflowError
,
7641 "formatted integer is too long (precision too large?)");
7645 if ((flags
& F_ALT
) &&
7646 (type
== 'x' || type
== 'X')) {
7647 /* When converting under %#x or %#X, there are a number
7648 * of issues that cause pain:
7649 * - when 0 is being converted, the C standard leaves off
7650 * the '0x' or '0X', which is inconsistent with other
7651 * %#x/%#X conversions and inconsistent with Python's
7653 * - there are platforms that violate the standard and
7654 * convert 0 with the '0x' or '0X'
7655 * (Metrowerks, Compaq Tru64)
7656 * - there are platforms that give '0x' when converting
7657 * under %#X, but convert 0 in accordance with the
7658 * standard (OS/2 EMX)
7660 * We can achieve the desired consistency by inserting our
7661 * own '0x' or '0X' prefix, and substituting %x/%X in place
7664 * Note that this is the same approach as used in
7665 * formatint() in stringobject.c
7667 PyOS_snprintf(fmt
, sizeof(fmt
), "%s0%c%%.%dl%c",
7668 sign
, type
, prec
, type
);
7671 PyOS_snprintf(fmt
, sizeof(fmt
), "%s%%%s.%dl%c",
7672 sign
, (flags
&F_ALT
) ? "#" : "",
7676 return longtounicode(buf
, buflen
, fmt
, -x
);
7678 return longtounicode(buf
, buflen
, fmt
, x
);
7682 formatchar(Py_UNICODE
*buf
,
7686 /* presume that the buffer is at least 2 characters long */
7687 if (PyUnicode_Check(v
)) {
7688 if (PyUnicode_GET_SIZE(v
) != 1)
7690 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
7693 else if (PyString_Check(v
)) {
7694 if (PyString_GET_SIZE(v
) != 1)
7696 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
7700 /* Integer input truncated to a character */
7702 x
= PyInt_AsLong(v
);
7703 if (x
== -1 && PyErr_Occurred())
7705 #ifdef Py_UNICODE_WIDE
7706 if (x
< 0 || x
> 0x10ffff) {
7707 PyErr_SetString(PyExc_OverflowError
,
7708 "%c arg not in range(0x110000) "
7709 "(wide Python build)");
7713 if (x
< 0 || x
> 0xffff) {
7714 PyErr_SetString(PyExc_OverflowError
,
7715 "%c arg not in range(0x10000) "
7716 "(narrow Python build)");
7720 buf
[0] = (Py_UNICODE
) x
;
7726 PyErr_SetString(PyExc_TypeError
,
7727 "%c requires int or char");
7731 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
7733 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
7734 chars are formatted. XXX This is a magic number. Each formatting
7735 routine does bounds checking to ensure no overflow, but a better
7736 solution may be to malloc a buffer of appropriate size for each
7737 format. For now, the current solution is sufficient.
7739 #define FORMATBUFLEN (size_t)120
/* PyUnicode_Format: build the unicode result of `format % args`.
 *
 * format is coerced with PyUnicode_FromObject().  args is either a tuple of
 * positional values or an object that is treated as a mapping for %(key)
 * specifiers.  The output is written into `result`, a unicode object that
 * starts with room for the format length plus 100 characters, is grown on
 * demand, and is trimmed to the written length before being returned.
 *
 * Bookkeeping used throughout: `res` is the write cursor, `reslen` the
 * current capacity and `rescnt` the remaining room, so reslen - rescnt is
 * the number of characters written so far.
 *
 * NOTE(review): this listing omits many source lines of the function
 * (braces, gotos, returns and some conditions); comments below describe
 * only what the visible lines establish.
 */
7741 PyObject
*PyUnicode_Format(PyObject
*format
,
7744 Py_UNICODE
*fmt
, *res
;
7745 Py_ssize_t fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
7747 PyUnicodeObject
*result
= NULL
;
7748 PyObject
*dict
= NULL
;
/* Both inputs are required; a NULL is reported as an internal-call error. */
7751 if (format
== NULL
|| args
== NULL
) {
7752 PyErr_BadInternalCall();
/* Coerce the format to unicode and set up the read cursor and counter. */
7755 uformat
= PyUnicode_FromObject(format
);
7756 if (uformat
== NULL
)
7758 fmt
= PyUnicode_AS_UNICODE(uformat
);
7759 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
/* Initial output capacity: format length plus 100 spare characters. */
7761 reslen
= rescnt
= fmtcnt
+ 100;
7762 result
= _PyUnicode_New(reslen
);
7765 res
= PyUnicode_AS_UNICODE(result
);
/* A tuple supplies the positional arguments; its size bounds argidx. */
7767 if (PyTuple_Check(args
)) {
7768 arglen
= PyTuple_Size(args
);
/* Mapping detection: the type has mapping slots and is neither a tuple nor
 * a basestring.  Presumably `dict` is set to args here - the assignment is
 * not shown in this listing. */
7775 if (Py_Type(args
)->tp_as_mapping
&& !PyTuple_Check(args
) &&
7776 !PyObject_TypeCheck(args
, &PyBaseString_Type
))
/* Main loop: one iteration per remaining character of the format. */
7779 while (--fmtcnt
>= 0) {
/* Growth path: reset the remaining-room counter, resize the result and
 * re-derive the write cursor from the (possibly moved) buffer. */
7782 rescnt
= fmtcnt
+ 100;
7784 if (_PyUnicode_Resize(&result
, reslen
) < 0)
7786 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
7792 /* Got a format specifier */
7794 Py_ssize_t width
= -1;
7796 Py_UNICODE c
= '\0';
7799 PyObject
*temp
= NULL
;
7803 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
7807 Py_UNICODE
*keystart
;
/* %(key) form: error raised when the specifier needs a mapping; presumably
 * guarded by a test on `dict` that is not shown in this listing. */
7813 PyErr_SetString(PyExc_TypeError
,
7814 "format requires a mapping");
7820 /* Skip over balanced parentheses */
7821 while (pcount
> 0 && --fmtcnt
>= 0) {
7824 else if (*fmt
== '(')
/* Key length excludes the closing parenthesis that fmt has just passed. */
7828 keylen
= fmt
- keystart
- 1;
7829 if (fmtcnt
< 0 || pcount
> 0) {
7830 PyErr_SetString(PyExc_ValueError
,
7831 "incomplete format key");
/* Two ways of building the key object are visible below (UTF-8 encoded and
 * plain unicode); presumably they are alternatives under a preprocessor
 * switch that is not shown in this listing. */
7835 /* keys are converted to strings using UTF-8 and
7836 then looked up since Python uses strings to hold
7837 variables names etc. in its namespaces and we
7838 wouldn't want to break common idioms. */
7839 key
= PyUnicode_EncodeUTF8(keystart
,
7843 key
= PyUnicode_FromUnicode(keystart
, keylen
);
/* The looked-up value replaces args for this specifier. */
7851 args
= PyObject_GetItem(dict
, key
);
/* Flag characters: each one sets a bit in `flags` and continues scanning. */
7860 while (--fmtcnt
>= 0) {
7861 switch (c
= *fmt
++) {
7862 case '-': flags
|= F_LJUST
; continue;
7863 case '+': flags
|= F_SIGN
; continue;
7864 case ' ': flags
|= F_BLANK
; continue;
7865 case '#': flags
|= F_ALT
; continue;
7866 case '0': flags
|= F_ZERO
; continue;
/* Width taken from the next argument, which must be an int (the guarding
 * test is not shown; presumably this is the '*' form). */
7871 v
= getnextarg(args
, arglen
, &argidx
);
7874 if (!PyInt_Check(v
)) {
7875 PyErr_SetString(PyExc_TypeError
,
7879 width
= PyInt_AsLong(v
);
/* Width written as decimal digits in the format itself.
 * NOTE(review): the overflow test below multiplies first and divides back;
 * signed overflow is undefined behaviour in C, so consider comparing
 * against a limit before multiplying. */
7887 else if (c
>= '0' && c
<= '9') {
7889 while (--fmtcnt
>= 0) {
7891 if (c
< '0' || c
> '9')
7893 if ((width
*10) / 10 != width
) {
7894 PyErr_SetString(PyExc_ValueError
,
7898 width
= width
*10 + (c
- '0');
/* Precision taken from the next argument, which must be an int (guarding
 * test not shown; presumably the '*' form after a '.'). */
7906 v
= getnextarg(args
, arglen
, &argidx
);
7909 if (!PyInt_Check(v
)) {
7910 PyErr_SetString(PyExc_TypeError
,
7914 prec
= PyInt_AsLong(v
);
/* Precision written as decimal digits; same multiply-then-divide overflow
 * test as for width (see the review note there). */
7920 else if (c
>= '0' && c
<= '9') {
7922 while (--fmtcnt
>= 0) {
7923 c
= Py_CHARMASK(*fmt
++);
7924 if (c
< '0' || c
> '9')
7926 if ((prec
*10) / 10 != prec
) {
7927 PyErr_SetString(PyExc_ValueError
,
7931 prec
= prec
*10 + (c
- '0');
/* Length modifiers h, l and L are recognised here. */
7936 if (c
== 'h' || c
== 'l' || c
== 'L') {
7942 PyErr_SetString(PyExc_ValueError
,
7943 "incomplete format");
/* Fetch the value that this specifier formats. */
7947 v
= getnextarg(args
, arglen
, &argidx
);
7957 /* presume that buffer length is at least 1 */
/* String-like conversions: a unicode argument with 's' has its own branch;
 * otherwise the text comes from PyObject_Unicode() or PyObject_Repr()
 * (the test choosing between the two is not shown in this listing). */
7964 if (PyUnicode_Check(v
) && c
== 's') {
7971 temp
= PyObject_Unicode(v
);
7973 temp
= PyObject_Repr(v
);
/* Normalise the conversion result: unicode is used as is, str is decoded,
 * anything else is a TypeError. */
7976 if (PyUnicode_Check(temp
))
7977 /* nothing to do */;
7978 else if (PyString_Check(temp
)) {
7979 /* convert the str result to Unicode */
7980 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
7981 PyString_GET_SIZE(temp
),
7991 PyErr_SetString(PyExc_TypeError
,
7992 "%s argument has non-string str()");
7996 pbuf
= PyUnicode_AS_UNICODE(temp
);
7997 len
= PyUnicode_GET_SIZE(temp
);
/* A non-negative precision shorter than the text applies here; presumably
 * len is clamped to prec (the assignment is not shown). */
7998 if (prec
>= 0 && len
> prec
)
/* Integer conversions: long objects go through formatlong(), which yields
 * a unicode object whose buffer and size are used directly. */
8010 if (PyLong_Check(v
)) {
8011 temp
= formatlong(v
, flags
, prec
, c
);
8014 pbuf
= PyUnicode_AS_UNICODE(temp
);
8015 len
= PyUnicode_GET_SIZE(temp
);
/* The helpers below receive the capacity of formatbuf counted in
 * Py_UNICODE elements, not in bytes. */
8020 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
8039 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
8050 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
/* Unknown conversion character: it is echoed only if it falls in the
 * range tested below, otherwise '?' is shown; the position is computed
 * from the start of uformat.
 * NOTE(review): the lower bound 31 admits U+001F, a control character;
 * printable ASCII starts at 32 - confirm whether 32 was intended. */
8056 PyErr_Format(PyExc_ValueError
,
8057 "unsupported format character '%c' (0x%x) "
8059 (31<=c
&& c
<=126) ? (char)c
: '?',
8061 (Py_ssize_t
)(fmt
- 1 -
8062 PyUnicode_AS_UNICODE(uformat
)));
/* Sign selection: a leading '-' or '+' already present in the formatted
 * text, otherwise driven by the F_SIGN and F_BLANK flags. */
8066 if (*pbuf
== '-' || *pbuf
== '+') {
8070 else if (flags
& F_SIGN
)
8072 else if (flags
& F_BLANK
)
/* Make sure the output has room for `width` characters, allowing one for
 * the sign when there is one; otherwise grow and re-derive the cursor. */
8079 if (rescnt
- (sign
!= 0) < width
) {
8081 rescnt
= width
+ fmtcnt
+ 100;
8088 if (_PyUnicode_Resize(&result
, reslen
) < 0) {
8092 res
= PyUnicode_AS_UNICODE(result
)
/* Alternate-form hex: the formatted text is expected to start with "0x"
 * or "0X" matching the conversion character. */
8102 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8103 assert(pbuf
[0] == '0');
8104 assert(pbuf
[1] == c
);
/* Padding before the text when the field is wider than the text and
 * left-justification was not requested (loop body not shown). */
8115 if (width
> len
&& !(flags
& F_LJUST
)) {
8119 } while (--width
> len
);
8124 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8125 assert(pbuf
[0] == '0');
8126 assert(pbuf
[1] == c
);
/* Copy the formatted text into the result. */
8131 Py_UNICODE_COPY(res
, pbuf
, len
);
/* Padding after the text while the field is still wider than the text. */
8134 while (--width
>= len
) {
/* Mapping mode: leftover positional arguments are an error unless the
 * conversion was a literal '%'. */
8138 if (dict
&& (argidx
< arglen
) && c
!= '%') {
8139 PyErr_SetString(PyExc_TypeError
,
8140 "not all arguments converted during string formatting");
/* Tuple mode: every positional argument must have been consumed. */
8147 if (argidx
< arglen
&& !dict
) {
8148 PyErr_SetString(PyExc_TypeError
,
8149 "not all arguments converted during string formatting");
/* Trim the result to the number of characters actually written. */
8153 if (_PyUnicode_Resize(&result
, reslen
- rescnt
) < 0)
8159 return (PyObject
*)result
;
/* Buffer-procedure table for unicode objects; PyUnicode_Type points at it
 * through its tp_as_buffer slot.  The four entries are, in order, the
 * read-buffer, write-buffer, segment-count and char-buffer callbacks. */
8170 static PyBufferProcs unicode_as_buffer
= {
8171 (readbufferproc
) unicode_buffer_getreadbuf
,
8172 (writebufferproc
) unicode_buffer_getwritebuf
,
8173 (segcountproc
) unicode_buffer_getsegcount
,
8174 (charbufferproc
) unicode_buffer_getcharbuf
,
/* Forward declaration: unicode_new calls unicode_subtype_new before the
 * latter is defined. */
8178 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
/* unicode_new: the tp_new slot of PyUnicode_Type.
 *
 * Accepts up to three optional arguments, by position or by the keywords
 * "string", "encoding" and "errors".  Requests for any type other than
 * PyUnicode_Type itself are delegated to unicode_subtype_new().
 *
 * Returns a new reference, or the failure result of whichever conversion
 * call was used.  NOTE(review): the error return after argument parsing and
 * the test guarding the empty-string return are not shown in this listing.
 */
8181 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8184 static char *kwlist
[] = {"string", "encoding", "errors", 0};
8185 char *encoding
= NULL
;
8186 char *errors
= NULL
;
/* Subtypes take the separate allocation path. */
8188 if (type
!= &PyUnicode_Type
)
8189 return unicode_subtype_new(type
, args
, kwds
);
/* "|Oss": one optional object followed by two optional C strings. */
8190 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
8191 kwlist
, &x
, &encoding
, &errors
))
/* Empty unicode object; presumably returned when no `string` argument was
 * given (the condition is not shown). */
8194 return (PyObject
*)_PyUnicode_New(0);
/* Without encoding and errors the object is converted generically;
 * otherwise it is decoded with the requested parameters. */
8195 if (encoding
== NULL
&& errors
== NULL
)
8196 return PyObject_Unicode(x
);
8198 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
/* unicode_subtype_new: create an instance of a subtype of unicode.
 *
 * A plain unicode object is first built from args and kwds by calling
 * unicode_new() with PyUnicode_Type; an instance of `type` is then
 * allocated through type->tp_alloc and given its own copy of the character
 * data and the cached hash.
 *
 * Returns the new instance, or the result of PyErr_NoMemory() when the
 * character buffer cannot be allocated.  NOTE(review): the NULL checks on
 * tmp and pnew and the release of tmp are not shown in this listing.
 */
8202 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8204 PyUnicodeObject
*tmp
, *pnew
;
8207 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
/* Build the value as an exact unicode object first. */
8208 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
8211 assert(PyUnicode_Check(tmp
));
/* n is assigned inside the call: it is the length of the temporary. */
8212 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
/* One element more than the length is allocated and copied below;
 * presumably this is the trailing terminator. */
8217 pnew
->str
= PyMem_NEW(Py_UNICODE
, n
+1);
8218 if (pnew
->str
== NULL
) {
8219 _Py_ForgetReference((PyObject
*)pnew
);
8222 return PyErr_NoMemory();
8224 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
/* Carry over the hash value cached on the temporary. */
8226 pnew
->hash
= tmp
->hash
;
8228 return (PyObject
*)pnew
;
/* Docstring for the unicode type; installed as tp_doc of PyUnicode_Type. */
8231 PyDoc_STRVAR(unicode_doc
,
8232 "unicode(string [, encoding[, errors]]) -> object\n\
8234 Create a new Unicode object from the given encoded string.\n\
8235 encoding defaults to the current default string encoding.\n\
8236 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
/* Type object for the built-in `unicode` type.  Visible here: it derives
 * from PyBaseString_Type (tp_base), allows subclassing
 * (Py_TPFLAGS_BASETYPE), creates instances through unicode_new and frees
 * them with PyObject_Del.
 * NOTE(review): several slot lines are absent from this listing, so the
 * initializer as shown is not positionally complete. */
8238 PyTypeObject PyUnicode_Type
= {
8239 PyVarObject_HEAD_INIT(&PyType_Type
, 0)
8240 "unicode", /* tp_name */
8241 sizeof(PyUnicodeObject
), /* tp_basicsize */
8242 0, /* tp_itemsize */
8244 (destructor
)unicode_dealloc
, /* tp_dealloc */
8249 unicode_repr
, /* tp_repr */
8250 &unicode_as_number
, /* tp_as_number */
8251 &unicode_as_sequence
, /* tp_as_sequence */
8252 &unicode_as_mapping
, /* tp_as_mapping */
8253 (hashfunc
) unicode_hash
, /* tp_hash */
8255 (reprfunc
) unicode_str
, /* tp_str */
8256 PyObject_GenericGetAttr
, /* tp_getattro */
8257 0, /* tp_setattro */
8258 &unicode_as_buffer
, /* tp_as_buffer */
8259 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_CHECKTYPES
|
8260 Py_TPFLAGS_BASETYPE
| Py_TPFLAGS_UNICODE_SUBCLASS
, /* tp_flags */
8261 unicode_doc
, /* tp_doc */
8262 0, /* tp_traverse */
8264 PyUnicode_RichCompare
, /* tp_richcompare */
8265 0, /* tp_weaklistoffset */
8267 0, /* tp_iternext */
8268 unicode_methods
, /* tp_methods */
8271 &PyBaseString_Type
, /* tp_base */
8273 0, /* tp_descr_get */
8274 0, /* tp_descr_set */
8275 0, /* tp_dictoffset */
8278 unicode_new
, /* tp_new */
8279 PyObject_Del
, /* tp_free */
8282 /* Initialize the Unicode implementation */
/* _PyUnicode_Init: reset the module-level state of the unicode
 * implementation - the free list, the shared empty string, the default
 * encoding name, the unicode_latin1 cache and the line-break bloom mask -
 * and ready the type objects.  Takes no arguments and returns nothing; a
 * failure to ready PyUnicode_Type is fatal. */
8284 void _PyUnicode_Init(void)
8288 /* XXX - move this array to unicodectype.c ? */
/* Code points fed to make_bloom_mask() below. */
8289 Py_UNICODE linebreak
[] = {
8290 0x000A, /* LINE FEED */
8291 0x000D, /* CARRIAGE RETURN */
8292 0x001C, /* FILE SEPARATOR */
8293 0x001D, /* GROUP SEPARATOR */
8294 0x001E, /* RECORD SEPARATOR */
8295 0x0085, /* NEXT LINE */
8296 0x2028, /* LINE SEPARATOR */
8297 0x2029, /* PARAGRAPH SEPARATOR */
8300 /* Init the implementation */
8301 unicode_freelist
= NULL
;
8302 unicode_freelist_size
= 0;
/* Shared zero-length unicode object.  NOTE(review): a check of this
 * allocation is not visible in this listing. */
8303 unicode_empty
= _PyUnicode_New(0);
8307 strcpy(unicode_default_encoding
, "ascii");
/* Start with every one of the 256 unicode_latin1 slots empty. */
8308 for (i
= 0; i
< 256; i
++)
8309 unicode_latin1
[i
] = NULL
;
8310 if (PyType_Ready(&PyUnicode_Type
) < 0)
8311 Py_FatalError("Can't initialize 'unicode'");
8313 /* initialize the linebreak bloom filter */
/* The second argument is the element count of the table above. */
8314 bloom_linebreak
= make_bloom_mask(
8315 linebreak
, sizeof(linebreak
) / sizeof(linebreak
[0])
/* NOTE(review): unlike the PyType_Ready() call for PyUnicode_Type above,
 * this result is not checked - confirm that a failure here is harmless. */
8318 PyType_Ready(&EncodingMapType
);
8321 /* Finalize the Unicode implementation */
/* _PyUnicode_Fini: drop the references held by the module-level state -
 * the shared empty string and the unicode_latin1 cache - then walk the
 * free list and reset it.  Takes no arguments. */
8324 _PyUnicode_Fini(void)
8329 Py_XDECREF(unicode_empty
);
8330 unicode_empty
= NULL
;
/* Release every populated unicode_latin1 slot and clear it. */
8332 for (i
= 0; i
< 256; i
++) {
8333 if (unicode_latin1
[i
]) {
8334 Py_DECREF(unicode_latin1
[i
]);
8335 unicode_latin1
[i
] = NULL
;
/* Free-list walk: the successor is read from the first pointer-sized word
 * of the current entry (the cast below) before the entry is processed.
 * The entry's defenc reference is dropped; the release of the entry
 * itself is not shown in this listing. */
8339 for (u
= unicode_freelist
; u
!= NULL
;) {
8340 PyUnicodeObject
*v
= u
;
8341 u
= *(PyUnicodeObject
**)u
;
8344 Py_XDECREF(v
->defenc
);
8347 unicode_freelist
= NULL
;
8348 unicode_freelist_size
= 0;
8359 indent-tabs-mode: nil