3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
9 --------------------------------------------------------------------
10 The original string type implementation is:
12 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
15 By obtaining, using, and/or copying this software and/or its
16 associated documentation, you agree that you have read, understood,
17 and will comply with the following terms and conditions:
19 Permission to use, copy, modify, and distribute this software and its
20 associated documentation for any purpose and without fee is hereby
21 granted, provided that the above copyright notice appears in all
22 copies, and that both that copyright notice and this permission notice
23 appear in supporting documentation, and that the name of Secret Labs
24 AB or the author not be used in advertising or publicity pertaining to
25 distribution of the software without specific, written prior
28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 --------------------------------------------------------------------
41 #include "unicodeobject.h"
48 /* Limit for the Unicode object free list */
50 #define MAX_UNICODE_FREELIST_SIZE 1024
52 /* Limit for the Unicode object free list stay alive optimization.
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
58 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
60 malloc()-overhead) bytes of unused garbage.
62 Setting the limit to 0 effectively turns the feature off.
64 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
69 #define KEEPALIVE_SIZE_LIMIT 9
71 /* Endianness switches; defaults to little endian */
73 #ifdef WORDS_BIGENDIAN
74 # define BYTEORDER_IS_BIG_ENDIAN
76 # define BYTEORDER_IS_LITTLE_ENDIAN
79 /* --- Globals ------------------------------------------------------------
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
86 /* Free list for Unicode objects */
87 static PyUnicodeObject
*unicode_freelist
;
88 static int unicode_freelist_size
;
90 /* The empty Unicode object is shared to improve performance. */
91 static PyUnicodeObject
*unicode_empty
;
93 /* Single character Unicode strings in the Latin-1 range are being
95 static PyUnicodeObject
*unicode_latin1
[256];
97 /* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
104 static char unicode_default_encoding
[100];
107 PyUnicode_GetMax(void)
109 #ifdef Py_UNICODE_WIDE
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
118 /* --- Unicode Object ----------------------------------------------------- */
121 int unicode_resize(register PyUnicodeObject
*unicode
,
126 /* Shortcut if there's nothing much to do. */
127 if (unicode
->length
== length
)
130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
133 if (unicode
== unicode_empty
||
134 (unicode
->length
== 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
140 (unsigned int)unicode
->str
[0] < 256U &&
141 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
142 PyErr_SetString(PyExc_SystemError
,
143 "can't resize shared unicode objects");
147 /* We allocate one more Py_UNICODE element to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr
= unicode
->str
;
150 PyMem_RESIZE(unicode
->str
, Py_UNICODE
, length
+ 1);
152 unicode
->str
= oldstr
;
156 unicode
->str
[length
] = 0;
157 unicode
->length
= length
;
160 /* Reset the object caches */
161 if (unicode
->defenc
) {
162 Py_DECREF(unicode
->defenc
);
163 unicode
->defenc
= NULL
;
170 /* We allocate one more Py_UNICODE element to make sure the string is
171 Ux0000 terminated -- XXX is this needed ?
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
179 PyUnicodeObject
*_PyUnicode_New(int length
)
181 register PyUnicodeObject
*unicode
;
183 /* Optimization for empty strings */
184 if (length
== 0 && unicode_empty
!= NULL
) {
185 Py_INCREF(unicode_empty
);
186 return unicode_empty
;
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist
) {
191 unicode
= unicode_freelist
;
192 unicode_freelist
= *(PyUnicodeObject
**)unicode
;
193 unicode_freelist_size
--;
195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
197 if ((unicode
->length
< length
) &&
198 unicode_resize(unicode
, length
) < 0) {
199 PyMem_DEL(unicode
->str
);
204 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
206 PyObject_INIT(unicode
, &PyUnicode_Type
);
209 unicode
= PyObject_New(PyUnicodeObject
, &PyUnicode_Type
);
212 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
219 /* Initialize the first element to guard against cases where
220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
227 unicode
->str
[length
] = 0;
228 unicode
->length
= length
;
230 unicode
->defenc
= NULL
;
234 _Py_ForgetReference((PyObject
*)unicode
);
235 PyObject_Del(unicode
);
240 void unicode_dealloc(register PyUnicodeObject
*unicode
)
242 if (PyUnicode_CheckExact(unicode
) &&
243 unicode_freelist_size
< MAX_UNICODE_FREELIST_SIZE
) {
244 /* Keep-Alive optimization */
245 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
246 PyMem_DEL(unicode
->str
);
250 if (unicode
->defenc
) {
251 Py_DECREF(unicode
->defenc
);
252 unicode
->defenc
= NULL
;
254 /* Add to free list */
255 *(PyUnicodeObject
**)unicode
= unicode_freelist
;
256 unicode_freelist
= unicode
;
257 unicode_freelist_size
++;
260 PyMem_DEL(unicode
->str
);
261 Py_XDECREF(unicode
->defenc
);
262 unicode
->ob_type
->tp_free((PyObject
*)unicode
);
266 int PyUnicode_Resize(PyObject
**unicode
, int length
)
268 register PyUnicodeObject
*v
;
270 /* Argument checks */
271 if (unicode
== NULL
) {
272 PyErr_BadInternalCall();
275 v
= (PyUnicodeObject
*)*unicode
;
276 if (v
== NULL
|| !PyUnicode_Check(v
) || v
->ob_refcnt
!= 1 || length
< 0) {
277 PyErr_BadInternalCall();
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
284 if (v
->length
!= length
&&
285 (v
== unicode_empty
|| v
->length
== 1)) {
286 PyUnicodeObject
*w
= _PyUnicode_New(length
);
289 Py_UNICODE_COPY(w
->str
, v
->str
,
290 length
< v
->length
? length
: v
->length
);
292 *unicode
= (PyObject
*)w
;
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v
, length
);
301 /* Internal API for use in unicodeobject.c only ! */
302 #define _PyUnicode_Resize(unicodevar, length) \
303 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
305 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
308 PyUnicodeObject
*unicode
;
310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
314 /* Optimization for empty strings */
315 if (size
== 0 && unicode_empty
!= NULL
) {
316 Py_INCREF(unicode_empty
);
317 return (PyObject
*)unicode_empty
;
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size
== 1 && *u
< 256) {
323 unicode
= unicode_latin1
[*u
];
325 unicode
= _PyUnicode_New(1);
328 unicode
->str
[0] = *u
;
329 unicode_latin1
[*u
] = unicode
;
332 return (PyObject
*)unicode
;
336 unicode
= _PyUnicode_New(size
);
340 /* Copy the Unicode data into the new object */
342 Py_UNICODE_COPY(unicode
->str
, u
, size
);
344 return (PyObject
*)unicode
;
349 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
352 PyUnicodeObject
*unicode
;
355 PyErr_BadInternalCall();
359 unicode
= _PyUnicode_New(size
);
363 /* Copy the wchar_t data into the new object */
364 #ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
368 register Py_UNICODE
*u
;
370 u
= PyUnicode_AS_UNICODE(unicode
);
371 for (i
= size
; i
> 0; i
--)
376 return (PyObject
*)unicode
;
379 int PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
383 if (unicode
== NULL
) {
384 PyErr_BadInternalCall();
388 /* If possible, try to copy the 0-termination as well */
389 if (size
> PyUnicode_GET_SIZE(unicode
))
390 size
= PyUnicode_GET_SIZE(unicode
) + 1;
392 #ifdef HAVE_USABLE_WCHAR_T
393 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
396 register Py_UNICODE
*u
;
398 u
= PyUnicode_AS_UNICODE(unicode
);
399 for (i
= size
; i
> 0; i
--)
404 if (size
> PyUnicode_GET_SIZE(unicode
))
405 return PyUnicode_GET_SIZE(unicode
);
412 PyObject
*PyUnicode_FromOrdinal(int ordinal
)
416 #ifdef Py_UNICODE_WIDE
417 if (ordinal
< 0 || ordinal
> 0x10ffff) {
418 PyErr_SetString(PyExc_ValueError
,
419 "unichr() arg not in range(0x110000) "
420 "(wide Python build)");
424 if (ordinal
< 0 || ordinal
> 0xffff) {
425 PyErr_SetString(PyExc_ValueError
,
426 "unichr() arg not in range(0x10000) "
427 "(narrow Python build)");
432 s
[0] = (Py_UNICODE
)ordinal
;
433 return PyUnicode_FromUnicode(s
, 1);
436 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
438 /* XXX Perhaps we should make this API an alias of
439 PyObject_Unicode() instead ?! */
440 if (PyUnicode_CheckExact(obj
)) {
444 if (PyUnicode_Check(obj
)) {
445 /* For a Unicode subtype that's not a Unicode object,
446 return a true Unicode object with the same data. */
447 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
448 PyUnicode_GET_SIZE(obj
));
450 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
453 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
454 const char *encoding
,
457 const char *s
= NULL
;
462 PyErr_BadInternalCall();
467 /* For b/w compatibility we also accept Unicode objects provided
468 that no encoding is given and then redirect to
469 PyObject_Unicode() which then applies the additional logic for
472 NOTE: This API should really only be used for objects which
473 represent *encoded* Unicode !
476 if (PyUnicode_Check(obj
)) {
478 PyErr_SetString(PyExc_TypeError
,
479 "decoding Unicode is not supported");
482 return PyObject_Unicode(obj
);
485 if (PyUnicode_Check(obj
)) {
486 PyErr_SetString(PyExc_TypeError
,
487 "decoding Unicode is not supported");
493 if (PyString_Check(obj
)) {
494 s
= PyString_AS_STRING(obj
);
495 len
= PyString_GET_SIZE(obj
);
497 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
498 /* Overwrite the error message with something more useful in
499 case of a TypeError. */
500 if (PyErr_ExceptionMatches(PyExc_TypeError
))
501 PyErr_Format(PyExc_TypeError
,
502 "coercing to Unicode: need string or buffer, "
504 obj
->ob_type
->tp_name
);
508 /* Convert to Unicode */
510 Py_INCREF(unicode_empty
);
511 v
= (PyObject
*)unicode_empty
;
514 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
522 PyObject
*PyUnicode_Decode(const char *s
,
524 const char *encoding
,
527 PyObject
*buffer
= NULL
, *unicode
;
529 if (encoding
== NULL
)
530 encoding
= PyUnicode_GetDefaultEncoding();
532 /* Shortcuts for common default encodings */
533 if (strcmp(encoding
, "utf-8") == 0)
534 return PyUnicode_DecodeUTF8(s
, size
, errors
);
535 else if (strcmp(encoding
, "latin-1") == 0)
536 return PyUnicode_DecodeLatin1(s
, size
, errors
);
537 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
538 else if (strcmp(encoding
, "mbcs") == 0)
539 return PyUnicode_DecodeMBCS(s
, size
, errors
);
541 else if (strcmp(encoding
, "ascii") == 0)
542 return PyUnicode_DecodeASCII(s
, size
, errors
);
544 /* Decode via the codec registry */
545 buffer
= PyBuffer_FromMemory((void *)s
, size
);
548 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
551 if (!PyUnicode_Check(unicode
)) {
552 PyErr_Format(PyExc_TypeError
,
553 "decoder did not return an unicode object (type=%.400s)",
554 unicode
->ob_type
->tp_name
);
566 PyObject
*PyUnicode_AsDecodedObject(PyObject
*unicode
,
567 const char *encoding
,
572 if (!PyUnicode_Check(unicode
)) {
577 if (encoding
== NULL
)
578 encoding
= PyUnicode_GetDefaultEncoding();
580 /* Decode via the codec registry */
581 v
= PyCodec_Decode(unicode
, encoding
, errors
);
590 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
592 const char *encoding
,
595 PyObject
*v
, *unicode
;
597 unicode
= PyUnicode_FromUnicode(s
, size
);
600 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
605 PyObject
*PyUnicode_AsEncodedObject(PyObject
*unicode
,
606 const char *encoding
,
611 if (!PyUnicode_Check(unicode
)) {
616 if (encoding
== NULL
)
617 encoding
= PyUnicode_GetDefaultEncoding();
619 /* Encode via the codec registry */
620 v
= PyCodec_Encode(unicode
, encoding
, errors
);
629 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
630 const char *encoding
,
635 if (!PyUnicode_Check(unicode
)) {
640 if (encoding
== NULL
)
641 encoding
= PyUnicode_GetDefaultEncoding();
643 /* Shortcuts for common default encodings */
644 if (errors
== NULL
) {
645 if (strcmp(encoding
, "utf-8") == 0)
646 return PyUnicode_AsUTF8String(unicode
);
647 else if (strcmp(encoding
, "latin-1") == 0)
648 return PyUnicode_AsLatin1String(unicode
);
649 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
650 else if (strcmp(encoding
, "mbcs") == 0)
651 return PyUnicode_AsMBCSString(unicode
);
653 else if (strcmp(encoding
, "ascii") == 0)
654 return PyUnicode_AsASCIIString(unicode
);
657 /* Encode via the codec registry */
658 v
= PyCodec_Encode(unicode
, encoding
, errors
);
661 if (!PyString_Check(v
)) {
662 PyErr_Format(PyExc_TypeError
,
663 "encoder did not return a string object (type=%.400s)",
664 v
->ob_type
->tp_name
);
674 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
677 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
681 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
682 if (v
&& errors
== NULL
)
683 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
687 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
689 if (!PyUnicode_Check(unicode
)) {
693 return PyUnicode_AS_UNICODE(unicode
);
699 int PyUnicode_GetSize(PyObject
*unicode
)
701 if (!PyUnicode_Check(unicode
)) {
705 return PyUnicode_GET_SIZE(unicode
);
711 const char *PyUnicode_GetDefaultEncoding(void)
713 return unicode_default_encoding
;
716 int PyUnicode_SetDefaultEncoding(const char *encoding
)
720 /* Make sure the encoding is valid. As side effect, this also
721 loads the encoding into the codec registry cache. */
722 v
= _PyCodec_Lookup(encoding
);
726 strncpy(unicode_default_encoding
,
728 sizeof(unicode_default_encoding
));
735 /* error handling callback helper:
736 build arguments, call the callback and check the arguments,
737 if no exception occurred, copy the replacement to the output
738 and adjust various state variables.
739 return 0 on success, -1 on error
743 int unicode_decode_call_errorhandler(const char *errors
, PyObject
**errorHandler
,
744 const char *encoding
, const char *reason
,
745 const char *input
, int insize
, int *startinpos
, int *endinpos
, PyObject
**exceptionObject
, const char **inptr
,
746 PyObject
**output
, int *outpos
, Py_UNICODE
**outptr
)
748 static char *argparse
= "O!i;decoding error handler must return (unicode, int) tuple";
750 PyObject
*restuple
= NULL
;
751 PyObject
*repunicode
= NULL
;
752 int outsize
= PyUnicode_GET_SIZE(*output
);
759 if (*errorHandler
== NULL
) {
760 *errorHandler
= PyCodec_LookupError(errors
);
761 if (*errorHandler
== NULL
)
765 if (*exceptionObject
== NULL
) {
766 *exceptionObject
= PyUnicodeDecodeError_Create(
767 encoding
, input
, insize
, *startinpos
, *endinpos
, reason
);
768 if (*exceptionObject
== NULL
)
772 if (PyUnicodeDecodeError_SetStart(*exceptionObject
, *startinpos
))
774 if (PyUnicodeDecodeError_SetEnd(*exceptionObject
, *endinpos
))
776 if (PyUnicodeDecodeError_SetReason(*exceptionObject
, reason
))
780 restuple
= PyObject_CallFunctionObjArgs(*errorHandler
, *exceptionObject
, NULL
);
781 if (restuple
== NULL
)
783 if (!PyTuple_Check(restuple
)) {
784 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
787 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
, &repunicode
, &newpos
))
790 newpos
= insize
+newpos
;
791 if (newpos
<0 || newpos
>insize
) {
792 PyErr_Format(PyExc_IndexError
, "position %d from error handler out of bounds", newpos
);
796 /* need more space? (at least enough for what we
797 have+the replacement+the rest of the string (starting
798 at the new input position), so we won't have to check space
799 when there are no errors in the rest of the string) */
800 repptr
= PyUnicode_AS_UNICODE(repunicode
);
801 repsize
= PyUnicode_GET_SIZE(repunicode
);
802 requiredsize
= *outpos
+ repsize
+ insize
-newpos
;
803 if (requiredsize
> outsize
) {
804 if (requiredsize
<2*outsize
)
805 requiredsize
= 2*outsize
;
806 if (PyUnicode_Resize(output
, requiredsize
) < 0)
808 *outptr
= PyUnicode_AS_UNICODE(*output
) + *outpos
;
811 *inptr
= input
+ newpos
;
812 Py_UNICODE_COPY(*outptr
, repptr
, repsize
);
819 Py_XDECREF(restuple
);
823 /* --- UTF-7 Codec -------------------------------------------------------- */
825 /* see RFC2152 for details */
828 char utf7_special
[128] = {
829 /* indicate whether a UTF-7 character is special i.e. cannot be directly
833 2 - whitespace (optional)
834 3 - RFC2152 Set O (optional) */
835 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
836 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
837 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
838 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
839 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
840 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
841 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
842 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
846 /* Note: The comparison (c) <= 0 is a trick to work-around gcc
847 warnings about the comparison always being false; since
848 utf7_special[0] is 1, we can safely make that one comparison
851 #define SPECIAL(c, encodeO, encodeWS) \
852 ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
853 (encodeWS && (utf7_special[(c)] == 2)) || \
854 (encodeO && (utf7_special[(c)] == 3)))
857 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
859 (isalnum(c) || (c) == '+' || (c) == '/')
861 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
862 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
864 #define ENCODE(out, ch, bits) \
865 while (bits >= 6) { \
866 *out++ = B64(ch >> (bits-6)); \
870 #define DECODE(out, ch, bits, surrogate) \
871 while (bits >= 16) { \
872 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
875 /* We have already generated an error for the high surrogate \
876 so let's not bother seeing if the low surrogate is correct or not */ \
878 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
879 /* This is a surrogate pair. Unfortunately we can't represent \
880 it in a 16-bit character */ \
882 errmsg = "code pairs are not supported"; \
889 PyObject
*PyUnicode_DecodeUTF7(const char *s
,
893 const char *starts
= s
;
898 PyUnicodeObject
*unicode
;
900 const char *errmsg
= "";
902 unsigned int bitsleft
= 0;
903 unsigned long charsleft
= 0;
905 PyObject
*errorHandler
= NULL
;
906 PyObject
*exc
= NULL
;
908 unicode
= _PyUnicode_New(size
);
912 return (PyObject
*)unicode
;
923 if ((ch
== '-') || !B64CHAR(ch
)) {
927 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
929 /* The shift sequence has a partial character in it. If
930 bitsleft < 6 then we could just classify it as padding
931 but that is not the case here */
933 errmsg
= "partial character in shift sequence";
936 /* According to RFC2152 the remaining bits should be zero. We
937 choose to signal an error/insert a replacement character
938 here to indicate the potential of a misencoded character. */
940 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
941 if (bitsleft
&& charsleft
<< (sizeof(charsleft
) * 8 - bitsleft
)) {
942 errmsg
= "non-zero padding bits in shift sequence";
947 if ((s
< e
) && (*(s
) == '-')) {
951 } else if (SPECIAL(ch
,0,0)) {
952 errmsg
= "unexpected special character";
958 charsleft
= (charsleft
<< 6) | UB64(ch
);
961 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
964 else if ( ch
== '+' ) {
965 startinpos
= s
-starts
;
967 if (s
< e
&& *s
== '-') {
976 else if (SPECIAL(ch
,0,0)) {
977 errmsg
= "unexpected special character";
987 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
989 if (unicode_decode_call_errorhandler(
990 errors
, &errorHandler
,
992 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
993 (PyObject
**)&unicode
, &outpos
, &p
))
998 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1000 if (unicode_decode_call_errorhandler(
1001 errors
, &errorHandler
,
1002 "utf7", "unterminated shift sequence",
1003 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1004 (PyObject
**)&unicode
, &outpos
, &p
))
1010 if (_PyUnicode_Resize(&unicode
, p
- PyUnicode_AS_UNICODE(unicode
)) < 0)
1013 Py_XDECREF(errorHandler
);
1015 return (PyObject
*)unicode
;
1018 Py_XDECREF(errorHandler
);
1025 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
1028 int encodeWhiteSpace
,
1032 /* It might be possible to tighten this worst case */
1033 unsigned int cbAllocated
= 5 * size
;
1036 unsigned int bitsleft
= 0;
1037 unsigned long charsleft
= 0;
1042 return PyString_FromStringAndSize(NULL
, 0);
1044 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
1048 start
= out
= PyString_AS_STRING(v
);
1049 for (;i
< size
; ++i
) {
1050 Py_UNICODE ch
= s
[i
];
1056 } else if (SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
1060 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
1061 inShift
= bitsleft
> 0;
1066 if (!SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
1067 *out
++ = B64(charsleft
<< (6-bitsleft
));
1070 /* Characters not in the BASE64 set implicitly unshift the sequence
1071 so no '-' is required, except if the character is itself a '-' */
1072 if (B64CHAR(ch
) || ch
== '-') {
1079 charsleft
= (charsleft
<< 16) | ch
;
1080 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
1082 /* If the next character is special then we don't need to terminate
1083 the shift sequence. If the next character is not a BASE64 character
1084 or '-' then the shift sequence will be terminated implicitly and we
1085 don't have to insert a '-'. */
1087 if (bitsleft
== 0) {
1089 Py_UNICODE ch2
= s
[i
+1];
1091 if (SPECIAL(ch2
, encodeSetO
, encodeWhiteSpace
)) {
1093 } else if (B64CHAR(ch2
) || ch2
== '-') {
1110 *out
++= B64(charsleft
<< (6-bitsleft
) );
1114 _PyString_Resize(&v
, out
- start
);
1125 /* --- UTF-8 Codec -------------------------------------------------------- */
1128 char utf8_code_length
[256] = {
1129 /* Map UTF-8 encoded prefix byte to sequence length. zero means
1130 illegal prefix. see RFC 2279 for details */
1131 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1132 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1133 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1134 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1135 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1136 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1137 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1138 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1139 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1140 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1141 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1142 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1143 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1144 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1145 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1146 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
1149 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1153 return PyUnicode_DecodeUTF8Stateful(s
, size
, errors
, NULL
);
1156 PyObject
*PyUnicode_DecodeUTF8Stateful(const char *s
,
1161 const char *starts
= s
;
1167 PyUnicodeObject
*unicode
;
1169 const char *errmsg
= "";
1170 PyObject
*errorHandler
= NULL
;
1171 PyObject
*exc
= NULL
;
1173 /* Note: size will always be longer than the resulting Unicode
1175 unicode
= _PyUnicode_New(size
);
1181 return (PyObject
*)unicode
;
1184 /* Unpack UTF-8 encoded data */
1189 Py_UCS4 ch
= (unsigned char)*s
;
1192 *p
++ = (Py_UNICODE
)ch
;
1197 n
= utf8_code_length
[ch
];
1203 errmsg
= "unexpected end of data";
1204 startinpos
= s
-starts
;
1213 errmsg
= "unexpected code byte";
1214 startinpos
= s
-starts
;
1215 endinpos
= startinpos
+1;
1219 errmsg
= "internal error";
1220 startinpos
= s
-starts
;
1221 endinpos
= startinpos
+1;
1225 if ((s
[1] & 0xc0) != 0x80) {
1226 errmsg
= "invalid data";
1227 startinpos
= s
-starts
;
1228 endinpos
= startinpos
+2;
1231 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
1233 startinpos
= s
-starts
;
1234 endinpos
= startinpos
+2;
1235 errmsg
= "illegal encoding";
1239 *p
++ = (Py_UNICODE
)ch
;
1243 if ((s
[1] & 0xc0) != 0x80 ||
1244 (s
[2] & 0xc0) != 0x80) {
1245 errmsg
= "invalid data";
1246 startinpos
= s
-starts
;
1247 endinpos
= startinpos
+3;
1250 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
1252 /* Note: UTF-8 encodings of surrogates are considered
1253 legal UTF-8 sequences;
1255 XXX For wide builds (UCS-4) we should probably try
1256 to recombine the surrogates into a single code
1259 errmsg
= "illegal encoding";
1260 startinpos
= s
-starts
;
1261 endinpos
= startinpos
+3;
1265 *p
++ = (Py_UNICODE
)ch
;
1269 if ((s
[1] & 0xc0) != 0x80 ||
1270 (s
[2] & 0xc0) != 0x80 ||
1271 (s
[3] & 0xc0) != 0x80) {
1272 errmsg
= "invalid data";
1273 startinpos
= s
-starts
;
1274 endinpos
= startinpos
+4;
1277 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
1278 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
1279 /* validate and convert to UTF-16 */
1280 if ((ch
< 0x10000) /* minimum value allowed for 4
1282 || (ch
> 0x10ffff)) /* maximum value allowed for
1285 errmsg
= "illegal encoding";
1286 startinpos
= s
-starts
;
1287 endinpos
= startinpos
+4;
1290 #ifdef Py_UNICODE_WIDE
1291 *p
++ = (Py_UNICODE
)ch
;
1293 /* compute and append the two surrogates: */
1295 /* translate from 10000..10FFFF to 0..FFFF */
1298 /* high surrogate = top 10 bits added to D800 */
1299 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
1301 /* low surrogate = bottom 10 bits added to DC00 */
1302 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
1307 /* Other sizes are only needed for UCS-4 */
1308 errmsg
= "unsupported Unicode code range";
1309 startinpos
= s
-starts
;
1310 endinpos
= startinpos
+n
;
1317 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1318 if (unicode_decode_call_errorhandler(
1319 errors
, &errorHandler
,
1321 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1322 (PyObject
**)&unicode
, &outpos
, &p
))
1326 *consumed
= s
-starts
;
1329 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
1332 Py_XDECREF(errorHandler
);
1334 return (PyObject
*)unicode
;
1337 Py_XDECREF(errorHandler
);
1343 /* Allocation strategy: if the string is short, convert into a stack buffer
1344 and allocate exactly as much space as needed at the end. Else allocate the
1345 maximum possible needed (4 result bytes per Unicode character), and return
1346 the excess memory at the end.
1349 PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
1353 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1355 int i
; /* index into s of next input byte */
1356 PyObject
*v
; /* result string object */
1357 char *p
; /* next free byte in output buffer */
1358 int nallocated
; /* number of result bytes allocated */
1359 int nneeded
; /* number of result bytes needed */
1360 char stackbuf
[MAX_SHORT_UNICHARS
* 4];
1365 if (size
<= MAX_SHORT_UNICHARS
) {
1366 /* Write into the stack buffer; nallocated can't overflow.
1367 * At the end, we'll allocate exactly as much heap space as it
1368 * turns out we need.
1370 nallocated
= Py_SAFE_DOWNCAST(sizeof(stackbuf
), size_t, int);
1371 v
= NULL
; /* will allocate after we're done */
1375 /* Overallocate on the heap, and give the excess back at the end. */
1376 nallocated
= size
* 4;
1377 if (nallocated
/ 4 != size
) /* overflow! */
1378 return PyErr_NoMemory();
1379 v
= PyString_FromStringAndSize(NULL
, nallocated
);
1382 p
= PyString_AS_STRING(v
);
1385 for (i
= 0; i
< size
;) {
1386 Py_UCS4 ch
= s
[i
++];
1392 else if (ch
< 0x0800) {
1393 /* Encode Latin-1 */
1394 *p
++ = (char)(0xc0 | (ch
>> 6));
1395 *p
++ = (char)(0x80 | (ch
& 0x3f));
1398 /* Encode UCS2 Unicode ordinals */
1400 /* Special case: check for high surrogate */
1401 if (0xD800 <= ch
&& ch
<= 0xDBFF && i
!= size
) {
1403 /* Check for low surrogate and combine the two to
1404 form a UCS4 value */
1405 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1406 ch
= ((ch
- 0xD800) << 10 | (ch2
- 0xDC00)) + 0x10000;
1410 /* Fall through: handles isolated high surrogates */
1412 *p
++ = (char)(0xe0 | (ch
>> 12));
1413 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
1414 *p
++ = (char)(0x80 | (ch
& 0x3f));
1418 /* Encode UCS4 Unicode ordinals */
1419 *p
++ = (char)(0xf0 | (ch
>> 18));
1420 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
1421 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
1422 *p
++ = (char)(0x80 | (ch
& 0x3f));
1427 /* This was stack allocated. */
1428 nneeded
= Py_SAFE_DOWNCAST(p
- stackbuf
, long, int);
1429 assert(nneeded
<= nallocated
);
1430 v
= PyString_FromStringAndSize(stackbuf
, nneeded
);
1433 /* Cut back to size actually needed. */
1434 nneeded
= Py_SAFE_DOWNCAST(p
- PyString_AS_STRING(v
), long, int);
1435 assert(nneeded
<= nallocated
);
1436 _PyString_Resize(&v
, nneeded
);
1440 #undef MAX_SHORT_UNICHARS
1443 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
1445 if (!PyUnicode_Check(unicode
)) {
1446 PyErr_BadArgument();
1449 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
1450 PyUnicode_GET_SIZE(unicode
),
1454 /* --- UTF-16 Codec ------------------------------------------------------- */
1457 PyUnicode_DecodeUTF16(const char *s
,
1462 return PyUnicode_DecodeUTF16Stateful(s
, size
, errors
, byteorder
, NULL
);
1466 PyUnicode_DecodeUTF16Stateful(const char *s
,
1472 const char *starts
= s
;
1476 PyUnicodeObject
*unicode
;
1478 const unsigned char *q
, *e
;
1479 int bo
= 0; /* assume native ordering by default */
1480 const char *errmsg
= "";
1481 /* Offsets from q for retrieving byte pairs in the right order. */
1482 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1483 int ihi
= 1, ilo
= 0;
1485 int ihi
= 0, ilo
= 1;
1487 PyObject
*errorHandler
= NULL
;
1488 PyObject
*exc
= NULL
;
1490 /* Note: size will always be longer than the resulting Unicode
1492 unicode
= _PyUnicode_New(size
);
1496 return (PyObject
*)unicode
;
1498 /* Unpack UTF-16 encoded data */
1500 q
= (unsigned char *)s
;
1506 /* Check for BOM marks (U+FEFF) in the input and adjust current
1507 byte order setting accordingly. In native mode, the leading BOM
1508 mark is skipped, in all other modes, it is copied to the output
1509 stream as-is (giving a ZWNBSP character). */
1512 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
1513 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1514 if (bom
== 0xFEFF) {
1518 else if (bom
== 0xFFFE) {
1523 if (bom
== 0xFEFF) {
1527 else if (bom
== 0xFFFE) {
1548 /* remaining bytes at the end? (size should be even) */
1552 errmsg
= "truncated data";
1553 startinpos
= ((const char *)q
)-starts
;
1554 endinpos
= ((const char *)e
)-starts
;
1556 /* The remaining input chars are ignored if the callback
1557 chooses to skip the input */
1559 ch
= (q
[ihi
] << 8) | q
[ilo
];
1563 if (ch
< 0xD800 || ch
> 0xDFFF) {
1568 /* UTF-16 code pair: */
1570 errmsg
= "unexpected end of data";
1571 startinpos
= (((const char *)q
)-2)-starts
;
1572 endinpos
= ((const char *)e
)-starts
;
1575 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
1576 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
1578 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1579 #ifndef Py_UNICODE_WIDE
1583 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
1588 errmsg
= "illegal UTF-16 surrogate";
1589 startinpos
= (((const char *)q
)-4)-starts
;
1590 endinpos
= startinpos
+2;
1595 errmsg
= "illegal encoding";
1596 startinpos
= (((const char *)q
)-2)-starts
;
1597 endinpos
= startinpos
+2;
1598 /* Fall through to report the error */
1601 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1602 if (unicode_decode_call_errorhandler(
1603 errors
, &errorHandler
,
1605 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
1606 (PyObject
**)&unicode
, &outpos
, &p
))
1614 *consumed
= (const char *)q
-starts
;
1617 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
1620 Py_XDECREF(errorHandler
);
1622 return (PyObject
*)unicode
;
1626 Py_XDECREF(errorHandler
);
1632 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
1639 #ifdef Py_UNICODE_WIDE
1642 const int pairs
= 0;
1644 /* Offsets from p for storing byte pairs in the right order. */
1645 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1646 int ihi
= 1, ilo
= 0;
1648 int ihi
= 0, ilo
= 1;
1651 #define STORECHAR(CH) \
1653 p[ihi] = ((CH) >> 8) & 0xff; \
1654 p[ilo] = (CH) & 0xff; \
1658 #ifdef Py_UNICODE_WIDE
1659 for (i
= pairs
= 0; i
< size
; i
++)
1660 if (s
[i
] >= 0x10000)
1663 v
= PyString_FromStringAndSize(NULL
,
1664 2 * (size
+ pairs
+ (byteorder
== 0)));
1668 p
= (unsigned char *)PyString_AS_STRING(v
);
1674 if (byteorder
== -1) {
1679 else if (byteorder
== 1) {
1685 while (size
-- > 0) {
1686 Py_UNICODE ch
= *s
++;
1688 #ifdef Py_UNICODE_WIDE
1689 if (ch
>= 0x10000) {
1690 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
1691 ch
= 0xD800 | ((ch
-0x10000) >> 10);
1702 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
1704 if (!PyUnicode_Check(unicode
)) {
1705 PyErr_BadArgument();
1708 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
1709 PyUnicode_GET_SIZE(unicode
),
1714 /* --- Unicode Escape Codec ----------------------------------------------- */
1716 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
1718 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
1722 const char *starts
= s
;
1731 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
1732 PyObject
*errorHandler
= NULL
;
1733 PyObject
*exc
= NULL
;
1735 /* Escaped strings will always be longer than the resulting
1736 Unicode string, so we start with size here and then reduce the
1737 length after conversion to the true value.
1738 (but if the error callback returns a long replacement string
1739 we'll have to allocate more space) */
1740 v
= _PyUnicode_New(size
);
1744 return (PyObject
*)v
;
1746 p
= PyUnicode_AS_UNICODE(v
);
1754 /* Non-escape characters are interpreted as Unicode ordinals */
1756 *p
++ = (unsigned char) *s
++;
1760 startinpos
= s
-starts
;
1767 case '\\': *p
++ = '\\'; break;
1768 case '\'': *p
++ = '\''; break;
1769 case '\"': *p
++ = '\"'; break;
1770 case 'b': *p
++ = '\b'; break;
1771 case 'f': *p
++ = '\014'; break; /* FF */
1772 case 't': *p
++ = '\t'; break;
1773 case 'n': *p
++ = '\n'; break;
1774 case 'r': *p
++ = '\r'; break;
1775 case 'v': *p
++ = '\013'; break; /* VT */
1776 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
1778 /* \OOO (octal) escapes */
1779 case '0': case '1': case '2': case '3':
1780 case '4': case '5': case '6': case '7':
1782 if ('0' <= *s
&& *s
<= '7') {
1783 x
= (x
<<3) + *s
++ - '0';
1784 if ('0' <= *s
&& *s
<= '7')
1785 x
= (x
<<3) + *s
++ - '0';
1794 message
= "truncated \\xXX escape";
1800 message
= "truncated \\uXXXX escape";
1806 message
= "truncated \\UXXXXXXXX escape";
1809 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1812 if (unicode_decode_call_errorhandler(
1813 errors
, &errorHandler
,
1814 "unicodeescape", "end of string in escape sequence",
1815 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1816 (PyObject
**)&v
, &outpos
, &p
))
1820 for (i
= 0; i
< digits
; ++i
) {
1821 c
= (unsigned char) s
[i
];
1823 endinpos
= (s
+i
+1)-starts
;
1824 if (unicode_decode_call_errorhandler(
1825 errors
, &errorHandler
,
1826 "unicodeescape", message
,
1827 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1828 (PyObject
**)&v
, &outpos
, &p
))
1832 chr
= (chr
<<4) & ~0xF;
1833 if (c
>= '0' && c
<= '9')
1835 else if (c
>= 'a' && c
<= 'f')
1836 chr
+= 10 + c
- 'a';
1838 chr
+= 10 + c
- 'A';
1841 if (chr
== 0xffffffff && PyErr_Occurred())
1842 /* _decoding_error will have already written into the
1846 /* when we get here, chr is a 32-bit unicode character */
1848 /* UCS-2 character */
1849 *p
++ = (Py_UNICODE
) chr
;
1850 else if (chr
<= 0x10ffff) {
1851 /* UCS-4 character. Either store directly, or as
1853 #ifdef Py_UNICODE_WIDE
1857 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
1858 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
1861 endinpos
= s
-starts
;
1862 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1863 if (unicode_decode_call_errorhandler(
1864 errors
, &errorHandler
,
1865 "unicodeescape", "illegal Unicode character",
1866 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1867 (PyObject
**)&v
, &outpos
, &p
))
1874 message
= "malformed \\N character escape";
1875 if (ucnhash_CAPI
== NULL
) {
1876 /* load the unicode data module */
1878 m
= PyImport_ImportModule("unicodedata");
1881 v
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
1885 ucnhash_CAPI
= PyCObject_AsVoidPtr(v
);
1887 if (ucnhash_CAPI
== NULL
)
1891 const char *start
= s
+1;
1892 /* look for the closing brace */
1893 while (*s
!= '}' && s
< end
)
1895 if (s
> start
&& s
< end
&& *s
== '}') {
1896 /* found a name. look it up in the unicode database */
1897 message
= "unknown Unicode character name";
1899 if (ucnhash_CAPI
->getcode(start
, s
-start
-1, &chr
))
1903 endinpos
= s
-starts
;
1904 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1905 if (unicode_decode_call_errorhandler(
1906 errors
, &errorHandler
,
1907 "unicodeescape", message
,
1908 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1909 (PyObject
**)&v
, &outpos
, &p
))
1915 message
= "\\ at end of string";
1917 endinpos
= s
-starts
;
1918 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1919 if (unicode_decode_call_errorhandler(
1920 errors
, &errorHandler
,
1921 "unicodeescape", message
,
1922 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1923 (PyObject
**)&v
, &outpos
, &p
))
1928 *p
++ = (unsigned char)s
[-1];
1935 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))) < 0)
1937 Py_XDECREF(errorHandler
);
1939 return (PyObject
*)v
;
1944 "\\N escapes not supported (can't load unicodedata module)"
1946 Py_XDECREF(errorHandler
);
1952 Py_XDECREF(errorHandler
);
1957 /* Return a Unicode-Escape string version of the Unicode object.
1959 If quotes is true, the string is enclosed in u"" or u'' quotes as
1964 static const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
1969 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
1976 static const char *hexdigit
= "0123456789abcdef";
1978 repr
= PyString_FromStringAndSize(NULL
, 2 + 6*size
+ 1);
1982 p
= PyString_AS_STRING(repr
);
1986 *p
++ = (findchar(s
, size
, '\'') &&
1987 !findchar(s
, size
, '"')) ? '"' : '\'';
1989 while (size
-- > 0) {
1990 Py_UNICODE ch
= *s
++;
1994 (ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1] || ch
== '\\')) {
2000 #ifdef Py_UNICODE_WIDE
2001 /* Map 21-bit characters to '\U00xxxxxx' */
2002 else if (ch
>= 0x10000) {
2003 int offset
= p
- PyString_AS_STRING(repr
);
2005 /* Resize the string if necessary */
2006 if (offset
+ 12 > PyString_GET_SIZE(repr
)) {
2007 if (_PyString_Resize(&repr
, PyString_GET_SIZE(repr
) + 100))
2009 p
= PyString_AS_STRING(repr
) + offset
;
2014 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
2015 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
2016 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
2017 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
2018 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
2019 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
2020 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
2021 *p
++ = hexdigit
[ch
& 0x0000000F];
2025 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2026 else if (ch
>= 0xD800 && ch
< 0xDC00) {
2032 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
2033 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
2036 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
2037 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
2038 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
2039 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
2040 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
2041 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
2042 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
2043 *p
++ = hexdigit
[ucs
& 0x0000000F];
2046 /* Fall through: isolated surrogates are copied as-is */
2051 /* Map 16-bit characters to '\uxxxx' */
2055 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
2056 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
2057 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
2058 *p
++ = hexdigit
[ch
& 0x000F];
2061 /* Map special whitespace to '\t', '\n', '\r' */
2062 else if (ch
== '\t') {
2066 else if (ch
== '\n') {
2070 else if (ch
== '\r') {
2075 /* Map non-printable US ASCII to '\xhh' */
2076 else if (ch
< ' ' || ch
>= 0x7F) {
2079 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
2080 *p
++ = hexdigit
[ch
& 0x000F];
2083 /* Copy everything else as-is */
2088 *p
++ = PyString_AS_STRING(repr
)[1];
2091 _PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
));
2095 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
2098 return unicodeescape_string(s
, size
, 0);
2101 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
2103 if (!PyUnicode_Check(unicode
)) {
2104 PyErr_BadArgument();
2107 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
2108 PyUnicode_GET_SIZE(unicode
));
2111 /* --- Raw Unicode Escape Codec ------------------------------------------- */
2113 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
2117 const char *starts
= s
;
2125 PyObject
*errorHandler
= NULL
;
2126 PyObject
*exc
= NULL
;
2128 /* Escaped strings will always be longer than the resulting
2129 Unicode string, so we start with size here and then reduce the
2130 length after conversion to the true value. (But decoding error
2131 handler might have to resize the string) */
2132 v
= _PyUnicode_New(size
);
2136 return (PyObject
*)v
;
2137 p
= PyUnicode_AS_UNICODE(v
);
2145 /* Non-escape characters are interpreted as Unicode ordinals */
2147 *p
++ = (unsigned char)*s
++;
2150 startinpos
= s
-starts
;
2152 /* \u-escapes are only interpreted iff the number of leading
2153 backslashes is odd */
2158 *p
++ = (unsigned char)*s
++;
2160 if (((s
- bs
) & 1) == 0 ||
2162 (*s
!= 'u' && *s
!= 'U')) {
2166 count
= *s
=='u' ? 4 : 8;
2169 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
2170 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2171 for (x
= 0, i
= 0; i
< count
; ++i
, ++s
) {
2172 c
= (unsigned char)*s
;
2174 endinpos
= s
-starts
;
2175 if (unicode_decode_call_errorhandler(
2176 errors
, &errorHandler
,
2177 "rawunicodeescape", "truncated \\uXXXX",
2178 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2179 (PyObject
**)&v
, &outpos
, &p
))
2184 if (c
>= '0' && c
<= '9')
2186 else if (c
>= 'a' && c
<= 'f')
2191 #ifndef Py_UNICODE_WIDE
2193 if (unicode_decode_call_errorhandler(
2194 errors
, &errorHandler
,
2195 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2196 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2197 (PyObject
**)&v
, &outpos
, &p
))
2205 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))) < 0)
2207 Py_XDECREF(errorHandler
);
2209 return (PyObject
*)v
;
2213 Py_XDECREF(errorHandler
);
2218 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
2225 static const char *hexdigit
= "0123456789abcdef";
2227 #ifdef Py_UNICODE_WIDE
2228 repr
= PyString_FromStringAndSize(NULL
, 10 * size
);
2230 repr
= PyString_FromStringAndSize(NULL
, 6 * size
);
2237 p
= q
= PyString_AS_STRING(repr
);
2238 while (size
-- > 0) {
2239 Py_UNICODE ch
= *s
++;
2240 #ifdef Py_UNICODE_WIDE
2241 /* Map 32-bit characters to '\Uxxxxxxxx' */
2242 if (ch
>= 0x10000) {
2245 *p
++ = hexdigit
[(ch
>> 28) & 0xf];
2246 *p
++ = hexdigit
[(ch
>> 24) & 0xf];
2247 *p
++ = hexdigit
[(ch
>> 20) & 0xf];
2248 *p
++ = hexdigit
[(ch
>> 16) & 0xf];
2249 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
2250 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
2251 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
2252 *p
++ = hexdigit
[ch
& 15];
2256 /* Map 16-bit characters to '\uxxxx' */
2260 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
2261 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
2262 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
2263 *p
++ = hexdigit
[ch
& 15];
2265 /* Copy everything else as-is */
2270 _PyString_Resize(&repr
, p
- q
);
2274 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
2276 if (!PyUnicode_Check(unicode
)) {
2277 PyErr_BadArgument();
2280 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
2281 PyUnicode_GET_SIZE(unicode
));
2284 /* --- Unicode Internal Codec ------------------------------------------- */
2286 PyObject
*_PyUnicode_DecodeUnicodeInternal(const char *s
,
2290 const char *starts
= s
;
2299 PyObject
*errorHandler
= NULL
;
2300 PyObject
*exc
= NULL
;
2302 unimax
= PyUnicode_GetMax();
2303 v
= _PyUnicode_New((size
+Py_UNICODE_SIZE
-1)/ Py_UNICODE_SIZE
);
2306 if (PyUnicode_GetSize((PyObject
*)v
) == 0)
2307 return (PyObject
*)v
;
2308 p
= PyUnicode_AS_UNICODE(v
);
2312 *p
= *(Py_UNICODE
*)s
;
2313 /* We have to sanity check the raw data, otherwise doom looms for
2314 some malformed UCS-4 data. */
2316 #ifdef Py_UNICODE_WIDE
2317 *p
> unimax
|| *p
< 0 ||
2319 end
-s
< Py_UNICODE_SIZE
2322 startinpos
= s
- starts
;
2323 if (end
-s
< Py_UNICODE_SIZE
) {
2324 endinpos
= end
-starts
;
2325 reason
= "truncated input";
2328 endinpos
= s
- starts
+ Py_UNICODE_SIZE
;
2329 reason
= "illegal code point (> 0x10FFFF)";
2331 outpos
= p
- PyUnicode_AS_UNICODE(v
);
2332 if (unicode_decode_call_errorhandler(
2333 errors
, &errorHandler
,
2334 "unicode_internal", reason
,
2335 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2336 (PyObject
**)&v
, &outpos
, &p
)) {
2342 s
+= Py_UNICODE_SIZE
;
2346 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))) < 0)
2348 Py_XDECREF(errorHandler
);
2350 return (PyObject
*)v
;
2354 Py_XDECREF(errorHandler
);
2359 /* --- Latin-1 Codec ------------------------------------------------------ */
2361 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
2368 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2370 Py_UNICODE r
= *(unsigned char*)s
;
2371 return PyUnicode_FromUnicode(&r
, 1);
2374 v
= _PyUnicode_New(size
);
2378 return (PyObject
*)v
;
2379 p
= PyUnicode_AS_UNICODE(v
);
2381 *p
++ = (unsigned char)*s
++;
2382 return (PyObject
*)v
;
2389 /* create or adjust a UnicodeEncodeError */
2390 static void make_encode_exception(PyObject
**exceptionObject
,
2391 const char *encoding
,
2392 const Py_UNICODE
*unicode
, int size
,
2393 int startpos
, int endpos
,
2396 if (*exceptionObject
== NULL
) {
2397 *exceptionObject
= PyUnicodeEncodeError_Create(
2398 encoding
, unicode
, size
, startpos
, endpos
, reason
);
2401 if (PyUnicodeEncodeError_SetStart(*exceptionObject
, startpos
))
2403 if (PyUnicodeEncodeError_SetEnd(*exceptionObject
, endpos
))
2405 if (PyUnicodeEncodeError_SetReason(*exceptionObject
, reason
))
2409 Py_DECREF(*exceptionObject
);
2410 *exceptionObject
= NULL
;
2414 /* raises a UnicodeEncodeError */
2415 static void raise_encode_exception(PyObject
**exceptionObject
,
2416 const char *encoding
,
2417 const Py_UNICODE
*unicode
, int size
,
2418 int startpos
, int endpos
,
2421 make_encode_exception(exceptionObject
,
2422 encoding
, unicode
, size
, startpos
, endpos
, reason
);
2423 if (*exceptionObject
!= NULL
)
2424 PyCodec_StrictErrors(*exceptionObject
);
2427 /* error handling callback helper:
2428 build arguments, call the callback and check the arguments,
2429 put the result into newpos and return the replacement string, which
2430 has to be freed by the caller */
2431 static PyObject
*unicode_encode_call_errorhandler(const char *errors
,
2432 PyObject
**errorHandler
,
2433 const char *encoding
, const char *reason
,
2434 const Py_UNICODE
*unicode
, int size
, PyObject
**exceptionObject
,
2435 int startpos
, int endpos
,
2438 static char *argparse
= "O!i;encoding error handler must return (unicode, int) tuple";
2441 PyObject
*resunicode
;
2443 if (*errorHandler
== NULL
) {
2444 *errorHandler
= PyCodec_LookupError(errors
);
2445 if (*errorHandler
== NULL
)
2449 make_encode_exception(exceptionObject
,
2450 encoding
, unicode
, size
, startpos
, endpos
, reason
);
2451 if (*exceptionObject
== NULL
)
2454 restuple
= PyObject_CallFunctionObjArgs(
2455 *errorHandler
, *exceptionObject
, NULL
);
2456 if (restuple
== NULL
)
2458 if (!PyTuple_Check(restuple
)) {
2459 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
2460 Py_DECREF(restuple
);
2463 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
2464 &resunicode
, newpos
)) {
2465 Py_DECREF(restuple
);
2469 *newpos
= size
+*newpos
;
2470 if (*newpos
<0 || *newpos
>size
) {
2471 PyErr_Format(PyExc_IndexError
, "position %d from error handler out of bounds", *newpos
);
2472 Py_DECREF(restuple
);
2475 Py_INCREF(resunicode
);
2476 Py_DECREF(restuple
);
2480 static PyObject
*unicode_encode_ucs1(const Py_UNICODE
*p
,
2487 /* pointers to the beginning and end+1 of input */
2488 const Py_UNICODE
*startp
= p
;
2489 const Py_UNICODE
*endp
= p
+ size
;
2490 /* pointer to the beginning of the unencodable characters */
2491 /* const Py_UNICODE *badp = NULL; */
2492 /* pointer into the output */
2494 /* current output position */
2497 char *encoding
= (limit
== 256) ? "latin-1" : "ascii";
2498 char *reason
= (limit
== 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2499 PyObject
*errorHandler
= NULL
;
2500 PyObject
*exc
= NULL
;
2501 /* the following variable is used for caching string comparisons
2502 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2503 int known_errorHandler
= -1;
2505 /* allocate enough for a simple encoding without
2506 replacements, if we need more, we'll resize */
2507 res
= PyString_FromStringAndSize(NULL
, size
);
2512 str
= PyString_AS_STRING(res
);
2518 /* can we encode this? */
2520 /* no overflow check, because we know that the space is enough */
2525 int unicodepos
= p
-startp
;
2527 PyObject
*repunicode
;
2532 /* startpos for collecting unencodable chars */
2533 const Py_UNICODE
*collstart
= p
;
2534 const Py_UNICODE
*collend
= p
;
2535 /* find all unencodable characters */
2536 while ((collend
< endp
) && ((*collend
)>=limit
))
2538 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2539 if (known_errorHandler
==-1) {
2540 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
2541 known_errorHandler
= 1;
2542 else if (!strcmp(errors
, "replace"))
2543 known_errorHandler
= 2;
2544 else if (!strcmp(errors
, "ignore"))
2545 known_errorHandler
= 3;
2546 else if (!strcmp(errors
, "xmlcharrefreplace"))
2547 known_errorHandler
= 4;
2549 known_errorHandler
= 0;
2551 switch (known_errorHandler
) {
2552 case 1: /* strict */
2553 raise_encode_exception(&exc
, encoding
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
2555 case 2: /* replace */
2556 while (collstart
++<collend
)
2557 *str
++ = '?'; /* fall through */
2558 case 3: /* ignore */
2561 case 4: /* xmlcharrefreplace */
2562 respos
= str
-PyString_AS_STRING(res
);
2563 /* determine replacement size (temporarily (mis)uses p) */
2564 for (p
= collstart
, repsize
= 0; p
< collend
; ++p
) {
2573 #ifndef Py_UNICODE_WIDE
2579 else if (*p
<1000000)
2585 requiredsize
= respos
+repsize
+(endp
-collend
);
2586 if (requiredsize
> ressize
) {
2587 if (requiredsize
<2*ressize
)
2588 requiredsize
= 2*ressize
;
2589 if (_PyString_Resize(&res
, requiredsize
))
2591 str
= PyString_AS_STRING(res
) + respos
;
2592 ressize
= requiredsize
;
2594 /* generate replacement (temporarily (mis)uses p) */
2595 for (p
= collstart
; p
< collend
; ++p
) {
2596 str
+= sprintf(str
, "&#%d;", (int)*p
);
2601 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
2602 encoding
, reason
, startp
, size
, &exc
,
2603 collstart
-startp
, collend
-startp
, &newpos
);
2604 if (repunicode
== NULL
)
2606 /* need more space? (at least enough for what we
2607 have+the replacement+the rest of the string, so
2608 we won't have to check space for encodable characters) */
2609 respos
= str
-PyString_AS_STRING(res
);
2610 repsize
= PyUnicode_GET_SIZE(repunicode
);
2611 requiredsize
= respos
+repsize
+(endp
-collend
);
2612 if (requiredsize
> ressize
) {
2613 if (requiredsize
<2*ressize
)
2614 requiredsize
= 2*ressize
;
2615 if (_PyString_Resize(&res
, requiredsize
)) {
2616 Py_DECREF(repunicode
);
2619 str
= PyString_AS_STRING(res
) + respos
;
2620 ressize
= requiredsize
;
2622 /* check if there is anything unencodable in the replacement
2623 and copy it to the output */
2624 for (uni2
= PyUnicode_AS_UNICODE(repunicode
);repsize
-->0; ++uni2
, ++str
) {
2627 raise_encode_exception(&exc
, encoding
, startp
, size
,
2628 unicodepos
, unicodepos
+1, reason
);
2629 Py_DECREF(repunicode
);
2634 p
= startp
+ newpos
;
2635 Py_DECREF(repunicode
);
2639 /* Resize if we allocated too much */
2640 respos
= str
-PyString_AS_STRING(res
);
2642 /* If this fails, res will be NULL */
2643 _PyString_Resize(&res
, respos
);
2644 Py_XDECREF(errorHandler
);
2650 Py_XDECREF(errorHandler
);
2655 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
2659 return unicode_encode_ucs1(p
, size
, errors
, 256);
2662 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
2664 if (!PyUnicode_Check(unicode
)) {
2665 PyErr_BadArgument();
2668 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
2669 PyUnicode_GET_SIZE(unicode
),
2673 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2675 PyObject
*PyUnicode_DecodeASCII(const char *s
,
2679 const char *starts
= s
;
2686 PyObject
*errorHandler
= NULL
;
2687 PyObject
*exc
= NULL
;
2689 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2690 if (size
== 1 && *(unsigned char*)s
< 128) {
2691 Py_UNICODE r
= *(unsigned char*)s
;
2692 return PyUnicode_FromUnicode(&r
, 1);
2695 v
= _PyUnicode_New(size
);
2699 return (PyObject
*)v
;
2700 p
= PyUnicode_AS_UNICODE(v
);
2703 register unsigned char c
= (unsigned char)*s
;
2709 startinpos
= s
-starts
;
2710 endinpos
= startinpos
+ 1;
2711 outpos
= p
- (Py_UNICODE
*)PyUnicode_AS_UNICODE(v
);
2712 if (unicode_decode_call_errorhandler(
2713 errors
, &errorHandler
,
2714 "ascii", "ordinal not in range(128)",
2715 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2716 (PyObject
**)&v
, &outpos
, &p
))
2720 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
2721 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))) < 0)
2723 Py_XDECREF(errorHandler
);
2725 return (PyObject
*)v
;
2729 Py_XDECREF(errorHandler
);
2734 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
2738 return unicode_encode_ucs1(p
, size
, errors
, 128);
2741 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
2743 if (!PyUnicode_Check(unicode
)) {
2744 PyErr_BadArgument();
2747 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
2748 PyUnicode_GET_SIZE(unicode
),
2752 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2754 /* --- MBCS codecs for Windows -------------------------------------------- */
2756 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
2763 /* First get the size of the result */
2764 DWORD usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
2765 if (size
> 0 && usize
==0)
2766 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2768 v
= _PyUnicode_New(usize
);
2772 return (PyObject
*)v
;
2773 p
= PyUnicode_AS_UNICODE(v
);
2774 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
2776 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2779 return (PyObject
*)v
;
2782 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
2790 /* If there are no characters, bail now! */
2792 return PyString_FromString("");
2794 /* First get the size of the result */
2795 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
2797 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2799 repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
2805 /* Do the conversion */
2806 s
= PyString_AS_STRING(repr
);
2807 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
2809 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2814 PyObject
*PyUnicode_AsMBCSString(PyObject
*unicode
)
2816 if (!PyUnicode_Check(unicode
)) {
2817 PyErr_BadArgument();
2820 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode
),
2821 PyUnicode_GET_SIZE(unicode
),
2825 #endif /* MS_WINDOWS */
2827 /* --- Character Mapping Codec -------------------------------------------- */
2829 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
2834 const char *starts
= s
;
2842 PyObject
*errorHandler
= NULL
;
2843 PyObject
*exc
= NULL
;
2844 Py_UNICODE
*mapstring
= NULL
;
2847 /* Default to Latin-1 */
2848 if (mapping
== NULL
)
2849 return PyUnicode_DecodeLatin1(s
, size
, errors
);
2851 v
= _PyUnicode_New(size
);
2855 return (PyObject
*)v
;
2856 p
= PyUnicode_AS_UNICODE(v
);
2858 if (PyUnicode_CheckExact(mapping
)) {
2859 mapstring
= PyUnicode_AS_UNICODE(mapping
);
2860 maplen
= PyUnicode_GET_SIZE(mapping
);
2862 unsigned char ch
= *s
;
2863 Py_UNICODE x
= 0xfffe; /* illegal value */
2869 /* undefined mapping */
2870 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2871 startinpos
= s
-starts
;
2872 endinpos
= startinpos
+1;
2873 if (unicode_decode_call_errorhandler(
2874 errors
, &errorHandler
,
2875 "charmap", "character maps to <undefined>",
2876 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2877 (PyObject
**)&v
, &outpos
, &p
)) {
2888 unsigned char ch
= *s
;
2891 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2892 w
= PyInt_FromLong((long)ch
);
2895 x
= PyObject_GetItem(mapping
, w
);
2898 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2899 /* No mapping found means: mapping is undefined. */
2908 if (PyInt_Check(x
)) {
2909 long value
= PyInt_AS_LONG(x
);
2910 if (value
< 0 || value
> 65535) {
2911 PyErr_SetString(PyExc_TypeError
,
2912 "character mapping must be in range(65536)");
2916 *p
++ = (Py_UNICODE
)value
;
2918 else if (x
== Py_None
) {
2919 /* undefined mapping */
2920 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2921 startinpos
= s
-starts
;
2922 endinpos
= startinpos
+1;
2923 if (unicode_decode_call_errorhandler(
2924 errors
, &errorHandler
,
2925 "charmap", "character maps to <undefined>",
2926 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2927 (PyObject
**)&v
, &outpos
, &p
)) {
2933 else if (PyUnicode_Check(x
)) {
2934 int targetsize
= PyUnicode_GET_SIZE(x
);
2936 if (targetsize
== 1)
2938 *p
++ = *PyUnicode_AS_UNICODE(x
);
2940 else if (targetsize
> 1) {
2942 if (targetsize
> extrachars
) {
2944 int oldpos
= (int)(p
- PyUnicode_AS_UNICODE(v
));
2945 int needed
= (targetsize
- extrachars
) + \
2947 extrachars
+= needed
;
2948 if (_PyUnicode_Resize(&v
,
2949 PyUnicode_GET_SIZE(v
) + needed
) < 0) {
2953 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
2956 PyUnicode_AS_UNICODE(x
),
2959 extrachars
-= targetsize
;
2961 /* 1-0 mapping: skip the character */
2964 /* wrong return value */
2965 PyErr_SetString(PyExc_TypeError
,
2966 "character mapping must return integer, None or unicode");
2974 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2975 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))) < 0)
2977 Py_XDECREF(errorHandler
);
2979 return (PyObject
*)v
;
2982 Py_XDECREF(errorHandler
);
2988 /* Look up the character c in the mapping. If the character
2989 can't be found, Py_None is returned (or NULL, if another
2991 static PyObject
*charmapencode_lookup(Py_UNICODE c
, PyObject
*mapping
)
2993 PyObject
*w
= PyInt_FromLong((long)c
);
2998 x
= PyObject_GetItem(mapping
, w
);
3001 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
3002 /* No mapping found means: mapping is undefined. */
3010 else if (x
== Py_None
)
3012 else if (PyInt_Check(x
)) {
3013 long value
= PyInt_AS_LONG(x
);
3014 if (value
< 0 || value
> 255) {
3015 PyErr_SetString(PyExc_TypeError
,
3016 "character mapping must be in range(256)");
3022 else if (PyString_Check(x
))
3025 /* wrong return value */
3026 PyErr_SetString(PyExc_TypeError
,
3027 "character mapping must return integer, None or str");
3033 /* lookup the character, put the result in the output string and adjust
3034 various state variables. Reallocate the output string if not enough
3035 space is available. Return a new reference to the object that
3036 was put in the output buffer, or Py_None, if the mapping was undefined
3037 (in which case no character was written) or NULL, if a
3038 reallocation error occurred. The caller must decref the result */
3040 PyObject
*charmapencode_output(Py_UNICODE c
, PyObject
*mapping
,
3041 PyObject
**outobj
, int *outpos
)
3043 PyObject
*rep
= charmapencode_lookup(c
, mapping
);
3047 else if (rep
==Py_None
)
3050 char *outstart
= PyString_AS_STRING(*outobj
);
3051 int outsize
= PyString_GET_SIZE(*outobj
);
3052 if (PyInt_Check(rep
)) {
3053 int requiredsize
= *outpos
+1;
3054 if (outsize
<requiredsize
) {
3055 /* exponentially overallocate to minimize reallocations */
3056 if (requiredsize
< 2*outsize
)
3057 requiredsize
= 2*outsize
;
3058 if (_PyString_Resize(outobj
, requiredsize
)) {
3062 outstart
= PyString_AS_STRING(*outobj
);
3064 outstart
[(*outpos
)++] = (char)PyInt_AS_LONG(rep
);
3067 const char *repchars
= PyString_AS_STRING(rep
);
3068 int repsize
= PyString_GET_SIZE(rep
);
3069 int requiredsize
= *outpos
+repsize
;
3070 if (outsize
<requiredsize
) {
3071 /* exponentially overallocate to minimize reallocations */
3072 if (requiredsize
< 2*outsize
)
3073 requiredsize
= 2*outsize
;
3074 if (_PyString_Resize(outobj
, requiredsize
)) {
3078 outstart
= PyString_AS_STRING(*outobj
);
3080 memcpy(outstart
+ *outpos
, repchars
, repsize
);
3087 /* Handle an encoding error reported while PyUnicode_EncodeCharmap runs.
   Starting at *inpos the run of characters that the mapping cannot encode
   is collected, then processed according to the policy named by errors:
   strict, replace, ignore and xmlcharrefreplace are recognised by name and
   handled inline; any other name is forwarded to
   unicode_encode_call_errorhandler.  *known_errorHandler caches the result
   of the name comparison across calls.
   NOTE(review): this listing is fragmentary; several branch bodies and
   return statements are not visible here.
   Return 0 on success, -1 on error. */
3090 int charmap_encoding_error(
3091 const Py_UNICODE
*p
, int size
, int *inpos
, PyObject
*mapping
,
3092 PyObject
**exceptionObject
,
3093 int *known_errorHandler
, PyObject
**errorHandler
, const char *errors
,
3094 PyObject
**res
, int *respos
)
3096 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
3100 /* startpos for collecting unencodable chars */
3101 int collstartpos
= *inpos
;
3102 int collendpos
= *inpos
+1;
3104 char *encoding
= "charmap";
3105 char *reason
= "character maps to <undefined>";
3108 /* find all unencodable characters */
3109 while (collendpos
< size
) {
3110 x
= charmapencode_lookup(p
[collendpos
], mapping
);
3113 else if (x
!=Py_None
) {
3120 /* cache callback name lookup
3121 * (if not done yet, i.e. it's the first error) */
3122 if (*known_errorHandler
==-1) {
3123 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3124 *known_errorHandler
= 1;
3125 else if (!strcmp(errors
, "replace"))
3126 *known_errorHandler
= 2;
3127 else if (!strcmp(errors
, "ignore"))
3128 *known_errorHandler
= 3;
3129 else if (!strcmp(errors
, "xmlcharrefreplace"))
3130 *known_errorHandler
= 4;
3132 *known_errorHandler
= 0;
3134 switch (*known_errorHandler
) {
3135 case 1: /* strict */
3136 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
/* one question mark per collected character, itself routed through the
   mapping; if the mapping cannot encode it either, an exception is raised */
3138 case 2: /* replace */
3139 for (collpos
= collstartpos
; collpos
<collendpos
; ++collpos
) {
3140 x
= charmapencode_output('?', mapping
, res
, respos
);
3144 else if (x
==Py_None
) {
3146 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
/* the collected run is skipped without producing output */
3152 case 3: /* ignore */
3153 *inpos
= collendpos
;
3155 case 4: /* xmlcharrefreplace */
3156 /* generate replacement (temporarily (mis)uses p) */
3157 for (collpos
= collstartpos
; collpos
< collendpos
; ++collpos
) {
3158 char buffer
[2+29+1+1];
/* format the decimal character reference, then push each of its
   characters through the mapping like ordinary input */
3160 sprintf(buffer
, "&#%d;", (int)p
[collpos
]);
3161 for (cp
= buffer
; *cp
; ++cp
) {
3162 x
= charmapencode_output(*cp
, mapping
, res
, respos
);
3165 else if (x
==Py_None
) {
3167 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
3173 *inpos
= collendpos
;
/* any other policy name: ask the registered error handler for a
   replacement unicode string and a new input position */
3176 repunicode
= unicode_encode_call_errorhandler(errors
, errorHandler
,
3177 encoding
, reason
, p
, size
, exceptionObject
,
3178 collstartpos
, collendpos
, &newpos
);
3179 if (repunicode
== NULL
)
3181 /* generate replacement */
3182 repsize
= PyUnicode_GET_SIZE(repunicode
);
3183 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
3184 x
= charmapencode_output(*uni2
, mapping
, res
, respos
);
3186 Py_DECREF(repunicode
);
3189 else if (x
==Py_None
) {
3190 Py_DECREF(repunicode
);
3192 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
3198 Py_DECREF(repunicode
);
/* Encode the Py_UNICODE buffer p through a character mapping.
   A NULL mapping forwards the call to PyUnicode_EncodeLatin1.  Otherwise a
   result string of length size is preallocated, every input character is
   sent through charmapencode_output, characters for which that call yields
   Py_None are passed to charmap_encoding_error, and the result is finally
   resized to the number of positions actually written (respos).
   errorHandler is released with Py_XDECREF on the visible exit paths.
   NOTE(review): the remaining parameters of the signature are not visible
   in this listing. */
3203 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
3209 PyObject
*res
= NULL
;
3210 /* current input position */
3212 /* current output position */
3214 PyObject
*errorHandler
= NULL
;
3215 PyObject
*exc
= NULL
;
3216 /* the following variable is used for caching string comparisons
3217 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3218 * 3=ignore, 4=xmlcharrefreplace */
3219 int known_errorHandler
= -1;
3221 /* Default to Latin-1 */
3222 if (mapping
== NULL
)
3223 return PyUnicode_EncodeLatin1(p
, size
, errors
);
3225 /* allocate enough for a simple encoding without
3226 replacements, if we need more, we'll resize */
3227 res
= PyString_FromStringAndSize(NULL
, size
);
3233 while (inpos
<size
) {
3234 /* try to encode it */
3235 PyObject
*x
= charmapencode_output(p
[inpos
], mapping
, &res
, &respos
);
3236 if (x
==NULL
) /* error */
3238 if (x
==Py_None
) { /* unencodable character */
3239 if (charmap_encoding_error(p
, size
, &inpos
, mapping
,
3241 &known_errorHandler
, &errorHandler
, errors
,
3248 /* done with this character => adjust input position */
3253 /* Resize if we allocated too much */
3254 if (respos
<PyString_GET_SIZE(res
)) {
3255 if (_PyString_Resize(&res
, respos
))
3259 Py_XDECREF(errorHandler
);
3265 Py_XDECREF(errorHandler
);
/* Object-level wrapper around PyUnicode_EncodeCharmap.
   Flags a bad argument (PyErr_BadArgument) when unicode is not a Unicode
   object or mapping is NULL; otherwise encodes the object's buffer and
   length.  The trailing call arguments are not visible in this listing. */
3269 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
3272 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
3273 PyErr_BadArgument();
3276 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
3277 PyUnicode_GET_SIZE(unicode
),
3282 /* Create or adjust a UnicodeTranslateError.
   When *exceptionObject is NULL a new exception is built with
   PyUnicodeTranslateError_Create; otherwise the existing object gets its
   start, end and reason updated.  The visible tail drops the reference and
   resets *exceptionObject to NULL -- presumably the path taken when one of
   the setters fails (the jump targets are not visible here; confirm). */
3283 static void make_translate_exception(PyObject
**exceptionObject
,
3284 const Py_UNICODE
*unicode
, int size
,
3285 int startpos
, int endpos
,
3288 if (*exceptionObject
== NULL
) {
3289 *exceptionObject
= PyUnicodeTranslateError_Create(
3290 unicode
, size
, startpos
, endpos
, reason
);
3293 if (PyUnicodeTranslateError_SetStart(*exceptionObject
, startpos
))
3295 if (PyUnicodeTranslateError_SetEnd(*exceptionObject
, endpos
))
3297 if (PyUnicodeTranslateError_SetReason(*exceptionObject
, reason
))
3301 Py_DECREF(*exceptionObject
);
3302 *exceptionObject
= NULL
;
3306 /* Raise a UnicodeTranslateError.
   The exception object is created or refreshed by make_translate_exception
   and, if one is available afterwards, handed to PyCodec_StrictErrors. */
3307 static void raise_translate_exception(PyObject
**exceptionObject
,
3308 const Py_UNICODE
*unicode
, int size
,
3309 int startpos
, int endpos
,
3312 make_translate_exception(exceptionObject
,
3313 unicode
, size
, startpos
, endpos
, reason
);
3314 if (*exceptionObject
!= NULL
)
3315 PyCodec_StrictErrors(*exceptionObject
);
3318 /* Error handling callback helper for translation:
   build the exception argument, call the callback and check its result,
   store the new input position in *newpos and return the replacement
   string, which has to be freed by the caller. */
3322 static PyObject
*unicode_translate_call_errorhandler(const char *errors
,
3323 PyObject
**errorHandler
,
3325 const Py_UNICODE
*unicode
, int size
, PyObject
**exceptionObject
,
3326 int startpos
, int endpos
,
3329 static char *argparse
= "O!i;translating error handler must return (unicode, int) tuple";
3332 PyObject
*resunicode
;
/* the handler is looked up by name only once and cached in *errorHandler */
3334 if (*errorHandler
== NULL
) {
3335 *errorHandler
= PyCodec_LookupError(errors
);
3336 if (*errorHandler
== NULL
)
3340 make_translate_exception(exceptionObject
,
3341 unicode
, size
, startpos
, endpos
, reason
);
3342 if (*exceptionObject
== NULL
)
3345 restuple
= PyObject_CallFunctionObjArgs(
3346 *errorHandler
, *exceptionObject
, NULL
);
3347 if (restuple
== NULL
)
/* &argparse[4] skips the four-character conversion prefix of argparse, so
   only the message text that follows it is used for the TypeError */
3349 if (!PyTuple_Check(restuple
)) {
3350 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
3351 Py_DECREF(restuple
);
3354 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
3355 &resunicode
, newpos
)) {
3356 Py_DECREF(restuple
);
/* rebases the position on the end of the input; presumably applied only to
   negative positions (the guarding line is not visible here -- confirm) */
3360 *newpos
= size
+*newpos
;
3361 if (*newpos
<0 || *newpos
>size
) {
3362 PyErr_Format(PyExc_IndexError
, "position %d from error handler out of bounds", *newpos
);
3363 Py_DECREF(restuple
);
/* take a reference to the replacement before releasing the tuple it was
   parsed out of */
3366 Py_INCREF(resunicode
);
3367 Py_DECREF(restuple
);
3371 /* Look up the character c in the mapping and put the result in *result,
   which must be decrefed by the caller.
   The character is converted to a Python int and used as the key for
   PyObject_GetItem.  Visible outcomes: a LookupError means no mapping was
   found (use a 1:1 mapping); Py_None; an int, which must lie between 0 and
   PyUnicode_GetMax() inclusive or a TypeError is set; a unicode object;
   anything else sets a TypeError.
   Return 0 on success, -1 on error */
3375 int charmaptranslate_lookup(Py_UNICODE c
, PyObject
*mapping
, PyObject
**result
)
3377 PyObject
*w
= PyInt_FromLong((long)c
);
3382 x
= PyObject_GetItem(mapping
, w
);
3385 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
3386 /* No mapping found means: use 1:1 mapping. */
3393 else if (x
== Py_None
) {
3397 else if (PyInt_Check(x
)) {
3398 long value
= PyInt_AS_LONG(x
);
3399 long max
= PyUnicode_GetMax();
3400 if (value
< 0 || value
> max
) {
3401 PyErr_Format(PyExc_TypeError
,
3402 "character mapping must be in range(0x%lx)", max
+1);
3409 else if (PyUnicode_Check(x
)) {
3414 /* wrong return value */
3415 PyErr_SetString(PyExc_TypeError
,
3416 "character mapping must return integer, None or unicode");
3421 /* Ensure that *outobj is at least requiredsize characters long;
   if not, reallocate and adjust various state variables.
   Growth is at least a doubling of the old size.
   Return 0 on success, -1 on error */
3425 int charmaptranslate_makespace(PyObject
**outobj
, Py_UNICODE
**outp
,
3428 int oldsize
= PyUnicode_GET_SIZE(*outobj
);
3429 if (requiredsize
> oldsize
) {
3430 /* remember old output position */
3431 int outpos
= *outp
-PyUnicode_AS_UNICODE(*outobj
);
3432 /* exponentially overallocate to minimize reallocations */
3433 if (requiredsize
< 2 * oldsize
)
3434 requiredsize
= 2 * oldsize
;
3435 if (_PyUnicode_Resize(outobj
, requiredsize
) < 0)
/* re-derive the write pointer from the saved offset, using the buffer
   address as it is after the resize */
3437 *outp
= PyUnicode_AS_UNICODE(*outobj
) + outpos
;
3441 /* Look up the character, put the result in the output string and adjust
   various state variables.  Return a new reference to the object that
   was put in the output buffer in *result, or Py_None, if the mapping was
   undefined (in which case no character was written).
   The caller must decref result.
   Return 0 on success, -1 on error. */
3448 int charmaptranslate_output(const Py_UNICODE
*startinp
, const Py_UNICODE
*curinp
,
3449 int insize
, PyObject
*mapping
, PyObject
**outobj
, Py_UNICODE
**outp
,
3452 if (charmaptranslate_lookup(*curinp
, mapping
, res
))
3455 /* not found => default to 1:1 mapping */
3456 *(*outp
)++ = *curinp
;
3458 else if (*res
==Py_None
)
3460 else if (PyInt_Check(*res
)) {
3461 /* no overflow check, because we know that the space is enough */
3462 *(*outp
)++ = (Py_UNICODE
)PyInt_AS_LONG(*res
);
3464 else if (PyUnicode_Check(*res
)) {
3465 int repsize
= PyUnicode_GET_SIZE(*res
);
3467 /* no overflow check, because we know that the space is enough */
3468 *(*outp
)++ = *PyUnicode_AS_UNICODE(*res
);
3470 else if (repsize
!=0) {
3471 /* more than one character */
/* required size = characters already written + input characters from the
   current one to the end + a further term that is not visible here */
3472 int requiredsize
= (*outp
-PyUnicode_AS_UNICODE(*outobj
)) +
3473 (insize
- (curinp
-startinp
)) +
3475 if (charmaptranslate_makespace(outobj
, outp
, requiredsize
))
3477 memcpy(*outp
, PyUnicode_AS_UNICODE(*res
), sizeof(Py_UNICODE
)*repsize
);
/* Translate the Py_UNICODE buffer p through mapping.
   A NULL mapping is flagged with PyErr_BadArgument.  The result is
   preallocated for a 1:1 translation and grown on demand with
   charmaptranslate_makespace.  Each character goes through
   charmaptranslate_output; when the mapping yields Py_None the character is
   treated as untranslatable, the run of such characters is collected and
   the policy named by errors is applied (strict, replace, ignore and
   xmlcharrefreplace inline, anything else through
   unicode_translate_call_errorhandler).  Finally the result is resized to
   the number of characters written.
   NOTE(review): the rest of the signature and several statement bodies are
   not visible in this listing. */
3486 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*p
,
3492 PyObject
*res
= NULL
;
3493 /* pointers to the beginning and end+1 of input */
3494 const Py_UNICODE
*startp
= p
;
3495 const Py_UNICODE
*endp
= p
+ size
;
3496 /* pointer into the output */
3498 /* current output position */
3500 char *reason
= "character maps to <undefined>";
3501 PyObject
*errorHandler
= NULL
;
3502 PyObject
*exc
= NULL
;
3503 /* the following variable is used for caching string comparisons
3504 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3505 * 3=ignore, 4=xmlcharrefreplace */
3506 int known_errorHandler
= -1;
3508 if (mapping
== NULL
) {
3509 PyErr_BadArgument();
3513 /* allocate enough for a simple 1:1 translation without
3514 replacements, if we need more, we'll resize */
3515 res
= PyUnicode_FromUnicode(NULL
, size
);
3520 str
= PyUnicode_AS_UNICODE(res
);
3523 /* try to encode it */
3525 if (charmaptranslate_output(startp
, p
, size
, mapping
, &res
, &str
, &x
)) {
3530 if (x
!=Py_None
) /* it worked => adjust input pointer */
3532 else { /* untranslatable character */
3533 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
3537 /* startpos for collecting untranslatable chars */
3538 const Py_UNICODE
*collstart
= p
;
3539 const Py_UNICODE
*collend
= p
+1;
3540 const Py_UNICODE
*coll
;
3542 /* find all untranslatable characters */
3543 while (collend
< endp
) {
3544 if (charmaptranslate_lookup(*collend
, mapping
, &x
))
3551 /* cache callback name lookup
3552 * (if not done yet, i.e. it's the first error) */
3553 if (known_errorHandler
==-1) {
3554 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3555 known_errorHandler
= 1;
3556 else if (!strcmp(errors
, "replace"))
3557 known_errorHandler
= 2;
3558 else if (!strcmp(errors
, "ignore"))
3559 known_errorHandler
= 3;
3560 else if (!strcmp(errors
, "xmlcharrefreplace"))
3561 known_errorHandler
= 4;
3563 known_errorHandler
= 0;
3565 switch (known_errorHandler
) {
3566 case 1: /* strict */
3567 raise_translate_exception(&exc
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
3569 case 2: /* replace */
3570 /* No need to check for space, this is a 1:1 replacement */
3571 for (coll
= collstart
; coll
<collend
; ++coll
)
3574 case 3: /* ignore */
3577 case 4: /* xmlcharrefreplace */
3578 /* generate replacement (temporarily (mis)uses p) */
3579 for (p
= collstart
; p
< collend
; ++p
) {
3580 char buffer
[2+29+1+1];
3582 sprintf(buffer
, "&#%d;", (int)*p
);
/* room needed: what is already written + this reference + the input that
   follows the collected run */
3583 if (charmaptranslate_makespace(&res
, &str
,
3584 (str
-PyUnicode_AS_UNICODE(res
))+strlen(buffer
)+(endp
-collend
)))
3586 for (cp
= buffer
; *cp
; ++cp
)
/* any other policy name: call the registered error handler, which supplies
   the replacement string and the position to resume at (newpos) */
3592 repunicode
= unicode_translate_call_errorhandler(errors
, &errorHandler
,
3593 reason
, startp
, size
, &exc
,
3594 collstart
-startp
, collend
-startp
, &newpos
);
3595 if (repunicode
== NULL
)
3597 /* generate replacement */
3598 repsize
= PyUnicode_GET_SIZE(repunicode
);
3599 if (charmaptranslate_makespace(&res
, &str
,
3600 (str
-PyUnicode_AS_UNICODE(res
))+repsize
+(endp
-collend
))) {
3601 Py_DECREF(repunicode
);
3604 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
)
3606 p
= startp
+ newpos
;
3607 Py_DECREF(repunicode
);
3611 /* Resize if we allocated too much */
3612 respos
= str
-PyUnicode_AS_UNICODE(res
);
3613 if (respos
<PyUnicode_GET_SIZE(res
)) {
3614 if (_PyUnicode_Resize(&res
, respos
) < 0)
3618 Py_XDECREF(errorHandler
);
3624 Py_XDECREF(errorHandler
);
/* Object-level wrapper: coerces str with PyUnicode_FromObject and runs
   PyUnicode_TranslateCharmap on the coerced object's buffer and length.
   The remaining parameters and the cleanup code are not visible here. */
3628 PyObject
*PyUnicode_Translate(PyObject
*str
,
3634 str
= PyUnicode_FromObject(str
);
3637 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
3638 PyUnicode_GET_SIZE(str
),
3649 /* --- Decimal Encoder ---------------------------------------------------- */
/* Convert the Py_UNICODE buffer s into the char buffer output.
   A NULL output is flagged with PyErr_BadArgument.  For every input
   character the visible code special-cases whitespace, writes a decimal
   digit as the character zero plus Py_UNICODE_TODECIMAL(ch), copies
   characters strictly between 0 and 256 unchanged, and treats everything
   else as unencodable, applying the policy named by errors.  The output is
   0-terminated at the end.
   NOTE(review): no bound on output is visible anywhere in this function;
   the size contract must be confirmed against the callers. */
3651 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
3656 Py_UNICODE
*p
, *end
;
3657 PyObject
*errorHandler
= NULL
;
3658 PyObject
*exc
= NULL
;
3659 const char *encoding
= "decimal";
3660 const char *reason
= "invalid decimal Unicode string";
3661 /* the following variable is used for caching string comparisons
3662 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3663 int known_errorHandler
= -1;
3665 if (output
== NULL
) {
3666 PyErr_BadArgument();
3673 register Py_UNICODE ch
= *p
;
3675 PyObject
*repunicode
;
3679 Py_UNICODE
*collstart
;
3680 Py_UNICODE
*collend
;
3682 if (Py_UNICODE_ISSPACE(ch
)) {
3687 decimal
= Py_UNICODE_TODECIMAL(ch
);
3689 *output
++ = '0' + decimal
;
3693 if (0 < ch
&& ch
< 256) {
3694 *output
++ = (char)ch
;
3698 /* All other characters are considered unencodable */
/* NOTE(review): this scan tests !Py_UNICODE_ISSPACE and the raw truth value
   of Py_UNICODE_TODECIMAL, whereas the encodable branches above test
   Py_UNICODE_ISSPACE and store the TODECIMAL result in decimal; no
   increment of collend is visible in this listing either.  Confirm that
   the scan collects the intended run and terminates. */
3701 while (collend
< end
) {
3702 if ((0 < *collend
&& *collend
< 256) ||
3703 !Py_UNICODE_ISSPACE(*collend
) ||
3704 Py_UNICODE_TODECIMAL(*collend
))
3707 /* cache callback name lookup
3708 * (if not done yet, i.e. it's the first error) */
3709 if (known_errorHandler
==-1) {
3710 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3711 known_errorHandler
= 1;
3712 else if (!strcmp(errors
, "replace"))
3713 known_errorHandler
= 2;
3714 else if (!strcmp(errors
, "ignore"))
3715 known_errorHandler
= 3;
3716 else if (!strcmp(errors
, "xmlcharrefreplace"))
3717 known_errorHandler
= 4;
3719 known_errorHandler
= 0;
3721 switch (known_errorHandler
) {
3722 case 1: /* strict */
3723 raise_encode_exception(&exc
, encoding
, s
, length
, collstart
-s
, collend
-s
, reason
);
3725 case 2: /* replace */
3726 for (p
= collstart
; p
< collend
; ++p
)
3729 case 3: /* ignore */
3732 case 4: /* xmlcharrefreplace */
3733 /* generate replacement (temporarily (mis)uses p) */
/* NOTE(review): each reference is formatted straight into output and can be
   several chars long per input character; no space check is visible */
3734 for (p
= collstart
; p
< collend
; ++p
)
3735 output
+= sprintf(output
, "&#%d;", (int)*p
);
3739 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
3740 encoding
, reason
, s
, length
, &exc
,
3741 collstart
-s
, collend
-s
, &newpos
);
3742 if (repunicode
== NULL
)
3744 /* generate replacement */
/* the handler's replacement is classified with the same tests as ordinary
   input; the visible tail raises an encode exception after releasing
   repunicode, presumably for a replacement character that is itself not
   encodable (the else line is not visible here) */
3745 repsize
= PyUnicode_GET_SIZE(repunicode
);
3746 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
3747 Py_UNICODE ch
= *uni2
;
3748 if (Py_UNICODE_ISSPACE(ch
))
3751 decimal
= Py_UNICODE_TODECIMAL(ch
);
3753 *output
++ = '0' + decimal
;
3754 else if (0 < ch
&& ch
< 256)
3755 *output
++ = (char)ch
;
3757 Py_DECREF(repunicode
);
3758 raise_encode_exception(&exc
, encoding
,
3759 s
, length
, collstart
-s
, collend
-s
, reason
);
3765 Py_DECREF(repunicode
);
3768 /* 0-terminate the output string */
3771 Py_XDECREF(errorHandler
);
3776 Py_XDECREF(errorHandler
);
3780 /* --- Helpers ------------------------------------------------------------ */
/* Count the occurrences of substring in self between start and end.
   Negative start and end values have self->length added and end is compared
   against self->length (the clamping assignments are not visible here).
   After a match the scan position advances by substring->length, so
   counted matches do not overlap.
   An empty substring yields end - start + 1.
   NOTE(review): that expression is negative when start exceeds end by more
   than one; confirm whether callers can pass such a range. */
3783 int count(PyUnicodeObject
*self
,
3786 PyUnicodeObject
*substring
)
3791 start
+= self
->length
;
3794 if (end
> self
->length
)
3797 end
+= self
->length
;
3801 if (substring
->length
== 0)
3802 return (end
- start
+ 1);
/* last position at which a full match can still begin */
3804 end
-= substring
->length
;
3806 while (start
<= end
)
3807 if (Py_UNICODE_MATCH(self
, start
, substring
)) {
3809 start
+= substring
->length
;
/* Object-level wrapper around count(): both str and substr are coerced with
   PyUnicode_FromObject before the helper is called.  A failed coercion of
   substr has its own branch; the cleanup code is not visible here. */
3816 int PyUnicode_Count(PyObject
*str
,
3823 str
= PyUnicode_FromObject(str
);
3826 substr
= PyUnicode_FromObject(substr
);
3827 if (substr
== NULL
) {
3832 result
= count((PyUnicodeObject
*)str
,
3834 (PyUnicodeObject
*)substr
);
/* Search self for substring between start and end.
   Negative start and end values have self->length added and end is compared
   against self->length (the clamping assignments are not visible here).
   direction < 0 scans backwards from the last possible match position down
   to start; otherwise the scan runs forwards from start.
   An empty substring returns start for a forward search (direction > 0)
   and end otherwise.  The value returned for a match or a miss is not
   visible in this listing. */
3842 int findstring(PyUnicodeObject
*self
,
3843 PyUnicodeObject
*substring
,
3849 start
+= self
->length
;
3853 if (end
> self
->length
)
3856 end
+= self
->length
;
3860 if (substring
->length
== 0)
3861 return (direction
> 0) ? start
: end
;
/* last position at which a full match can still begin */
3863 end
-= substring
->length
;
3865 if (direction
< 0) {
3866 for (; end
>= start
; end
--)
3867 if (Py_UNICODE_MATCH(self
, end
, substring
))
3870 for (; start
<= end
; start
++)
3871 if (Py_UNICODE_MATCH(self
, start
, substring
))
/* Object-level wrapper around findstring(): str and substr are coerced with
   PyUnicode_FromObject, then start, end and direction are passed through
   unchanged.  The cleanup code is not visible here. */
3878 int PyUnicode_Find(PyObject
*str
,
3886 str
= PyUnicode_FromObject(str
);
3889 substr
= PyUnicode_FromObject(substr
);
3890 if (substr
== NULL
) {
3895 result
= findstring((PyUnicodeObject
*)str
,
3896 (PyUnicodeObject
*)substr
,
3897 start
, end
, direction
);
/* Test whether substring matches self at one end of the range start..end.
   After index normalisation end is moved back by substring->length; with
   direction > 0 the match is tested at that position (a suffix test),
   otherwise at start (a prefix test).  An empty substring is special-cased;
   the values returned are not visible in this listing. */
3904 int tailmatch(PyUnicodeObject
*self
,
3905 PyUnicodeObject
*substring
,
3911 start
+= self
->length
;
3915 if (substring
->length
== 0)
3918 if (end
> self
->length
)
3921 end
+= self
->length
;
3925 end
-= substring
->length
;
3929 if (direction
> 0) {
3930 if (Py_UNICODE_MATCH(self
, end
, substring
))
3933 if (Py_UNICODE_MATCH(self
, start
, substring
))
/* Object-level wrapper around tailmatch(): str and substr are coerced with
   PyUnicode_FromObject, then start, end and direction are passed through
   unchanged.  The cleanup code is not visible here. */
3940 int PyUnicode_Tailmatch(PyObject
*str
,
3948 str
= PyUnicode_FromObject(str
);
3951 substr
= PyUnicode_FromObject(substr
);
3952 if (substr
== NULL
) {
3957 result
= tailmatch((PyUnicodeObject
*)str
,
3958 (PyUnicodeObject
*)substr
,
3959 start
, end
, direction
);
/* Scan a Py_UNICODE buffer of explicit length for a character.  The loop is
   bounded by size only, so embedded NUL characters do not end the search. */
3966 const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
3970 /* like wcschr, but doesn't stop at NULL characters */
3972 while (size
-- > 0) {
3981 /* Apply the fixfct filter to the Unicode object self and return a
   reference to the modified object.
   The filter runs on a fresh copy of self, never on self itself.  When the
   filter reports no change and self is an exact unicode object, self is
   returned in place of the copy. */
3985 PyObject
*fixup(PyUnicodeObject
*self
,
3986 int (*fixfct
)(PyUnicodeObject
*s
))
3991 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
3995 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
3997 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
3998 /* fixfct should return TRUE if it modified the buffer. If
3999 FALSE, return a reference to the original buffer instead
4000 (to save space, not time) */
4003 return (PyObject
*) self
;
4005 return (PyObject
*) u
;
/* fixup() filter: computes Py_UNICODE_TOUPPER for the characters of self's
   buffer.  The store back into the buffer and the modified/unmodified
   return value are not visible in this listing. */
4009 int fixupper(PyUnicodeObject
*self
)
4011 int len
= self
->length
;
4012 Py_UNICODE
*s
= self
->str
;
4016 register Py_UNICODE ch
;
4018 ch
= Py_UNICODE_TOUPPER(*s
);
/* fixup() filter: computes Py_UNICODE_TOLOWER for the characters of self's
   buffer.  The store back into the buffer and the modified/unmodified
   return value are not visible in this listing. */
4030 int fixlower(PyUnicodeObject
*self
)
4032 int len
= self
->length
;
4033 Py_UNICODE
*s
= self
->str
;
4037 register Py_UNICODE ch
;
4039 ch
= Py_UNICODE_TOLOWER(*s
);
/* fixup() filter: swaps case in place -- an upper case character is
   replaced by its lower case form and a lower case character by its upper
   case form; characters that are neither are left alone. */
4051 int fixswapcase(PyUnicodeObject
*self
)
4053 int len
= self
->length
;
4054 Py_UNICODE
*s
= self
->str
;
4058 if (Py_UNICODE_ISUPPER(*s
)) {
4059 *s
= Py_UNICODE_TOLOWER(*s
);
4061 } else if (Py_UNICODE_ISLOWER(*s
)) {
4062 *s
= Py_UNICODE_TOUPPER(*s
);
/* fixup() filter for capitalize: contains one step that upper-cases a lower
   case character and one that lower-cases an upper case character.
   Presumably the first applies to the leading character and the second to
   the rest (the loop structure is not visible here -- confirm). */
4072 int fixcapitalize(PyUnicodeObject
*self
)
4074 int len
= self
->length
;
4075 Py_UNICODE
*s
= self
->str
;
4080 if (Py_UNICODE_ISLOWER(*s
)) {
4081 *s
= Py_UNICODE_TOUPPER(*s
);
4086 if (Py_UNICODE_ISUPPER(*s
)) {
4087 *s
= Py_UNICODE_TOLOWER(*s
);
/* fixup() filter for title casing.  A character that follows a cased
   character is lower-cased; any other character is title-cased.  A
   character counts as cased when it is lower case, upper case or title
   case, judged on its value before conversion. */
4096 int fixtitle(PyUnicodeObject
*self
)
4098 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4099 register Py_UNICODE
*e
;
4100 int previous_is_cased
;
4102 /* Shortcut for single character strings */
4103 if (PyUnicode_GET_SIZE(self
) == 1) {
4104 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
4113 e
= p
+ PyUnicode_GET_SIZE(self
);
4114 previous_is_cased
= 0;
4115 for (; p
< e
; p
++) {
4116 register const Py_UNICODE ch
= *p
;
4118 if (previous_is_cased
)
4119 *p
= Py_UNICODE_TOLOWER(ch
);
4121 *p
= Py_UNICODE_TOTITLE(ch
);
/* remember whether this character was cased, for the next iteration */
4123 if (Py_UNICODE_ISLOWER(ch
) ||
4124 Py_UNICODE_ISUPPER(ch
) ||
4125 Py_UNICODE_ISTITLE(ch
))
4126 previous_is_cased
= 1;
4128 previous_is_cased
= 0;
/* Join the items of seq into one Unicode object, separated by separator.
   A NULL separator has its own branch; sep starts out pointing at a single
   blank.  An empty sequence gives an empty Unicode object and a singleton
   sequence holding an exact Unicode object gives that object.  Otherwise
   every item must be a string or Unicode object (TypeError if not), is
   coerced with PyUnicode_FromObject and copied into a result buffer that
   starts at 100 characters and doubles until large enough.  Sizes beyond
   INT_MAX or size_t wrap-around end in an OverflowError.
   The sequence length is re-read after every coercion because the
   coercion may run code that changes the sequence. */
4134 PyUnicode_Join(PyObject
*separator
, PyObject
*seq
)
4136 PyObject
*internal_separator
= NULL
;
4137 const Py_UNICODE blank
= ' ';
4138 const Py_UNICODE
*sep
= &blank
;
4140 PyUnicodeObject
*res
= NULL
; /* the result */
4141 size_t res_alloc
= 100; /* # allocated Py_UNICODE units for string in res
                           (it is passed to _PyUnicode_New as a length) */
4142 size_t res_used
; /* # used Py_UNICODE units */
4143 Py_UNICODE
*res_p
; /* pointer to the first free slot in res's string area */
4144 PyObject
*fseq
; /* PySequence_Fast(seq) */
4145 int seqlen
; /* len(fseq) -- number of items in sequence */
4149 fseq
= PySequence_Fast(seq
, "");
4154 /* Grrrr. A codec may be invoked to convert str objects to
4155 * Unicode, and so it's possible to call back into Python code
4156 * during PyUnicode_FromObject(), and so it's possible for a sick
4157 * codec to change the size of fseq (if seq is a list). Therefore
4158 * we have to keep refetching the size -- can't assume seqlen
4161 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
4162 /* If empty sequence, return u"". */
4164 res
= _PyUnicode_New(0); /* empty sequence; return u"" */
4167 /* If singleton sequence with an exact Unicode, return that. */
4169 item
= PySequence_Fast_GET_ITEM(fseq
, 0);
4170 if (PyUnicode_CheckExact(item
)) {
4172 res
= (PyUnicodeObject
*)item
;
4177 /* At least two items to join, or one that isn't exact Unicode. */
4179 /* Set up sep and seplen -- they're needed. */
4180 if (separator
== NULL
) {
4185 internal_separator
= PyUnicode_FromObject(separator
);
4186 if (internal_separator
== NULL
)
4188 sep
= PyUnicode_AS_UNICODE(internal_separator
);
4189 seplen
= PyUnicode_GET_SIZE(internal_separator
);
4190 /* In case PyUnicode_FromObject() mutated seq. */
4191 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
4196 res
= _PyUnicode_New((int)res_alloc
);
4199 res_p
= PyUnicode_AS_UNICODE(res
);
4202 for (i
= 0; i
< seqlen
; ++i
) {
4204 size_t new_res_used
;
4206 item
= PySequence_Fast_GET_ITEM(fseq
, i
);
4207 /* Convert item to Unicode. */
4208 if (! PyUnicode_Check(item
) && ! PyString_Check(item
)) {
4209 PyErr_Format(PyExc_TypeError
,
4210 "sequence item %i: expected string or Unicode,"
4212 i
, item
->ob_type
->tp_name
);
4215 item
= PyUnicode_FromObject(item
);
4218 /* We own a reference to item from here on. */
4220 /* In case PyUnicode_FromObject() mutated seq. */
4221 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
4223 /* Make sure we have enough space for the separator and the item. */
4224 itemlen
= PyUnicode_GET_SIZE(item
);
/* a sum smaller than its left operand means the size_t addition wrapped */
4225 new_res_used
= res_used
+ itemlen
;
4226 if (new_res_used
< res_used
|| new_res_used
> INT_MAX
)
/* a separator follows every item except the last */
4228 if (i
< seqlen
- 1) {
4229 new_res_used
+= seplen
;
4230 if (new_res_used
< res_used
|| new_res_used
> INT_MAX
)
4233 if (new_res_used
> res_alloc
) {
4234 /* double allocated size until it's big enough */
4236 size_t oldsize
= res_alloc
;
4237 res_alloc
+= res_alloc
;
4238 if (res_alloc
< oldsize
|| res_alloc
> INT_MAX
)
4240 } while (new_res_used
> res_alloc
);
4241 if (_PyUnicode_Resize(&res
, (int)res_alloc
) < 0) {
/* re-derive the write pointer from the buffer address after the resize */
4245 res_p
= PyUnicode_AS_UNICODE(res
) + res_used
;
4248 /* Copy item, and maybe the separator. */
4249 Py_UNICODE_COPY(res_p
, PyUnicode_AS_UNICODE(item
), (int)itemlen
);
4251 if (i
< seqlen
- 1) {
4252 Py_UNICODE_COPY(res_p
, sep
, (int)seplen
);
4256 res_used
= new_res_used
;
4259 /* Shrink res to match the used area; this probably can't fail,
4260 * but it's cheap to check.
4262 if (_PyUnicode_Resize(&res
, (int)res_used
) < 0)
4266 Py_XDECREF(internal_separator
);
4268 return (PyObject
*)res
;
4271 PyErr_SetString(PyExc_OverflowError
,
4272 "join() is too long for a Python string");
4277 Py_XDECREF(internal_separator
);
/* Build a padded copy of self: left fill characters, then the contents of
   self, then right fill characters.  The result is allocated with length
   left + self->length + right.  The case where both pad widths are zero and
   self is an exact unicode object is special-cased (body not visible here;
   presumably self is returned as is -- confirm). */
4284 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
4296 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
4301 u
= _PyUnicode_New(left
+ self
->length
+ right
);
4304 Py_UNICODE_FILL(u
->str
, fill
, left
);
4305 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
4307 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
/* Helper macros for the split functions.  Both build a Unicode object from
   the slice data[left:right] into the local variable str and add it to the
   local variable list: SPLIT_APPEND appends at the end, SPLIT_INSERT
   inserts at index 0.  Several continuation lines of both macros are not
   visible in this listing. */
4313 #define SPLIT_APPEND(data, left, right) \
4314 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
4317 if (PyList_Append(list, str)) { \
4324 #define SPLIT_INSERT(data, left, right) \
4325 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
4328 if (PyList_Insert(list, 0, str)) { \
/* Split self at runs of whitespace, scanning left to right, and append the
   pieces to list.  Whitespace is skipped, then a run of non-whitespace is
   scanned and appended as self->str[j:i].  maxcount is decremented before
   each such append and a separate path is taken once it is exhausted; the
   remainder self->str[j:len] is appended in one piece.  The loop increments
   and return statements are not visible in this listing. */
4336 PyObject
*split_whitespace(PyUnicodeObject
*self
,
4342 int len
= self
->length
;
4345 for (i
= j
= 0; i
< len
; ) {
4347 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
4350 while (i
< len
&& !Py_UNICODE_ISSPACE(self
->str
[i
]))
4353 if (maxcount
-- <= 0)
4355 SPLIT_APPEND(self
->str
, j
, i
);
4356 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
4362 SPLIT_APPEND(self
->str
, j
, len
);
/* Split string into a list of lines.  The argument is coerced with
   PyUnicode_FromObject; the scan advances to the next line break character,
   treats a carriage return followed by another character (presumably a
   line feed -- the second half of the test is not visible) as one break,
   and appends data[j:eol].  Text after the last break is appended as
   data[j:len]. */
4371 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
4381 string
= PyUnicode_FromObject(string
);
4384 data
= PyUnicode_AS_UNICODE(string
);
4385 len
= PyUnicode_GET_SIZE(string
);
4387 list
= PyList_New(0);
4391 for (i
= j
= 0; i
< len
; ) {
4394 /* Find a line and append it */
4395 while (i
< len
&& !Py_UNICODE_ISLINEBREAK(data
[i
]))
4398 /* Skip the line break reading CRLF as one line break */
4401 if (data
[i
] == '\r' && i
+ 1 < len
&&
4409 SPLIT_APPEND(data
, j
, eol
);
4413 SPLIT_APPEND(data
, j
, len
);
/* Split self at every occurrence of the single character ch, scanning left
   to right.  Each hit appends self->str[j:i]; maxcount is decremented
   before each append and a separate path is taken once it is exhausted.
   The remainder self->str[j:len] is appended last. */
4426 PyObject
*split_char(PyUnicodeObject
*self
,
4433 int len
= self
->length
;
4436 for (i
= j
= 0; i
< len
; ) {
4437 if (self
->str
[i
] == ch
) {
4438 if (maxcount
-- <= 0)
4440 SPLIT_APPEND(self
->str
, j
, i
);
4446 SPLIT_APPEND(self
->str
, j
, len
);
/* Split self at every occurrence of substring, scanning left to right.
   Only positions up to len - sublen are tested, since a match cannot start
   later.  Each hit appends self->str[j:i]; maxcount is decremented before
   each append.  The remainder self->str[j:len] is appended last. */
4456 PyObject
*split_substring(PyUnicodeObject
*self
,
4458 PyUnicodeObject
*substring
,
4463 int len
= self
->length
;
4464 int sublen
= substring
->length
;
4467 for (i
= j
= 0; i
<= len
- sublen
; ) {
4468 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
4469 if (maxcount
-- <= 0)
4471 SPLIT_APPEND(self
->str
, j
, i
);
4477 SPLIT_APPEND(self
->str
, j
, len
);
/* Split self at runs of whitespace, scanning right to left from the last
   character.  Each word found is inserted at the front of list as
   self->str[i+1:j+1], so the list ends up in left-to-right order.  maxcount
   is decremented before each insert; what is left, self->str[0:j+1], is
   inserted last. */
4487 PyObject
*rsplit_whitespace(PyUnicodeObject
*self
,
4493 int len
= self
->length
;
4496 for (i
= j
= len
- 1; i
>= 0; ) {
4498 while (i
>= 0 && Py_UNICODE_ISSPACE(self
->str
[i
]))
4501 while (i
>= 0 && !Py_UNICODE_ISSPACE(self
->str
[i
]))
4504 if (maxcount
-- <= 0)
4506 SPLIT_INSERT(self
->str
, i
+ 1, j
+ 1);
4507 while (i
>= 0 && Py_UNICODE_ISSPACE(self
->str
[i
]))
4513 SPLIT_INSERT(self
->str
, 0, j
+ 1);
/* Split self at every occurrence of the single character ch, scanning right
   to left.  Each hit inserts self->str[i+1:j+1] at the front of list;
   maxcount is decremented before each insert.  What is left,
   self->str[0:j+1], is inserted last. */
4523 PyObject
*rsplit_char(PyUnicodeObject
*self
,
4530 int len
= self
->length
;
4533 for (i
= j
= len
- 1; i
>= 0; ) {
4534 if (self
->str
[i
] == ch
) {
4535 if (maxcount
-- <= 0)
4537 SPLIT_INSERT(self
->str
, i
+ 1, j
+ 1);
4543 SPLIT_INSERT(self
->str
, 0, j
+ 1);
/* Split self at every occurrence of substring, scanning right to left.
   The scan starts at len - sublen, the last position where a match can
   begin; j marks the end of the piece currently being collected.  Each hit
   inserts self->str[i+sublen:j] at the front of list; maxcount is
   decremented before each insert.  What is left, self->str[0:j], is
   inserted last. */
4553 PyObject
*rsplit_substring(PyUnicodeObject
*self
,
4555 PyUnicodeObject
*substring
,
4560 int len
= self
->length
;
4561 int sublen
= substring
->length
;
4564 for (i
= len
- sublen
, j
= len
; i
>= 0; ) {
4565 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
4566 if (maxcount
-- <= 0)
4568 SPLIT_INSERT(self
->str
, i
+ sublen
, j
);
4575 SPLIT_INSERT(self
->str
, 0, j
);
/* Dispatch a left-to-right split on the kind of separator: NULL means split
   on whitespace, a one-character separator uses split_char, an empty
   separator sets ValueError, anything longer uses split_substring.  The
   pieces are collected in a fresh list. */
4588 PyObject
*split(PyUnicodeObject
*self
,
4589 PyUnicodeObject
*substring
,
4597 list
= PyList_New(0);
4601 if (substring
== NULL
)
4602 return split_whitespace(self
,list
,maxcount
);
4604 else if (substring
->length
== 1)
4605 return split_char(self
,list
,substring
->str
[0],maxcount
);
4607 else if (substring
->length
== 0) {
4609 PyErr_SetString(PyExc_ValueError
, "empty separator");
4613 return split_substring(self
,list
,substring
,maxcount
);
/* Dispatch a right-to-left split on the kind of separator: NULL means split
   on whitespace, a one-character separator uses rsplit_char, an empty
   separator sets ValueError, anything longer uses rsplit_substring.  The
   pieces are collected in a fresh list. */
4617 PyObject
*rsplit(PyUnicodeObject
*self
,
4618 PyUnicodeObject
*substring
,
4626 list
= PyList_New(0);
4630 if (substring
== NULL
)
4631 return rsplit_whitespace(self
,list
,maxcount
);
4633 else if (substring
->length
== 1)
4634 return rsplit_char(self
,list
,substring
->str
[0],maxcount
);
4636 else if (substring
->length
== 0) {
4638 PyErr_SetString(PyExc_ValueError
, "empty separator");
4642 return rsplit_substring(self
,list
,substring
,maxcount
);
/* Return a version of self in which occurrences of str1 are replaced by
   str2.
   Fast path: when both strings are one character long, self is copied and
   the matching characters are overwritten in the copy; if the character
   does not occur and self is an exact unicode object, the original is
   returned instead.
   General path: count() gives the number n of matches and the result is
   sized self->length + n * (str2->length - str1->length).  For a non-empty
   str1 matched segments are replaced by str2 and everything else is copied
   character by character.  The remaining branch (empty str1) copies str2
   and single characters of self alternately; its loop headers are not
   visible in this listing. */
4646 PyObject
*replace(PyUnicodeObject
*self
,
4647 PyUnicodeObject
*str1
,
4648 PyUnicodeObject
*str2
,
4656 if (str1
->length
== 1 && str2
->length
== 1) {
4659 /* replace characters */
4660 if (!findchar(self
->str
, self
->length
, str1
->str
[0]) &&
4661 PyUnicode_CheckExact(self
)) {
4662 /* nothing to replace, return original string */
4666 Py_UNICODE u1
= str1
->str
[0];
4667 Py_UNICODE u2
= str2
->str
[0];
4669 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(
4674 Py_UNICODE_COPY(u
->str
, self
->str
,
4676 for (i
= 0; i
< u
->length
; i
++)
4677 if (u
->str
[i
] == u1
) {
4689 /* replace strings */
4690 n
= count(self
, 0, self
->length
, str1
);
4694 /* nothing to replace, return original string */
4695 if (PyUnicode_CheckExact(self
)) {
4700 u
= (PyUnicodeObject
*)
4701 PyUnicode_FromUnicode(self
->str
, self
->length
);
4705 self
->length
+ n
* (str2
->length
- str1
->length
));
4709 if (str1
->length
> 0) {
4710 while (i
<= self
->length
- str1
->length
)
4711 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
4712 /* replace string segment */
4713 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
4717 /* copy remaining part */
4718 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
4722 *p
++ = self
->str
[i
++];
4725 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
4729 *p
++ = self
->str
[i
++];
4731 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
4737 return (PyObject
*) u
;
4740 /* --- Unicode Object Methods --------------------------------------------- */
/* S.title(): docstring plus the method body, which runs the fixtitle filter
   over self through fixup(). */
4742 PyDoc_STRVAR(title__doc__
,
4743 "S.title() -> unicode\n\
4745 Return a titlecased version of S, i.e. words start with title case\n\
4746 characters, all remaining cased characters have lower case.");
4749 unicode_title(PyUnicodeObject
*self
)
4751 return fixup(self
, fixtitle
);
/* S.capitalize(): docstring plus the method body, which runs the
   fixcapitalize filter over self through fixup().
   NOTE(review): the closing line of the docstring literal is missing from
   this listing. */
4754 PyDoc_STRVAR(capitalize__doc__
,
4755 "S.capitalize() -> unicode\n\
4757 Return a capitalized version of S, i.e. make the first character\n\
4761 unicode_capitalize(PyUnicodeObject
*self
)
4763 return fixup(self
, fixcapitalize
);
/* S.capwords(): split self on whitespace, run each word through fixup()
   (the filter argument is not visible in this listing), store the results
   back into the list and join the list with a NULL separator. */
4767 PyDoc_STRVAR(capwords__doc__
,
4768 "S.capwords() -> unicode\n\
4770 Apply .capitalize() to all words in S and return the result with\n\
4771 normalized whitespace (all whitespace strings are replaced by ' ').");
4774 unicode_capwords(PyUnicodeObject
*self
)
4780 /* Split into words */
4781 list
= split(self
, NULL
, -1);
4785 /* Capitalize each word */
4786 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
4787 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
/* release the old word before the slot is overwritten with the new one */
4791 Py_DECREF(PyList_GET_ITEM(list
, i
));
4792 PyList_SET_ITEM(list
, i
, item
);
4795 /* Join the words to form a new string */
4796 item
= PyUnicode_Join(NULL
, list
);
4800 return (PyObject
*)item
;
4804 /* Argument converter.  Coerces obj to a single unicode character.
   Used with the O& format unit by unicode_center.  obj is coerced with
   PyUnicode_FromObject; a failed coercion or a result whose length is not
   exactly one sets TypeError.  On success the character is stored through
   addr, which is treated as a pointer to Py_UNICODE. */
4807 convert_uc(PyObject
*obj
, void *addr
)
4809 Py_UNICODE
*fillcharloc
= (Py_UNICODE
*)addr
;
4813 uniobj
= PyUnicode_FromObject(obj
);
4814 if (uniobj
== NULL
) {
4815 PyErr_SetString(PyExc_TypeError
,
4816 "The fill character cannot be converted to Unicode");
4819 if (PyUnicode_GET_SIZE(uniobj
) != 1) {
4820 PyErr_SetString(PyExc_TypeError
,
4821 "The fill character must be exactly one character long");
4825 unistr
= PyUnicode_AS_UNICODE(uniobj
);
4826 *fillcharloc
= unistr
[0];
/* S.center(width[, fillchar]): docstring plus method body.  The arguments
   are an int width and an optional fill character converted by convert_uc
   (a blank by default).  When self is an exact unicode object already at
   least width long, self is returned; otherwise the total margin is split
   into a left and a right part and handed to pad(). */
4831 PyDoc_STRVAR(center__doc__
,
4832 "S.center(width[, fillchar]) -> unicode\n\
4834 Return S centered in a Unicode string of length width. Padding is\n\
4835 done using the specified fill character (default is a space)");
4838 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
4842 Py_UNICODE fillchar
= ' ';
4844 if (!PyArg_ParseTuple(args
, "i|O&:center", &width
, convert_uc
, &fillchar
))
4847 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
4849 return (PyObject
*) self
;
4852 marg
= width
- self
->length
;
/* half the margin goes left; when the margin is odd the extra character
   goes left only if width is odd too, otherwise it goes right */
4853 left
= marg
/ 2 + (marg
& width
& 1);
4855 return (PyObject
*) pad(self
, left
, marg
- left
, fillchar
);
4860 /* This code should go into some future Unicode collation support
4861 module. The basic comparison should compare ordinals on a naive
4862 basis (this is what Java does and thus JPython too). */
4864 /* speedy UTF-16 code point order comparison */
4866 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
/* Correction table indexed by the code unit shifted right by 11 bits.  Only
   the last five entries are non-zero: entry 27 (code units 0xD800-0xDFFF)
   adds 0x2000 and entries 28-31 (code units 0xE000-0xFFFF) subtract 0x800.
   The table is consulted only for code units above (1<<11) * 26. */
4868 static short utf16Fixup
[32] =
4870 0, 0, 0, 0, 0, 0, 0, 0,
4871 0, 0, 0, 0, 0, 0, 0, 0,
4872 0, 0, 0, 0, 0, 0, 0, 0,
4873 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
/* Compare two unicode objects code unit by code unit after applying the
   correction above.  The first difference decides the result (-1 or 1);
   if one string runs out first, the lengths decide: -1 if str1 is shorter,
   1 if it is longer, 0 if both have the same length. */
4877 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
4881 Py_UNICODE
*s1
= str1
->str
;
4882 Py_UNICODE
*s2
= str2
->str
;
4884 len1
= str1
->length
;
4885 len2
= str2
->length
;
4887 while (len1
> 0 && len2
> 0) {
4893 if (c1
> (1<<11) * 26)
4894 c1
+= utf16Fixup
[c1
>>11];
4895 if (c2
> (1<<11) * 26)
4896 c2
+= utf16Fixup
[c2
>>11];
4897 /* now c1 and c2 are in UTF-32-compatible order */
4900 return (c1
< c2
) ? -1 : 1;
4905 return (len1
< len2
) ? -1 : (len1
!= len2
);
4911 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
4913 register int len1
, len2
;
4915 Py_UNICODE
*s1
= str1
->str
;
4916 Py_UNICODE
*s2
= str2
->str
;
4918 len1
= str1
->length
;
4919 len2
= str2
->length
;
4921 while (len1
> 0 && len2
> 0) {
4928 return (c1
< c2
) ? -1 : 1;
4933 return (len1
< len2
) ? -1 : (len1
!= len2
);
4938 int PyUnicode_Compare(PyObject
*left
,
4941 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
4944 /* Coerce the two arguments */
4945 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
4948 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
4952 /* Shortcut for empty or interned objects */
4959 result
= unicode_compare(u
, v
);
4971 int PyUnicode_Contains(PyObject
*container
,
4974 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
4976 register const Py_UNICODE
*lhs
, *end
, *rhs
;
4978 /* Coerce the two arguments */
4979 v
= (PyUnicodeObject
*)PyUnicode_FromObject(element
);
4981 PyErr_SetString(PyExc_TypeError
,
4982 "'in <string>' requires string as left operand");
4985 u
= (PyUnicodeObject
*)PyUnicode_FromObject(container
);
4989 size
= PyUnicode_GET_SIZE(v
);
4990 rhs
= PyUnicode_AS_UNICODE(v
);
4991 lhs
= PyUnicode_AS_UNICODE(u
);
4995 end
= lhs
+ PyUnicode_GET_SIZE(u
);
4997 if (*lhs
++ == *rhs
) {
5004 end
= lhs
+ (PyUnicode_GET_SIZE(u
) - size
);
5005 while (lhs
<= end
) {
5006 if (memcmp(lhs
++, rhs
, size
* sizeof(Py_UNICODE
)) == 0) {
5023 /* Concat to string or Unicode object giving a new Unicode object. */
5025 PyObject
*PyUnicode_Concat(PyObject
*left
,
5028 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
5030 /* Coerce the two arguments */
5031 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
5034 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
5039 if (v
== unicode_empty
) {
5041 return (PyObject
*)u
;
5043 if (u
== unicode_empty
) {
5045 return (PyObject
*)v
;
5048 /* Concat the two Unicode strings */
5049 w
= _PyUnicode_New(u
->length
+ v
->length
);
5052 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
5053 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
5057 return (PyObject
*)w
;
5065 PyDoc_STRVAR(count__doc__
,
5066 "S.count(sub[, start[, end]]) -> int\n\
5068 Return the number of occurrences of substring sub in Unicode string\n\
5069 S[start:end]. Optional arguments start and end are\n\
5070 interpreted as in slice notation.");
5073 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
5075 PyUnicodeObject
*substring
;
5080 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
5081 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
5084 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
5085 (PyObject
*)substring
);
5086 if (substring
== NULL
)
5090 start
+= self
->length
;
5093 if (end
> self
->length
)
5096 end
+= self
->length
;
5100 result
= PyInt_FromLong((long) count(self
, start
, end
, substring
));
5102 Py_DECREF(substring
);
5106 PyDoc_STRVAR(encode__doc__
,
5107 "S.encode([encoding[,errors]]) -> string or unicode\n\
5109 Encodes S using the codec registered for encoding. encoding defaults\n\
5110 to the default encoding. errors may be given to set a different error\n\
5111 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5112 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5113 'xmlcharrefreplace' as well as any other name registered with\n\
5114 codecs.register_error that can handle UnicodeEncodeErrors.");
5117 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
5119 char *encoding
= NULL
;
5120 char *errors
= NULL
;
5123 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
5125 v
= PyUnicode_AsEncodedObject((PyObject
*)self
, encoding
, errors
);
5128 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
5129 PyErr_Format(PyExc_TypeError
,
5130 "encoder did not return a string/unicode object "
5132 v
->ob_type
->tp_name
);
5142 PyDoc_STRVAR(decode__doc__
,
5143 "S.decode([encoding[,errors]]) -> string or unicode\n\
5145 Decodes S using the codec registered for encoding. encoding defaults\n\
5146 to the default encoding. errors may be given to set a different error\n\
5147 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5148 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5149 as well as any other name registerd with codecs.register_error that is\n\
5150 able to handle UnicodeDecodeErrors.");
5153 unicode_decode(PyUnicodeObject
*self
, PyObject
*args
)
5155 char *encoding
= NULL
;
5156 char *errors
= NULL
;
5159 if (!PyArg_ParseTuple(args
, "|ss:decode", &encoding
, &errors
))
5161 v
= PyUnicode_AsDecodedObject((PyObject
*)self
, encoding
, errors
);
5164 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
5165 PyErr_Format(PyExc_TypeError
,
5166 "decoder did not return a string/unicode object "
5168 v
->ob_type
->tp_name
);
5178 PyDoc_STRVAR(expandtabs__doc__
,
5179 "S.expandtabs([tabsize]) -> unicode\n\
5181 Return a copy of S where all tab characters are expanded using spaces.\n\
5182 If tabsize is not given, a tab size of 8 characters is assumed.");
5185 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
5194 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
5197 /* First pass: determine size of output string */
5199 e
= self
->str
+ self
->length
;
5200 for (p
= self
->str
; p
< e
; p
++)
5203 j
+= tabsize
- (j
% tabsize
);
5207 if (*p
== '\n' || *p
== '\r') {
5213 /* Second pass: create output string and fill it */
5214 u
= _PyUnicode_New(i
+ j
);
5221 for (p
= self
->str
; p
< e
; p
++)
5224 i
= tabsize
- (j
% tabsize
);
5233 if (*p
== '\n' || *p
== '\r')
5237 return (PyObject
*) u
;
5240 PyDoc_STRVAR(find__doc__
,
5241 "S.find(sub [,start [,end]]) -> int\n\
5243 Return the lowest index in S where substring sub is found,\n\
5244 such that sub is contained within s[start,end]. Optional\n\
5245 arguments start and end are interpreted as in slice notation.\n\
5247 Return -1 on failure.");
5250 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
5252 PyUnicodeObject
*substring
;
5257 if (!PyArg_ParseTuple(args
, "O|O&O&:find", &substring
,
5258 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
5260 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
5261 (PyObject
*)substring
);
5262 if (substring
== NULL
)
5265 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, 1));
5267 Py_DECREF(substring
);
5272 unicode_getitem(PyUnicodeObject
*self
, int index
)
5274 if (index
< 0 || index
>= self
->length
) {
5275 PyErr_SetString(PyExc_IndexError
, "string index out of range");
5279 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
5283 unicode_hash(PyUnicodeObject
*self
)
5285 /* Since Unicode objects compare equal to their ASCII string
5286 counterparts, they should use the individual character values
5287 as basis for their hash value. This is needed to assure that
5288 strings and Unicode objects behave in the same way as
5292 register Py_UNICODE
*p
;
5295 if (self
->hash
!= -1)
5297 len
= PyUnicode_GET_SIZE(self
);
5298 p
= PyUnicode_AS_UNICODE(self
);
5301 x
= (1000003*x
) ^ *p
++;
5302 x
^= PyUnicode_GET_SIZE(self
);
5309 PyDoc_STRVAR(index__doc__
,
5310 "S.index(sub [,start [,end]]) -> int\n\
5312 Like S.find() but raise ValueError when the substring is not found.");
5315 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
5318 PyUnicodeObject
*substring
;
5322 if (!PyArg_ParseTuple(args
, "O|O&O&:index", &substring
,
5323 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
5326 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
5327 (PyObject
*)substring
);
5328 if (substring
== NULL
)
5331 result
= findstring(self
, substring
, start
, end
, 1);
5333 Py_DECREF(substring
);
5335 PyErr_SetString(PyExc_ValueError
, "substring not found");
5338 return PyInt_FromLong(result
);
5341 PyDoc_STRVAR(islower__doc__
,
5342 "S.islower() -> bool\n\
5344 Return True if all cased characters in S are lowercase and there is\n\
5345 at least one cased character in S, False otherwise.");
5348 unicode_islower(PyUnicodeObject
*self
)
5350 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5351 register const Py_UNICODE
*e
;
5354 /* Shortcut for single character strings */
5355 if (PyUnicode_GET_SIZE(self
) == 1)
5356 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p
));
5358 /* Special case for empty strings */
5359 if (PyString_GET_SIZE(self
) == 0)
5360 return PyBool_FromLong(0);
5362 e
= p
+ PyUnicode_GET_SIZE(self
);
5364 for (; p
< e
; p
++) {
5365 register const Py_UNICODE ch
= *p
;
5367 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
5368 return PyBool_FromLong(0);
5369 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
5372 return PyBool_FromLong(cased
);
5375 PyDoc_STRVAR(isupper__doc__
,
5376 "S.isupper() -> bool\n\
5378 Return True if all cased characters in S are uppercase and there is\n\
5379 at least one cased character in S, False otherwise.");
5382 unicode_isupper(PyUnicodeObject
*self
)
5384 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5385 register const Py_UNICODE
*e
;
5388 /* Shortcut for single character strings */
5389 if (PyUnicode_GET_SIZE(self
) == 1)
5390 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
5392 /* Special case for empty strings */
5393 if (PyString_GET_SIZE(self
) == 0)
5394 return PyBool_FromLong(0);
5396 e
= p
+ PyUnicode_GET_SIZE(self
);
5398 for (; p
< e
; p
++) {
5399 register const Py_UNICODE ch
= *p
;
5401 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
5402 return PyBool_FromLong(0);
5403 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
5406 return PyBool_FromLong(cased
);
5409 PyDoc_STRVAR(istitle__doc__
,
5410 "S.istitle() -> bool\n\
5412 Return True if S is a titlecased string and there is at least one\n\
5413 character in S, i.e. upper- and titlecase characters may only\n\
5414 follow uncased characters and lowercase characters only cased ones.\n\
5415 Return False otherwise.");
5418 unicode_istitle(PyUnicodeObject
*self
)
5420 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5421 register const Py_UNICODE
*e
;
5422 int cased
, previous_is_cased
;
5424 /* Shortcut for single character strings */
5425 if (PyUnicode_GET_SIZE(self
) == 1)
5426 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
5427 (Py_UNICODE_ISUPPER(*p
) != 0));
5429 /* Special case for empty strings */
5430 if (PyString_GET_SIZE(self
) == 0)
5431 return PyBool_FromLong(0);
5433 e
= p
+ PyUnicode_GET_SIZE(self
);
5435 previous_is_cased
= 0;
5436 for (; p
< e
; p
++) {
5437 register const Py_UNICODE ch
= *p
;
5439 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
5440 if (previous_is_cased
)
5441 return PyBool_FromLong(0);
5442 previous_is_cased
= 1;
5445 else if (Py_UNICODE_ISLOWER(ch
)) {
5446 if (!previous_is_cased
)
5447 return PyBool_FromLong(0);
5448 previous_is_cased
= 1;
5452 previous_is_cased
= 0;
5454 return PyBool_FromLong(cased
);
5457 PyDoc_STRVAR(isspace__doc__
,
5458 "S.isspace() -> bool\n\
5460 Return True if all characters in S are whitespace\n\
5461 and there is at least one character in S, False otherwise.");
5464 unicode_isspace(PyUnicodeObject
*self
)
5466 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5467 register const Py_UNICODE
*e
;
5469 /* Shortcut for single character strings */
5470 if (PyUnicode_GET_SIZE(self
) == 1 &&
5471 Py_UNICODE_ISSPACE(*p
))
5472 return PyBool_FromLong(1);
5474 /* Special case for empty strings */
5475 if (PyString_GET_SIZE(self
) == 0)
5476 return PyBool_FromLong(0);
5478 e
= p
+ PyUnicode_GET_SIZE(self
);
5479 for (; p
< e
; p
++) {
5480 if (!Py_UNICODE_ISSPACE(*p
))
5481 return PyBool_FromLong(0);
5483 return PyBool_FromLong(1);
5486 PyDoc_STRVAR(isalpha__doc__
,
5487 "S.isalpha() -> bool\n\
5489 Return True if all characters in S are alphabetic\n\
5490 and there is at least one character in S, False otherwise.");
5493 unicode_isalpha(PyUnicodeObject
*self
)
5495 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5496 register const Py_UNICODE
*e
;
5498 /* Shortcut for single character strings */
5499 if (PyUnicode_GET_SIZE(self
) == 1 &&
5500 Py_UNICODE_ISALPHA(*p
))
5501 return PyBool_FromLong(1);
5503 /* Special case for empty strings */
5504 if (PyString_GET_SIZE(self
) == 0)
5505 return PyBool_FromLong(0);
5507 e
= p
+ PyUnicode_GET_SIZE(self
);
5508 for (; p
< e
; p
++) {
5509 if (!Py_UNICODE_ISALPHA(*p
))
5510 return PyBool_FromLong(0);
5512 return PyBool_FromLong(1);
5515 PyDoc_STRVAR(isalnum__doc__
,
5516 "S.isalnum() -> bool\n\
5518 Return True if all characters in S are alphanumeric\n\
5519 and there is at least one character in S, False otherwise.");
5522 unicode_isalnum(PyUnicodeObject
*self
)
5524 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5525 register const Py_UNICODE
*e
;
5527 /* Shortcut for single character strings */
5528 if (PyUnicode_GET_SIZE(self
) == 1 &&
5529 Py_UNICODE_ISALNUM(*p
))
5530 return PyBool_FromLong(1);
5532 /* Special case for empty strings */
5533 if (PyString_GET_SIZE(self
) == 0)
5534 return PyBool_FromLong(0);
5536 e
= p
+ PyUnicode_GET_SIZE(self
);
5537 for (; p
< e
; p
++) {
5538 if (!Py_UNICODE_ISALNUM(*p
))
5539 return PyBool_FromLong(0);
5541 return PyBool_FromLong(1);
5544 PyDoc_STRVAR(isdecimal__doc__
,
5545 "S.isdecimal() -> bool\n\
5547 Return True if there are only decimal characters in S,\n\
5551 unicode_isdecimal(PyUnicodeObject
*self
)
5553 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5554 register const Py_UNICODE
*e
;
5556 /* Shortcut for single character strings */
5557 if (PyUnicode_GET_SIZE(self
) == 1 &&
5558 Py_UNICODE_ISDECIMAL(*p
))
5559 return PyBool_FromLong(1);
5561 /* Special case for empty strings */
5562 if (PyString_GET_SIZE(self
) == 0)
5563 return PyBool_FromLong(0);
5565 e
= p
+ PyUnicode_GET_SIZE(self
);
5566 for (; p
< e
; p
++) {
5567 if (!Py_UNICODE_ISDECIMAL(*p
))
5568 return PyBool_FromLong(0);
5570 return PyBool_FromLong(1);
5573 PyDoc_STRVAR(isdigit__doc__
,
5574 "S.isdigit() -> bool\n\
5576 Return True if all characters in S are digits\n\
5577 and there is at least one character in S, False otherwise.");
5580 unicode_isdigit(PyUnicodeObject
*self
)
5582 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5583 register const Py_UNICODE
*e
;
5585 /* Shortcut for single character strings */
5586 if (PyUnicode_GET_SIZE(self
) == 1 &&
5587 Py_UNICODE_ISDIGIT(*p
))
5588 return PyBool_FromLong(1);
5590 /* Special case for empty strings */
5591 if (PyString_GET_SIZE(self
) == 0)
5592 return PyBool_FromLong(0);
5594 e
= p
+ PyUnicode_GET_SIZE(self
);
5595 for (; p
< e
; p
++) {
5596 if (!Py_UNICODE_ISDIGIT(*p
))
5597 return PyBool_FromLong(0);
5599 return PyBool_FromLong(1);
5602 PyDoc_STRVAR(isnumeric__doc__
,
5603 "S.isnumeric() -> bool\n\
5605 Return True if there are only numeric characters in S,\n\
5609 unicode_isnumeric(PyUnicodeObject
*self
)
5611 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5612 register const Py_UNICODE
*e
;
5614 /* Shortcut for single character strings */
5615 if (PyUnicode_GET_SIZE(self
) == 1 &&
5616 Py_UNICODE_ISNUMERIC(*p
))
5617 return PyBool_FromLong(1);
5619 /* Special case for empty strings */
5620 if (PyString_GET_SIZE(self
) == 0)
5621 return PyBool_FromLong(0);
5623 e
= p
+ PyUnicode_GET_SIZE(self
);
5624 for (; p
< e
; p
++) {
5625 if (!Py_UNICODE_ISNUMERIC(*p
))
5626 return PyBool_FromLong(0);
5628 return PyBool_FromLong(1);
5631 PyDoc_STRVAR(join__doc__
,
5632 "S.join(sequence) -> unicode\n\
5634 Return a string which is the concatenation of the strings in the\n\
5635 sequence. The separator between elements is S.");
5638 unicode_join(PyObject
*self
, PyObject
*data
)
5640 return PyUnicode_Join(self
, data
);
5644 unicode_length(PyUnicodeObject
*self
)
5646 return self
->length
;
5649 PyDoc_STRVAR(ljust__doc__
,
5650 "S.ljust(width[, fillchar]) -> int\n\
5652 Return S left justified in a Unicode string of length width. Padding is\n\
5653 done using the specified fill character (default is a space).");
5656 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
5659 Py_UNICODE fillchar
= ' ';
5661 if (!PyArg_ParseTuple(args
, "i|O&:ljust", &width
, convert_uc
, &fillchar
))
5664 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
5666 return (PyObject
*) self
;
5669 return (PyObject
*) pad(self
, 0, width
- self
->length
, fillchar
);
5672 PyDoc_STRVAR(lower__doc__
,
5673 "S.lower() -> unicode\n\
5675 Return a copy of the string S converted to lowercase.");
5678 unicode_lower(PyUnicodeObject
*self
)
5680 return fixup(self
, fixlower
);
/* Which side(s) a strip operation works on; the values double as
   indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for a strip type: the format string with its leading
   "|O:" (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[(i)]+3)
5692 static const Py_UNICODE
*
5693 unicode_memchr(const Py_UNICODE
*s
, Py_UNICODE c
, size_t n
)
5696 for (i
= 0; i
< n
; ++i
)
5702 /* externally visible for str.strip(unicode) */
5704 _PyUnicode_XStrip(PyUnicodeObject
*self
, int striptype
, PyObject
*sepobj
)
5706 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
5707 int len
= PyUnicode_GET_SIZE(self
);
5708 Py_UNICODE
*sep
= PyUnicode_AS_UNICODE(sepobj
);
5709 int seplen
= PyUnicode_GET_SIZE(sepobj
);
5713 if (striptype
!= RIGHTSTRIP
) {
5714 while (i
< len
&& unicode_memchr(sep
, s
[i
], seplen
)) {
5720 if (striptype
!= LEFTSTRIP
) {
5723 } while (j
>= i
&& unicode_memchr(sep
, s
[j
], seplen
));
5727 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
5729 return (PyObject
*)self
;
5732 return PyUnicode_FromUnicode(s
+i
, j
-i
);
5737 do_strip(PyUnicodeObject
*self
, int striptype
)
5739 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
5740 int len
= PyUnicode_GET_SIZE(self
), i
, j
;
5743 if (striptype
!= RIGHTSTRIP
) {
5744 while (i
< len
&& Py_UNICODE_ISSPACE(s
[i
])) {
5750 if (striptype
!= LEFTSTRIP
) {
5753 } while (j
>= i
&& Py_UNICODE_ISSPACE(s
[j
]));
5757 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
5759 return (PyObject
*)self
;
5762 return PyUnicode_FromUnicode(s
+i
, j
-i
);
5767 do_argstrip(PyUnicodeObject
*self
, int striptype
, PyObject
*args
)
5769 PyObject
*sep
= NULL
;
5771 if (!PyArg_ParseTuple(args
, (char *)stripformat
[striptype
], &sep
))
5774 if (sep
!= NULL
&& sep
!= Py_None
) {
5775 if (PyUnicode_Check(sep
))
5776 return _PyUnicode_XStrip(self
, striptype
, sep
);
5777 else if (PyString_Check(sep
)) {
5779 sep
= PyUnicode_FromObject(sep
);
5782 res
= _PyUnicode_XStrip(self
, striptype
, sep
);
5787 PyErr_Format(PyExc_TypeError
,
5788 "%s arg must be None, unicode or str",
5789 STRIPNAME(striptype
));
5794 return do_strip(self
, striptype
);
5798 PyDoc_STRVAR(strip__doc__
,
5799 "S.strip([chars]) -> unicode\n\
5801 Return a copy of the string S with leading and trailing\n\
5802 whitespace removed.\n\
5803 If chars is given and not None, remove characters in chars instead.\n\
5804 If chars is a str, it will be converted to unicode before stripping");
5807 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
5809 if (PyTuple_GET_SIZE(args
) == 0)
5810 return do_strip(self
, BOTHSTRIP
); /* Common case */
5812 return do_argstrip(self
, BOTHSTRIP
, args
);
5816 PyDoc_STRVAR(lstrip__doc__
,
5817 "S.lstrip([chars]) -> unicode\n\
5819 Return a copy of the string S with leading whitespace removed.\n\
5820 If chars is given and not None, remove characters in chars instead.\n\
5821 If chars is a str, it will be converted to unicode before stripping");
5824 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
5826 if (PyTuple_GET_SIZE(args
) == 0)
5827 return do_strip(self
, LEFTSTRIP
); /* Common case */
5829 return do_argstrip(self
, LEFTSTRIP
, args
);
5833 PyDoc_STRVAR(rstrip__doc__
,
5834 "S.rstrip([chars]) -> unicode\n\
5836 Return a copy of the string S with trailing whitespace removed.\n\
5837 If chars is given and not None, remove characters in chars instead.\n\
5838 If chars is a str, it will be converted to unicode before stripping");
5841 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
5843 if (PyTuple_GET_SIZE(args
) == 0)
5844 return do_strip(self
, RIGHTSTRIP
); /* Common case */
5846 return do_argstrip(self
, RIGHTSTRIP
, args
);
5851 unicode_repeat(PyUnicodeObject
*str
, int len
)
5861 if (len
== 1 && PyUnicode_CheckExact(str
)) {
5862 /* no repeat, return original string */
5864 return (PyObject
*) str
;
5867 /* ensure # of chars needed doesn't overflow int and # of bytes
5868 * needed doesn't overflow size_t
5870 nchars
= len
* str
->length
;
5871 if (len
&& nchars
/ len
!= str
->length
) {
5872 PyErr_SetString(PyExc_OverflowError
,
5873 "repeated string is too long");
5876 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
5877 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
5878 PyErr_SetString(PyExc_OverflowError
,
5879 "repeated string is too long");
5882 u
= _PyUnicode_New(nchars
);
5889 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
5893 return (PyObject
*) u
;
5896 PyObject
*PyUnicode_Replace(PyObject
*obj
,
5906 self
= PyUnicode_FromObject(obj
);
5909 str1
= PyUnicode_FromObject(subobj
);
5914 str2
= PyUnicode_FromObject(replobj
);
5920 result
= replace((PyUnicodeObject
*)self
,
5921 (PyUnicodeObject
*)str1
,
5922 (PyUnicodeObject
*)str2
,
5930 PyDoc_STRVAR(replace__doc__
,
5931 "S.replace (old, new[, maxsplit]) -> unicode\n\
5933 Return a copy of S with all occurrences of substring\n\
5934 old replaced by new. If the optional argument maxsplit is\n\
5935 given, only the first maxsplit occurrences are replaced.");
5938 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
5940 PyUnicodeObject
*str1
;
5941 PyUnicodeObject
*str2
;
5945 if (!PyArg_ParseTuple(args
, "OO|i:replace", &str1
, &str2
, &maxcount
))
5947 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
5950 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
5956 result
= replace(self
, str1
, str2
, maxcount
);
5964 PyObject
*unicode_repr(PyObject
*unicode
)
5966 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
5967 PyUnicode_GET_SIZE(unicode
),
5971 PyDoc_STRVAR(rfind__doc__
,
5972 "S.rfind(sub [,start [,end]]) -> int\n\
5974 Return the highest index in S where substring sub is found,\n\
5975 such that sub is contained within s[start,end]. Optional\n\
5976 arguments start and end are interpreted as in slice notation.\n\
5978 Return -1 on failure.");
5981 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
5983 PyUnicodeObject
*substring
;
5988 if (!PyArg_ParseTuple(args
, "O|O&O&:rfind", &substring
,
5989 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
5991 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
5992 (PyObject
*)substring
);
5993 if (substring
== NULL
)
5996 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, -1));
5998 Py_DECREF(substring
);
6002 PyDoc_STRVAR(rindex__doc__
,
6003 "S.rindex(sub [,start [,end]]) -> int\n\
6005 Like S.rfind() but raise ValueError when the substring is not found.");
6008 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
6011 PyUnicodeObject
*substring
;
6015 if (!PyArg_ParseTuple(args
, "O|O&O&:rindex", &substring
,
6016 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6018 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
6019 (PyObject
*)substring
);
6020 if (substring
== NULL
)
6023 result
= findstring(self
, substring
, start
, end
, -1);
6025 Py_DECREF(substring
);
6027 PyErr_SetString(PyExc_ValueError
, "substring not found");
6030 return PyInt_FromLong(result
);
6033 PyDoc_STRVAR(rjust__doc__
,
6034 "S.rjust(width[, fillchar]) -> unicode\n\
6036 Return S right justified in a Unicode string of length width. Padding is\n\
6037 done using the specified fill character (default is a space).");
6040 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
6043 Py_UNICODE fillchar
= ' ';
6045 if (!PyArg_ParseTuple(args
, "i|O&:rjust", &width
, convert_uc
, &fillchar
))
6048 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6050 return (PyObject
*) self
;
6053 return (PyObject
*) pad(self
, width
- self
->length
, 0, fillchar
);
6057 unicode_slice(PyUnicodeObject
*self
, int start
, int end
)
6059 /* standard clamping */
6064 if (end
> self
->length
)
6066 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
6067 /* full slice, return original string */
6069 return (PyObject
*) self
;
6074 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
6078 PyObject
*PyUnicode_Split(PyObject
*s
,
6084 s
= PyUnicode_FromObject(s
);
6088 sep
= PyUnicode_FromObject(sep
);
6095 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
6102 PyDoc_STRVAR(split__doc__
,
6103 "S.split([sep [,maxsplit]]) -> list of strings\n\
6105 Return a list of the words in S, using sep as the\n\
6106 delimiter string. If maxsplit is given, at most maxsplit\n\
6107 splits are done. If sep is not specified or is None,\n\
6108 any whitespace string is a separator.");
6111 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
6113 PyObject
*substring
= Py_None
;
6116 if (!PyArg_ParseTuple(args
, "|Oi:split", &substring
, &maxcount
))
6119 if (substring
== Py_None
)
6120 return split(self
, NULL
, maxcount
);
6121 else if (PyUnicode_Check(substring
))
6122 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
6124 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
6127 PyObject
*PyUnicode_RSplit(PyObject
*s
,
6133 s
= PyUnicode_FromObject(s
);
6137 sep
= PyUnicode_FromObject(sep
);
6144 result
= rsplit((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
6151 PyDoc_STRVAR(rsplit__doc__
,
6152 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6154 Return a list of the words in S, using sep as the\n\
6155 delimiter string, starting at the end of the string and\n\
6156 working to the front. If maxsplit is given, at most maxsplit\n\
6157 splits are done. If sep is not specified, any whitespace string\n\
6161 unicode_rsplit(PyUnicodeObject
*self
, PyObject
*args
)
6163 PyObject
*substring
= Py_None
;
6166 if (!PyArg_ParseTuple(args
, "|Oi:rsplit", &substring
, &maxcount
))
6169 if (substring
== Py_None
)
6170 return rsplit(self
, NULL
, maxcount
);
6171 else if (PyUnicode_Check(substring
))
6172 return rsplit(self
, (PyUnicodeObject
*)substring
, maxcount
);
6174 return PyUnicode_RSplit((PyObject
*)self
, substring
, maxcount
);
6177 PyDoc_STRVAR(splitlines__doc__
,
6178 "S.splitlines([keepends]]) -> list of strings\n\
6180 Return a list of the lines in S, breaking at line boundaries.\n\
6181 Line breaks are not included in the resulting list unless keepends\n\
6182 is given and true.");
6185 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
6189 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
6192 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
6196 PyObject
*unicode_str(PyUnicodeObject
*self
)
6198 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
6201 PyDoc_STRVAR(swapcase__doc__
,
6202 "S.swapcase() -> unicode\n\
6204 Return a copy of S with uppercase characters converted to lowercase\n\
6208 unicode_swapcase(PyUnicodeObject
*self
)
6210 return fixup(self
, fixswapcase
);
6213 PyDoc_STRVAR(translate__doc__
,
6214 "S.translate(table) -> unicode\n\
6216 Return a copy of the string S, where all characters have been mapped\n\
6217 through the given translation table, which must be a mapping of\n\
6218 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6219 Unmapped characters are left untouched. Characters mapped to None\n\
6223 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
6225 return PyUnicode_TranslateCharmap(self
->str
,
6231 PyDoc_STRVAR(upper__doc__
,
6232 "S.upper() -> unicode\n\
6234 Return a copy of S converted to uppercase.");
6237 unicode_upper(PyUnicodeObject
*self
)
6239 return fixup(self
, fixupper
);
6242 PyDoc_STRVAR(zfill__doc__
,
6243 "S.zfill(width) -> unicode\n\
6245 Pad a numeric string x with zeros on the left, to fill a field\n\
6246 of the specified width. The string x is never truncated.");
6249 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
6255 if (!PyArg_ParseTuple(args
, "i:zfill", &width
))
6258 if (self
->length
>= width
) {
6259 if (PyUnicode_CheckExact(self
)) {
6261 return (PyObject
*) self
;
6264 return PyUnicode_FromUnicode(
6265 PyUnicode_AS_UNICODE(self
),
6266 PyUnicode_GET_SIZE(self
)
6270 fill
= width
- self
->length
;
6272 u
= pad(self
, fill
, 0, '0');
6277 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
6278 /* move sign to beginning of string */
6279 u
->str
[0] = u
->str
[fill
];
6283 return (PyObject
*) u
;
#if 0
/* Debugging aid (compiled out): report the current value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long current = unicode_freelist_size;

    return PyInt_FromLong(current);
}
#endif
6294 PyDoc_STRVAR(startswith__doc__
,
6295 "S.startswith(prefix[, start[, end]]) -> bool\n\
6297 Return True if S starts with the specified prefix, False otherwise.\n\
6298 With optional start, test S beginning at that position.\n\
6299 With optional end, stop comparing S at that position.");
6302 unicode_startswith(PyUnicodeObject
*self
,
6305 PyUnicodeObject
*substring
;
6310 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &substring
,
6311 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6313 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
6314 (PyObject
*)substring
);
6315 if (substring
== NULL
)
6318 result
= PyBool_FromLong(tailmatch(self
, substring
, start
, end
, -1));
6320 Py_DECREF(substring
);
6325 PyDoc_STRVAR(endswith__doc__
,
6326 "S.endswith(suffix[, start[, end]]) -> bool\n\
6328 Return True if S ends with the specified suffix, False otherwise.\n\
6329 With optional start, test S beginning at that position.\n\
6330 With optional end, stop comparing S at that position.");
6333 unicode_endswith(PyUnicodeObject
*self
,
6336 PyUnicodeObject
*substring
;
6341 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &substring
,
6342 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6344 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
6345 (PyObject
*)substring
);
6346 if (substring
== NULL
)
6349 result
= PyBool_FromLong(tailmatch(self
, substring
, start
, end
, +1));
6351 Py_DECREF(substring
);
6358 unicode_getnewargs(PyUnicodeObject
*v
)
6360 return Py_BuildValue("(u#)", v
->str
, v
->length
);
6364 static PyMethodDef unicode_methods
[] = {
6366 /* Order is according to common usage: often used methods should
6367 appear first, since lookup is done sequentially. */
6369 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
, encode__doc__
},
6370 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
6371 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
6372 {"rsplit", (PyCFunction
) unicode_rsplit
, METH_VARARGS
, rsplit__doc__
},
6373 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
6374 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
6375 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
6376 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
6377 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
6378 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
6379 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
6380 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
6381 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
6382 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
6383 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_VARARGS
, lstrip__doc__
},
6384 {"decode", (PyCFunction
) unicode_decode
, METH_VARARGS
, decode__doc__
},
6385 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6386 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
6387 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
6388 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
6389 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_VARARGS
, rstrip__doc__
},
6390 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
6391 {"strip", (PyCFunction
) unicode_strip
, METH_VARARGS
, strip__doc__
},
6392 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
6393 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
6394 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
6395 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
6396 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
6397 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
6398 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
6399 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
6400 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
6401 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
6402 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
6403 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
6404 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
6405 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
6406 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
6408 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
6412 /* This one is just used for debugging the implementation. */
6413 {"freelistsize", (PyCFunction
) unicode_freelistsize
, METH_NOARGS
},
6416 {"__getnewargs__", (PyCFunction
)unicode_getnewargs
, METH_NOARGS
},
6421 unicode_mod(PyObject
*v
, PyObject
*w
)
6423 if (!PyUnicode_Check(v
)) {
6424 Py_INCREF(Py_NotImplemented
);
6425 return Py_NotImplemented
;
6427 return PyUnicode_Format(v
, w
);
6430 static PyNumberMethods unicode_as_number
= {
6435 unicode_mod
, /*nb_remainder*/
6438 static PySequenceMethods unicode_as_sequence
= {
6439 (inquiry
) unicode_length
, /* sq_length */
6440 (binaryfunc
) PyUnicode_Concat
, /* sq_concat */
6441 (intargfunc
) unicode_repeat
, /* sq_repeat */
6442 (intargfunc
) unicode_getitem
, /* sq_item */
6443 (intintargfunc
) unicode_slice
, /* sq_slice */
6444 0, /* sq_ass_item */
6445 0, /* sq_ass_slice */
6446 (objobjproc
)PyUnicode_Contains
, /*sq_contains*/
6450 unicode_subscript(PyUnicodeObject
* self
, PyObject
* item
)
6452 if (PyInt_Check(item
)) {
6453 long i
= PyInt_AS_LONG(item
);
6455 i
+= PyString_GET_SIZE(self
);
6456 return unicode_getitem(self
, i
);
6457 } else if (PyLong_Check(item
)) {
6458 long i
= PyLong_AsLong(item
);
6459 if (i
== -1 && PyErr_Occurred())
6462 i
+= PyString_GET_SIZE(self
);
6463 return unicode_getitem(self
, i
);
6464 } else if (PySlice_Check(item
)) {
6465 int start
, stop
, step
, slicelength
, cur
, i
;
6466 Py_UNICODE
* source_buf
;
6467 Py_UNICODE
* result_buf
;
6470 if (PySlice_GetIndicesEx((PySliceObject
*)item
, PyString_GET_SIZE(self
),
6471 &start
, &stop
, &step
, &slicelength
) < 0) {
6475 if (slicelength
<= 0) {
6476 return PyUnicode_FromUnicode(NULL
, 0);
6478 source_buf
= PyUnicode_AS_UNICODE((PyObject
*)self
);
6479 result_buf
= PyMem_MALLOC(slicelength
*sizeof(Py_UNICODE
));
6481 for (cur
= start
, i
= 0; i
< slicelength
; cur
+= step
, i
++) {
6482 result_buf
[i
] = source_buf
[cur
];
6485 result
= PyUnicode_FromUnicode(result_buf
, slicelength
);
6486 PyMem_FREE(result_buf
);
6490 PyErr_SetString(PyExc_TypeError
, "string indices must be integers");
6495 static PyMappingMethods unicode_as_mapping
= {
6496 (inquiry
)unicode_length
, /* mp_length */
6497 (binaryfunc
)unicode_subscript
, /* mp_subscript */
6498 (objobjargproc
)0, /* mp_ass_subscript */
6502 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
6507 PyErr_SetString(PyExc_SystemError
,
6508 "accessing non-existent unicode segment");
6511 *ptr
= (void *) self
->str
;
6512 return PyUnicode_GET_DATA_SIZE(self
);
6516 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, int index
,
6519 PyErr_SetString(PyExc_TypeError
,
6520 "cannot use unicode as modifiable buffer");
6525 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
6529 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
6534 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
6541 PyErr_SetString(PyExc_SystemError
,
6542 "accessing non-existent unicode segment");
6545 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
6548 *ptr
= (void *) PyString_AS_STRING(str
);
6549 return PyString_GET_SIZE(str
);
6552 /* Helpers for PyUnicode_Format() */
6555 getnextarg(PyObject
*args
, int arglen
, int *p_argidx
)
6557 int argidx
= *p_argidx
;
6558 if (argidx
< arglen
) {
6563 return PyTuple_GetItem(args
, argidx
);
6565 PyErr_SetString(PyExc_TypeError
,
6566 "not enough arguments for format string");
6570 #define F_LJUST (1<<0)
6571 #define F_SIGN (1<<1)
6572 #define F_BLANK (1<<2)
6573 #define F_ALT (1<<3)
6574 #define F_ZERO (1<<4)
6577 int usprintf(register Py_UNICODE
*buffer
, char *format
, ...)
6583 va_start(va
, format
);
6585 /* First, format the string as char array, then expand to Py_UNICODE
6587 charbuffer
= (char *)buffer
;
6588 len
= vsprintf(charbuffer
, format
, va
);
6589 for (i
= len
- 1; i
>= 0; i
--)
6590 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
6596 /* XXX To save some code duplication, formatfloat/long/int could have been
6597 shared with stringobject.c, converting from 8-bit to Unicode after the
6598 formatting is done. */
6601 formatfloat(Py_UNICODE
*buf
,
6608 /* fmt = '%#.' + `prec` + `type`
6609 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
6613 x
= PyFloat_AsDouble(v
);
6614 if (x
== -1.0 && PyErr_Occurred())
6618 if (type
== 'f' && (fabs(x
) / 1e25
) >= 1e25
)
6620 /* Worst case length calc to ensure no buffer overrun:
6624 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6625 for any double rep.)
6626 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6629 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6630 len = 1 + 50 + 1 + prec = 52 + prec
6632 If prec=0 the effective precision is 1 (the leading digit is
6633 always given), therefore increase the length by one.
6636 if ((type
== 'g' && buflen
<= (size_t)10 + (size_t)prec
) ||
6637 (type
== 'f' && buflen
<= (size_t)53 + (size_t)prec
)) {
6638 PyErr_SetString(PyExc_OverflowError
,
6639 "formatted float is too long (precision too large?)");
6642 PyOS_snprintf(fmt
, sizeof(fmt
), "%%%s.%d%c",
6643 (flags
&F_ALT
) ? "#" : "",
6645 return usprintf(buf
, fmt
, x
);
6649 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
6653 PyObject
*str
; /* temporary string object. */
6654 PyUnicodeObject
*result
;
6656 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
6659 result
= _PyUnicode_New(len
);
6660 for (i
= 0; i
< len
; i
++)
6661 result
->str
[i
] = buf
[i
];
6662 result
->str
[len
] = 0;
6664 return (PyObject
*)result
;
6668 formatint(Py_UNICODE
*buf
,
6675 /* fmt = '%#.' + `prec` + 'l' + `type`
6676 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6680 char fmt
[64]; /* plenty big enough! */
6684 x
= PyInt_AsLong(v
);
6685 if (x
== -1 && PyErr_Occurred())
6687 if (x
< 0 && type
== 'u') {
6690 if (x
< 0 && (type
== 'x' || type
== 'X' || type
== 'o'))
6697 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6698 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
6700 if (buflen
<= 14 || buflen
<= (size_t)3 + (size_t)prec
) {
6701 PyErr_SetString(PyExc_OverflowError
,
6702 "formatted integer is too long (precision too large?)");
6706 if ((flags
& F_ALT
) &&
6707 (type
== 'x' || type
== 'X')) {
6708 /* When converting under %#x or %#X, there are a number
6709 * of issues that cause pain:
6710 * - when 0 is being converted, the C standard leaves off
6711 * the '0x' or '0X', which is inconsistent with other
6712 * %#x/%#X conversions and inconsistent with Python's
6714 * - there are platforms that violate the standard and
6715 * convert 0 with the '0x' or '0X'
6716 * (Metrowerks, Compaq Tru64)
6717 * - there are platforms that give '0x' when converting
6718 * under %#X, but convert 0 in accordance with the
6719 * standard (OS/2 EMX)
6721 * We can achieve the desired consistency by inserting our
6722 * own '0x' or '0X' prefix, and substituting %x/%X in place
6725 * Note that this is the same approach as used in
6726 * formatint() in stringobject.c
6728 PyOS_snprintf(fmt
, sizeof(fmt
), "%s0%c%%.%dl%c",
6729 sign
, type
, prec
, type
);
6732 PyOS_snprintf(fmt
, sizeof(fmt
), "%s%%%s.%dl%c",
6733 sign
, (flags
&F_ALT
) ? "#" : "",
6737 return usprintf(buf
, fmt
, -x
);
6739 return usprintf(buf
, fmt
, x
);
6743 formatchar(Py_UNICODE
*buf
,
6747 /* presume that the buffer is at least 2 characters long */
6748 if (PyUnicode_Check(v
)) {
6749 if (PyUnicode_GET_SIZE(v
) != 1)
6751 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
6754 else if (PyString_Check(v
)) {
6755 if (PyString_GET_SIZE(v
) != 1)
6757 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
6761 /* Integer input truncated to a character */
6763 x
= PyInt_AsLong(v
);
6764 if (x
== -1 && PyErr_Occurred())
6766 #ifdef Py_UNICODE_WIDE
6767 if (x
< 0 || x
> 0x10ffff) {
6768 PyErr_SetString(PyExc_OverflowError
,
6769 "%c arg not in range(0x110000) "
6770 "(wide Python build)");
6774 if (x
< 0 || x
> 0xffff) {
6775 PyErr_SetString(PyExc_OverflowError
,
6776 "%c arg not in range(0x10000) "
6777 "(narrow Python build)");
6781 buf
[0] = (Py_UNICODE
) x
;
6787 PyErr_SetString(PyExc_TypeError
,
6788 "%c requires int or char");
6792 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6794 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6795 chars are formatted. XXX This is a magic number. Each formatting
6796 routine does bounds checking to ensure no overflow, but a better
6797 solution may be to malloc a buffer of appropriate size for each
6798 format. For now, the current solution is sufficient.
6800 #define FORMATBUFLEN (size_t)120
6802 PyObject
*PyUnicode_Format(PyObject
*format
,
6805 Py_UNICODE
*fmt
, *res
;
6806 int fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
6808 PyUnicodeObject
*result
= NULL
;
6809 PyObject
*dict
= NULL
;
6812 if (format
== NULL
|| args
== NULL
) {
6813 PyErr_BadInternalCall();
6816 uformat
= PyUnicode_FromObject(format
);
6817 if (uformat
== NULL
)
6819 fmt
= PyUnicode_AS_UNICODE(uformat
);
6820 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
6822 reslen
= rescnt
= fmtcnt
+ 100;
6823 result
= _PyUnicode_New(reslen
);
6826 res
= PyUnicode_AS_UNICODE(result
);
6828 if (PyTuple_Check(args
)) {
6829 arglen
= PyTuple_Size(args
);
6836 if (args
->ob_type
->tp_as_mapping
&& !PyTuple_Check(args
) &&
6837 !PyObject_TypeCheck(args
, &PyBaseString_Type
))
6840 while (--fmtcnt
>= 0) {
6843 rescnt
= fmtcnt
+ 100;
6845 if (_PyUnicode_Resize(&result
, reslen
) < 0)
6847 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
6853 /* Got a format specifier */
6857 Py_UNICODE c
= '\0';
6860 PyObject
*temp
= NULL
;
6864 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
6868 Py_UNICODE
*keystart
;
6874 PyErr_SetString(PyExc_TypeError
,
6875 "format requires a mapping");
6881 /* Skip over balanced parentheses */
6882 while (pcount
> 0 && --fmtcnt
>= 0) {
6885 else if (*fmt
== '(')
6889 keylen
= fmt
- keystart
- 1;
6890 if (fmtcnt
< 0 || pcount
> 0) {
6891 PyErr_SetString(PyExc_ValueError
,
6892 "incomplete format key");
6896 /* keys are converted to strings using UTF-8 and
6897 then looked up since Python uses strings to hold
6898 variables names etc. in its namespaces and we
6899 wouldn't want to break common idioms. */
6900 key
= PyUnicode_EncodeUTF8(keystart
,
6904 key
= PyUnicode_FromUnicode(keystart
, keylen
);
6912 args
= PyObject_GetItem(dict
, key
);
6921 while (--fmtcnt
>= 0) {
6922 switch (c
= *fmt
++) {
6923 case '-': flags
|= F_LJUST
; continue;
6924 case '+': flags
|= F_SIGN
; continue;
6925 case ' ': flags
|= F_BLANK
; continue;
6926 case '#': flags
|= F_ALT
; continue;
6927 case '0': flags
|= F_ZERO
; continue;
6932 v
= getnextarg(args
, arglen
, &argidx
);
6935 if (!PyInt_Check(v
)) {
6936 PyErr_SetString(PyExc_TypeError
,
6940 width
= PyInt_AsLong(v
);
6948 else if (c
>= '0' && c
<= '9') {
6950 while (--fmtcnt
>= 0) {
6952 if (c
< '0' || c
> '9')
6954 if ((width
*10) / 10 != width
) {
6955 PyErr_SetString(PyExc_ValueError
,
6959 width
= width
*10 + (c
- '0');
6967 v
= getnextarg(args
, arglen
, &argidx
);
6970 if (!PyInt_Check(v
)) {
6971 PyErr_SetString(PyExc_TypeError
,
6975 prec
= PyInt_AsLong(v
);
6981 else if (c
>= '0' && c
<= '9') {
6983 while (--fmtcnt
>= 0) {
6984 c
= Py_CHARMASK(*fmt
++);
6985 if (c
< '0' || c
> '9')
6987 if ((prec
*10) / 10 != prec
) {
6988 PyErr_SetString(PyExc_ValueError
,
6992 prec
= prec
*10 + (c
- '0');
6997 if (c
== 'h' || c
== 'l' || c
== 'L') {
7003 PyErr_SetString(PyExc_ValueError
,
7004 "incomplete format");
7008 v
= getnextarg(args
, arglen
, &argidx
);
7018 /* presume that buffer length is at least 1 */
7025 if (PyUnicode_Check(v
) && c
== 's') {
7032 temp
= PyObject_Unicode(v
);
7034 temp
= PyObject_Repr(v
);
7037 if (PyUnicode_Check(temp
))
7038 /* nothing to do */;
7039 else if (PyString_Check(temp
)) {
7040 /* convert to string to Unicode */
7041 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
7042 PyString_GET_SIZE(temp
),
7052 PyErr_SetString(PyExc_TypeError
,
7053 "%s argument has non-string str()");
7057 pbuf
= PyUnicode_AS_UNICODE(temp
);
7058 len
= PyUnicode_GET_SIZE(temp
);
7059 if (prec
>= 0 && len
> prec
)
7071 if (PyLong_Check(v
)) {
7072 temp
= formatlong(v
, flags
, prec
, c
);
7075 pbuf
= PyUnicode_AS_UNICODE(temp
);
7076 len
= PyUnicode_GET_SIZE(temp
);
7081 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
7100 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
7111 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
7117 PyErr_Format(PyExc_ValueError
,
7118 "unsupported format character '%c' (0x%x) "
7120 (31<=c
&& c
<=126) ? (char)c
: '?',
7122 (int)(fmt
-1 - PyUnicode_AS_UNICODE(uformat
)));
7126 if (*pbuf
== '-' || *pbuf
== '+') {
7130 else if (flags
& F_SIGN
)
7132 else if (flags
& F_BLANK
)
7139 if (rescnt
- (sign
!= 0) < width
) {
7141 rescnt
= width
+ fmtcnt
+ 100;
7145 return PyErr_NoMemory();
7147 if (_PyUnicode_Resize(&result
, reslen
) < 0)
7149 res
= PyUnicode_AS_UNICODE(result
)
7159 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
7160 assert(pbuf
[0] == '0');
7161 assert(pbuf
[1] == c
);
7172 if (width
> len
&& !(flags
& F_LJUST
)) {
7176 } while (--width
> len
);
7181 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
7182 assert(pbuf
[0] == '0');
7183 assert(pbuf
[1] == c
);
7188 Py_UNICODE_COPY(res
, pbuf
, len
);
7191 while (--width
>= len
) {
7195 if (dict
&& (argidx
< arglen
) && c
!= '%') {
7196 PyErr_SetString(PyExc_TypeError
,
7197 "not all arguments converted during string formatting");
7203 if (argidx
< arglen
&& !dict
) {
7204 PyErr_SetString(PyExc_TypeError
,
7205 "not all arguments converted during string formatting");
7213 if (_PyUnicode_Resize(&result
, reslen
- rescnt
) < 0)
7215 return (PyObject
*)result
;
7226 static PyBufferProcs unicode_as_buffer
= {
7227 (getreadbufferproc
) unicode_buffer_getreadbuf
,
7228 (getwritebufferproc
) unicode_buffer_getwritebuf
,
7229 (getsegcountproc
) unicode_buffer_getsegcount
,
7230 (getcharbufferproc
) unicode_buffer_getcharbuf
,
7234 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
7237 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
7240 static char *kwlist
[] = {"string", "encoding", "errors", 0};
7241 char *encoding
= NULL
;
7242 char *errors
= NULL
;
7244 if (type
!= &PyUnicode_Type
)
7245 return unicode_subtype_new(type
, args
, kwds
);
7246 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
7247 kwlist
, &x
, &encoding
, &errors
))
7250 return (PyObject
*)_PyUnicode_New(0);
7251 if (encoding
== NULL
&& errors
== NULL
)
7252 return PyObject_Unicode(x
);
7254 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
7258 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
7260 PyUnicodeObject
*tmp
, *pnew
;
7263 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
7264 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
7267 assert(PyUnicode_Check(tmp
));
7268 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
7273 pnew
->str
= PyMem_NEW(Py_UNICODE
, n
+1);
7274 if (pnew
->str
== NULL
) {
7275 _Py_ForgetReference((PyObject
*)pnew
);
7278 return PyErr_NoMemory();
7280 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
7282 pnew
->hash
= tmp
->hash
;
7284 return (PyObject
*)pnew
;
7287 PyDoc_STRVAR(unicode_doc
,
7288 "unicode(string [, encoding[, errors]]) -> object\n\
7290 Create a new Unicode object from the given encoded string.\n\
7291 encoding defaults to the current default string encoding.\n\
7292 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
7294 PyTypeObject PyUnicode_Type
= {
7295 PyObject_HEAD_INIT(&PyType_Type
)
7297 "unicode", /* tp_name */
7298 sizeof(PyUnicodeObject
), /* tp_size */
7299 0, /* tp_itemsize */
7301 (destructor
)unicode_dealloc
, /* tp_dealloc */
7305 (cmpfunc
) unicode_compare
, /* tp_compare */
7306 (reprfunc
) unicode_repr
, /* tp_repr */
7307 &unicode_as_number
, /* tp_as_number */
7308 &unicode_as_sequence
, /* tp_as_sequence */
7309 &unicode_as_mapping
, /* tp_as_mapping */
7310 (hashfunc
) unicode_hash
, /* tp_hash*/
7312 (reprfunc
) unicode_str
, /* tp_str */
7313 PyObject_GenericGetAttr
, /* tp_getattro */
7314 0, /* tp_setattro */
7315 &unicode_as_buffer
, /* tp_as_buffer */
7316 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_CHECKTYPES
|
7317 Py_TPFLAGS_BASETYPE
, /* tp_flags */
7318 unicode_doc
, /* tp_doc */
7319 0, /* tp_traverse */
7321 0, /* tp_richcompare */
7322 0, /* tp_weaklistoffset */
7324 0, /* tp_iternext */
7325 unicode_methods
, /* tp_methods */
7328 &PyBaseString_Type
, /* tp_base */
7330 0, /* tp_descr_get */
7331 0, /* tp_descr_set */
7332 0, /* tp_dictoffset */
7335 unicode_new
, /* tp_new */
7336 PyObject_Del
, /* tp_free */
7339 /* Initialize the Unicode implementation */
7341 void _PyUnicode_Init(void)
7345 /* Init the implementation */
7346 unicode_freelist
= NULL
;
7347 unicode_freelist_size
= 0;
7348 unicode_empty
= _PyUnicode_New(0);
7349 strcpy(unicode_default_encoding
, "ascii");
7350 for (i
= 0; i
< 256; i
++)
7351 unicode_latin1
[i
] = NULL
;
7352 if (PyType_Ready(&PyUnicode_Type
) < 0)
7353 Py_FatalError("Can't initialize 'unicode'");
7356 /* Finalize the Unicode implementation */
7359 _PyUnicode_Fini(void)
7364 Py_XDECREF(unicode_empty
);
7365 unicode_empty
= NULL
;
7367 for (i
= 0; i
< 256; i
++) {
7368 if (unicode_latin1
[i
]) {
7369 Py_DECREF(unicode_latin1
[i
]);
7370 unicode_latin1
[i
] = NULL
;
7374 for (u
= unicode_freelist
; u
!= NULL
;) {
7375 PyUnicodeObject
*v
= u
;
7376 u
= *(PyUnicodeObject
**)u
;
7379 Py_XDECREF(v
->defenc
);
7382 unicode_freelist
= NULL
;
7383 unicode_freelist_size
= 0;
7389 indent-tabs-mode: nil