3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
45 #include "unicodeobject.h"
52 /* Limit for the Unicode object free list */
54 #define PyUnicode_MAXFREELIST 1024
56 /* Limit for the Unicode object free list stay alive optimization.
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
62 At worst this will result in PyUnicode_MAXFREELIST *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
66 Setting the limit to 0 effectively turns the feature off.
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
80 # define BYTEORDER_IS_LITTLE_ENDIAN
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
95 /* Free list for Unicode objects */
96 static PyUnicodeObject
*free_list
;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject
*unicode_empty
;
102 /* Single character Unicode strings in the Latin-1 range are being
104 static PyUnicodeObject
*unicode_latin1
[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding
[100];
115 /* Fast detection of the most frequent whitespace characters */
116 const unsigned char _Py_ascii_whitespace
[] = {
117 0, 0, 0, 0, 0, 0, 0, 0,
118 /* case 0x0009: * HORIZONTAL TABULATION */
119 /* case 0x000A: * LINE FEED */
120 /* case 0x000B: * VERTICAL TABULATION */
121 /* case 0x000C: * FORM FEED */
122 /* case 0x000D: * CARRIAGE RETURN */
123 0, 1, 1, 1, 1, 1, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0,
125 /* case 0x001C: * FILE SEPARATOR */
126 /* case 0x001D: * GROUP SEPARATOR */
127 /* case 0x001E: * RECORD SEPARATOR */
128 /* case 0x001F: * UNIT SEPARATOR */
129 0, 0, 0, 0, 1, 1, 1, 1,
130 /* case 0x0020: * SPACE */
131 1, 0, 0, 0, 0, 0, 0, 0,
132 0, 0, 0, 0, 0, 0, 0, 0,
133 0, 0, 0, 0, 0, 0, 0, 0,
134 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, 0, 0, 0, 0,
137 0, 0, 0, 0, 0, 0, 0, 0,
138 0, 0, 0, 0, 0, 0, 0, 0,
139 0, 0, 0, 0, 0, 0, 0, 0,
140 0, 0, 0, 0, 0, 0, 0, 0,
141 0, 0, 0, 0, 0, 0, 0, 0,
142 0, 0, 0, 0, 0, 0, 0, 0,
143 0, 0, 0, 0, 0, 0, 0, 0
146 /* Same for linebreaks */
147 static unsigned char ascii_linebreak
[] = {
148 0, 0, 0, 0, 0, 0, 0, 0,
149 /* 0x000A, * LINE FEED */
150 /* 0x000D, * CARRIAGE RETURN */
151 0, 0, 1, 0, 0, 1, 0, 0,
152 0, 0, 0, 0, 0, 0, 0, 0,
153 /* 0x001C, * FILE SEPARATOR */
154 /* 0x001D, * GROUP SEPARATOR */
155 /* 0x001E, * RECORD SEPARATOR */
156 0, 0, 0, 0, 1, 1, 1, 0,
157 0, 0, 0, 0, 0, 0, 0, 0,
158 0, 0, 0, 0, 0, 0, 0, 0,
159 0, 0, 0, 0, 0, 0, 0, 0,
160 0, 0, 0, 0, 0, 0, 0, 0,
162 0, 0, 0, 0, 0, 0, 0, 0,
163 0, 0, 0, 0, 0, 0, 0, 0,
164 0, 0, 0, 0, 0, 0, 0, 0,
165 0, 0, 0, 0, 0, 0, 0, 0,
166 0, 0, 0, 0, 0, 0, 0, 0,
167 0, 0, 0, 0, 0, 0, 0, 0,
168 0, 0, 0, 0, 0, 0, 0, 0,
169 0, 0, 0, 0, 0, 0, 0, 0
174 PyUnicode_GetMax(void)
176 #ifdef Py_UNICODE_WIDE
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
185 /* --- Bloom Filters ----------------------------------------------------- */
187 /* Stuff to implement simple "bloom filters" for Unicode characters.
188 To keep things simple, we use a single bitmask, using the lowest 5
189 bits of each Unicode character as the bit index. */
191 /* the linebreak mask is set up by Unicode_Init below */
193 #define BLOOM_MASK unsigned long
195 static BLOOM_MASK bloom_linebreak
;
/* Test whether the bit selected by the low 5 bits of `ch` is set in `mask`.
   The result is non-zero when the bit is set and zero otherwise.
   The shifted constant is unsigned long because the bit index can be 31,
   and shifting a plain int 1 that far overflows a signed type.  `mask` is
   parenthesized so that compound expressions such as `a | b` can be passed
   without changing the meaning of the test. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))
/* Line-break test for `ch`.  Values of 128 and above are first screened
   with the bloom_linebreak mask and then confirmed with
   Py_UNICODE_ISLINEBREAK(); smaller values are looked up directly in the
   ascii_linebreak table.  `ch` may be evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) >= 128U                                                       \
     ? (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))     \
     : ascii_linebreak[(ch)])
203 Py_LOCAL_INLINE(BLOOM_MASK
) make_bloom_mask(Py_UNICODE
* ptr
, Py_ssize_t len
)
205 /* calculate simple bloom-style bitmask for a given unicode string */
211 for (i
= 0; i
< len
; i
++)
212 mask
|= (1 << (ptr
[i
] & 0x1F));
217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr
, Py_UNICODE
* set
, Py_ssize_t setlen
)
221 for (i
= 0; i
< setlen
; i
++)
/* Membership test for `chr` against the character array `set` of length
   `setlen`: the cheap BLOOM() probe on `mask` runs first, and
   unicode_member() is only consulted when the probe is non-zero.
   The whole expansion is parenthesized so the macro behaves as a single
   operand, e.g. under `!` or next to `||`. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
231 /* --- Unicode Object ----------------------------------------------------- */
234 int unicode_resize(register PyUnicodeObject
*unicode
,
239 /* Shortcut if there's nothing much to do. */
240 if (unicode
->length
== length
)
243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
247 if (unicode
== unicode_empty
||
248 (unicode
->length
== 1 &&
249 unicode
->str
[0] < 256U &&
250 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
251 PyErr_SetString(PyExc_SystemError
,
252 "can't resize shared unicode objects");
256 /* We allocate one more Py_UNICODE element to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
258 safe to look at str[length] (without making any assumptions about what
261 oldstr
= unicode
->str
;
262 unicode
->str
= PyObject_REALLOC(unicode
->str
,
263 sizeof(Py_UNICODE
) * (length
+ 1));
265 unicode
->str
= (Py_UNICODE
*)oldstr
;
269 unicode
->str
[length
] = 0;
270 unicode
->length
= length
;
273 /* Reset the object caches */
274 if (unicode
->defenc
) {
275 Py_DECREF(unicode
->defenc
);
276 unicode
->defenc
= NULL
;
283 /* We allocate one more Py_UNICODE element to make sure the string is
284 Ux0000 terminated -- XXX is this needed ?
286 XXX This allocator could further be enhanced by assuring that the
287 free list never reduces its size below 1.
292 PyUnicodeObject
*_PyUnicode_New(Py_ssize_t length
)
294 register PyUnicodeObject
*unicode
;
296 /* Optimization for empty strings */
297 if (length
== 0 && unicode_empty
!= NULL
) {
298 Py_INCREF(unicode_empty
);
299 return unicode_empty
;
302 /* Ensure we won't overflow the size. */
303 if (length
> ((PY_SSIZE_T_MAX
/ sizeof(Py_UNICODE
)) - 1)) {
304 return (PyUnicodeObject
*)PyErr_NoMemory();
307 /* Unicode freelist & memory allocation */
310 free_list
= *(PyUnicodeObject
**)unicode
;
313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
315 if ((unicode
->length
< length
) &&
316 unicode_resize(unicode
, length
) < 0) {
317 PyObject_DEL(unicode
->str
);
322 size_t new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
323 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
325 PyObject_INIT(unicode
, &PyUnicode_Type
);
329 unicode
= PyObject_New(PyUnicodeObject
, &PyUnicode_Type
);
332 new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
333 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
340 /* Initialize the first element to guard against cases where
341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
348 unicode
->str
[length
] = 0;
349 unicode
->length
= length
;
351 unicode
->defenc
= NULL
;
355 /* XXX UNREF/NEWREF interface should be more symmetrical */
357 _Py_ForgetReference((PyObject
*)unicode
);
358 PyObject_Del(unicode
);
363 void unicode_dealloc(register PyUnicodeObject
*unicode
)
365 if (PyUnicode_CheckExact(unicode
) &&
366 numfree
< PyUnicode_MAXFREELIST
) {
367 /* Keep-Alive optimization */
368 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
369 PyObject_DEL(unicode
->str
);
373 if (unicode
->defenc
) {
374 Py_DECREF(unicode
->defenc
);
375 unicode
->defenc
= NULL
;
377 /* Add to free list */
378 *(PyUnicodeObject
**)unicode
= free_list
;
383 PyObject_DEL(unicode
->str
);
384 Py_XDECREF(unicode
->defenc
);
385 Py_TYPE(unicode
)->tp_free((PyObject
*)unicode
);
390 int _PyUnicode_Resize(PyUnicodeObject
**unicode
, Py_ssize_t length
)
392 register PyUnicodeObject
*v
;
394 /* Argument checks */
395 if (unicode
== NULL
) {
396 PyErr_BadInternalCall();
400 if (v
== NULL
|| !PyUnicode_Check(v
) || Py_REFCNT(v
) != 1 || length
< 0) {
401 PyErr_BadInternalCall();
405 /* Resizing unicode_empty and single character objects is not
406 possible since these are being shared. We simply return a fresh
407 copy with the same Unicode content. */
408 if (v
->length
!= length
&&
409 (v
== unicode_empty
|| v
->length
== 1)) {
410 PyUnicodeObject
*w
= _PyUnicode_New(length
);
413 Py_UNICODE_COPY(w
->str
, v
->str
,
414 length
< v
->length
? length
: v
->length
);
420 /* Note that we don't have to modify *unicode for unshared Unicode
421 objects, since we can modify them in-place. */
422 return unicode_resize(v
, length
);
425 int PyUnicode_Resize(PyObject
**unicode
, Py_ssize_t length
)
427 return _PyUnicode_Resize((PyUnicodeObject
**)unicode
, length
);
430 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
433 PyUnicodeObject
*unicode
;
435 /* If the Unicode data is known at construction time, we can apply
436 some optimizations which share commonly used objects. */
439 /* Optimization for empty strings */
440 if (size
== 0 && unicode_empty
!= NULL
) {
441 Py_INCREF(unicode_empty
);
442 return (PyObject
*)unicode_empty
;
445 /* Single character Unicode objects in the Latin-1 range are
446 shared when using this constructor */
447 if (size
== 1 && *u
< 256) {
448 unicode
= unicode_latin1
[*u
];
450 unicode
= _PyUnicode_New(1);
453 unicode
->str
[0] = *u
;
454 unicode_latin1
[*u
] = unicode
;
457 return (PyObject
*)unicode
;
461 unicode
= _PyUnicode_New(size
);
465 /* Copy the Unicode data into the new object */
467 Py_UNICODE_COPY(unicode
->str
, u
, size
);
469 return (PyObject
*)unicode
;
472 PyObject
*PyUnicode_FromStringAndSize(const char *u
, Py_ssize_t size
)
474 PyUnicodeObject
*unicode
;
477 PyErr_SetString(PyExc_SystemError
,
478 "Negative size passed to PyUnicode_FromStringAndSize");
482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects.
484 Also, this means the input must be UTF-8, so fall back to the
485 UTF-8 decoder at the end. */
488 /* Optimization for empty strings */
489 if (size
== 0 && unicode_empty
!= NULL
) {
490 Py_INCREF(unicode_empty
);
491 return (PyObject
*)unicode_empty
;
494 /* Single characters are shared when using this constructor.
495 Restrict to ASCII, since the input must be UTF-8. */
496 if (size
== 1 && Py_CHARMASK(*u
) < 128) {
497 unicode
= unicode_latin1
[Py_CHARMASK(*u
)];
499 unicode
= _PyUnicode_New(1);
502 unicode
->str
[0] = Py_CHARMASK(*u
);
503 unicode_latin1
[Py_CHARMASK(*u
)] = unicode
;
506 return (PyObject
*)unicode
;
509 return PyUnicode_DecodeUTF8(u
, size
, NULL
);
512 unicode
= _PyUnicode_New(size
);
516 return (PyObject
*)unicode
;
519 PyObject
*PyUnicode_FromString(const char *u
)
521 size_t size
= strlen(u
);
522 if (size
> PY_SSIZE_T_MAX
) {
523 PyErr_SetString(PyExc_OverflowError
, "input too long");
527 return PyUnicode_FromStringAndSize(u
, size
);
532 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
533 # define CONVERT_WCHAR_TO_SURROGATES
536 #ifdef CONVERT_WCHAR_TO_SURROGATES
538 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
539 to convert from UTF32 to UTF16. */
541 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
544 PyUnicodeObject
*unicode
;
545 register Py_ssize_t i
;
547 const wchar_t *orig_w
;
550 PyErr_BadInternalCall();
556 for (i
= size
; i
> 0; i
--) {
562 unicode
= _PyUnicode_New(alloc
);
566 /* Copy the wchar_t data into the new object */
568 register Py_UNICODE
*u
;
569 u
= PyUnicode_AS_UNICODE(unicode
);
570 for (i
= size
; i
> 0; i
--) {
572 wchar_t ordinal
= *w
++;
574 *u
++ = 0xD800 | (ordinal
>> 10);
575 *u
++ = 0xDC00 | (ordinal
& 0x3FF);
581 return (PyObject
*)unicode
;
586 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
589 PyUnicodeObject
*unicode
;
592 PyErr_BadInternalCall();
596 unicode
= _PyUnicode_New(size
);
600 /* Copy the wchar_t data into the new object */
601 #ifdef HAVE_USABLE_WCHAR_T
602 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
605 register Py_UNICODE
*u
;
606 register Py_ssize_t i
;
607 u
= PyUnicode_AS_UNICODE(unicode
);
608 for (i
= size
; i
> 0; i
--)
613 return (PyObject
*)unicode
;
616 #endif /* CONVERT_WCHAR_TO_SURROGATES */
618 #undef CONVERT_WCHAR_TO_SURROGATES
621 makefmt(char *fmt
, int longflag
, int size_tflag
, int zeropad
, int width
, int precision
, char c
)
627 fmt
+= sprintf(fmt
, "%d", width
);
630 fmt
+= sprintf(fmt
, ".%d", precision
);
633 else if (size_tflag
) {
634 char *f
= PY_FORMAT_SIZE_T
;
/* Append the NUL-terminated string `string` to the output, one character at
   a time, advancing the write pointer past what was written.  The
   terminating NUL is not copied.
   Relies on two variables being in scope at the point of use: `copy`
   (scratch read pointer) and `s` (output write pointer).
   Wrapped in do { } while (0) so that `appendstring(x);` is exactly one
   statement and remains correct inside an unbraced if/else; the argument is
   parenthesized so any expression can be passed. */
#define appendstring(string)                    \
    do {                                        \
        for (copy = (string); *copy; )          \
            *s++ = *copy++;                     \
    } while (0)
645 PyUnicode_FromFormatV(const char *format
, va_list vargs
)
648 Py_ssize_t callcount
= 0;
649 PyObject
**callresults
= NULL
;
650 PyObject
**callresult
= NULL
;
658 /* used by sprintf */
660 /* use abuffer instead of buffer, if we need more space
661 * (which can happen if there's a format specifier with width). */
662 char *abuffer
= NULL
;
664 Py_ssize_t abuffersize
= 0;
665 char fmt
[60]; /* should be enough for %0width.precisionld */
668 #ifdef VA_LIST_IS_ARRAY
669 Py_MEMCPY(count
, vargs
, sizeof(va_list));
672 __va_copy(count
, vargs
);
677 /* step 1: count the number of %S/%R/%s format specifications
678 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
679 * objects once during step 3 and put the result in an array) */
680 for (f
= format
; *f
; f
++) {
684 if (*(f
+1)=='S' || *(f
+1)=='R')
686 while (isdigit((unsigned)*f
))
687 width
= (width
*10) + *f
++ - '0';
688 while (*++f
&& *f
!= '%' && !isalpha((unsigned)*f
))
694 /* step 2: allocate memory for the results of
695 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
697 callresults
= PyObject_Malloc(sizeof(PyObject
*)*callcount
);
702 callresult
= callresults
;
704 /* step 3: figure out how large a buffer we need */
705 for (f
= format
; *f
; f
++) {
709 while (isdigit((unsigned)*f
))
710 width
= (width
*10) + *f
++ - '0';
711 while (*++f
&& *f
!= '%' && !isalpha((unsigned)*f
))
714 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
715 * they don't affect the amount of space we reserve.
717 if ((*f
== 'l' || *f
== 'z') &&
718 (f
[1] == 'd' || f
[1] == 'u'))
723 (void)va_arg(count
, int);
724 /* fall through... */
728 case 'd': case 'u': case 'i': case 'x':
729 (void) va_arg(count
, int);
730 /* 20 bytes is enough to hold a 64-bit
731 integer. Decimal takes the most space.
732 This isn't enough for octal.
733 If a width is specified we need more
734 (which we allocate later). */
738 if (abuffersize
< width
)
744 const char *s
= va_arg(count
, const char*);
745 PyObject
*str
= PyUnicode_DecodeUTF8(s
, strlen(s
), "replace");
748 n
+= PyUnicode_GET_SIZE(str
);
749 /* Remember the str and switch to the next slot */
755 PyObject
*obj
= va_arg(count
, PyObject
*);
756 assert(obj
&& PyUnicode_Check(obj
));
757 n
+= PyUnicode_GET_SIZE(obj
);
762 PyObject
*obj
= va_arg(count
, PyObject
*);
763 const char *str
= va_arg(count
, const char *);
765 assert(!obj
|| PyUnicode_Check(obj
));
767 n
+= PyUnicode_GET_SIZE(obj
);
774 PyObject
*obj
= va_arg(count
, PyObject
*);
777 str
= PyObject_Str(obj
);
780 n
+= PyUnicode_GET_SIZE(str
);
781 /* Remember the str and switch to the next slot */
787 PyObject
*obj
= va_arg(count
, PyObject
*);
790 repr
= PyObject_Repr(obj
);
793 n
+= PyUnicode_GET_SIZE(repr
);
794 /* Remember the repr and switch to the next slot */
795 *callresult
++ = repr
;
799 (void) va_arg(count
, int);
800 /* maximum 64-bit pointer representation:
802 * so 19 characters is enough.
803 * XXX I count 18 -- what's the extra for?
808 /* if we stumble upon an unknown
809 formatting code, copy the rest of
810 the format string to the output
811 string. (we cannot just skip the
812 code, since there's no way to know
813 what's in the argument list) */
821 if (abuffersize
> 20) {
822 abuffer
= PyObject_Malloc(abuffersize
);
827 realbuffer
= abuffer
;
831 /* step 4: fill the buffer */
832 /* Since we've analyzed how much space we need for the worst case,
833 we don't have to resize the string.
834 There can be no errors beyond this point. */
835 string
= PyUnicode_FromUnicode(NULL
, n
);
839 s
= PyUnicode_AS_UNICODE(string
);
840 callresult
= callresults
;
842 for (f
= format
; *f
; f
++) {
847 zeropad
= (*f
== '0');
848 /* parse the width.precision part */
850 while (isdigit((unsigned)*f
))
851 width
= (width
*10) + *f
++ - '0';
855 while (isdigit((unsigned)*f
))
856 precision
= (precision
*10) + *f
++ - '0';
858 /* handle the long flag, but only for %ld and %lu.
859 others can be added when necessary. */
860 if (*f
== 'l' && (f
[1] == 'd' || f
[1] == 'u')) {
864 /* handle the size_t flag. */
865 if (*f
== 'z' && (f
[1] == 'd' || f
[1] == 'u')) {
872 *s
++ = va_arg(vargs
, int);
875 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'd');
877 sprintf(realbuffer
, fmt
, va_arg(vargs
, long));
879 sprintf(realbuffer
, fmt
, va_arg(vargs
, Py_ssize_t
));
881 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
882 appendstring(realbuffer
);
885 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'u');
887 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned long));
889 sprintf(realbuffer
, fmt
, va_arg(vargs
, size_t));
891 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned int));
892 appendstring(realbuffer
);
895 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'i');
896 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
897 appendstring(realbuffer
);
900 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'x');
901 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
902 appendstring(realbuffer
);
906 /* unused, since we already have the result */
907 (void) va_arg(vargs
, char *);
908 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(*callresult
),
909 PyUnicode_GET_SIZE(*callresult
));
910 s
+= PyUnicode_GET_SIZE(*callresult
);
911 /* We're done with the unicode()/repr() => forget it */
912 Py_DECREF(*callresult
);
913 /* switch to next unicode()/repr() result */
919 PyObject
*obj
= va_arg(vargs
, PyObject
*);
920 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
921 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
927 PyObject
*obj
= va_arg(vargs
, PyObject
*);
928 const char *str
= va_arg(vargs
, const char *);
930 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
931 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
944 /* unused, since we already have the result */
945 (void) va_arg(vargs
, PyObject
*);
946 ucopy
= PyUnicode_AS_UNICODE(*callresult
);
947 usize
= PyUnicode_GET_SIZE(*callresult
);
948 for (upos
= 0; upos
<usize
;)
949 *s
++ = ucopy
[upos
++];
950 /* We're done with the unicode()/repr() => forget it */
951 Py_DECREF(*callresult
);
952 /* switch to next unicode()/repr() result */
957 sprintf(buffer
, "%p", va_arg(vargs
, void*));
958 /* %p is ill-defined: ensure leading 0x. */
959 if (buffer
[1] == 'X')
961 else if (buffer
[1] != 'x') {
962 memmove(buffer
+2, buffer
, strlen(buffer
)+1);
966 appendstring(buffer
);
981 PyObject_Free(callresults
);
983 PyObject_Free(abuffer
);
984 PyUnicode_Resize(&string
, s
- PyUnicode_AS_UNICODE(string
));
988 PyObject
**callresult2
= callresults
;
989 while (callresult2
< callresult
) {
990 Py_DECREF(*callresult2
);
993 PyObject_Free(callresults
);
996 PyObject_Free(abuffer
);
1003 PyUnicode_FromFormat(const char *format
, ...)
1008 #ifdef HAVE_STDARG_PROTOTYPES
1009 va_start(vargs
, format
);
1013 ret
= PyUnicode_FromFormatV(format
, vargs
);
1018 Py_ssize_t
PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
1022 if (unicode
== NULL
) {
1023 PyErr_BadInternalCall();
1027 /* If possible, try to copy the 0-termination as well */
1028 if (size
> PyUnicode_GET_SIZE(unicode
))
1029 size
= PyUnicode_GET_SIZE(unicode
) + 1;
1031 #ifdef HAVE_USABLE_WCHAR_T
1032 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
1035 register Py_UNICODE
*u
;
1036 register Py_ssize_t i
;
1037 u
= PyUnicode_AS_UNICODE(unicode
);
1038 for (i
= size
; i
> 0; i
--)
1043 if (size
> PyUnicode_GET_SIZE(unicode
))
1044 return PyUnicode_GET_SIZE(unicode
);
1051 PyObject
*PyUnicode_FromOrdinal(int ordinal
)
1055 #ifdef Py_UNICODE_WIDE
1056 if (ordinal
< 0 || ordinal
> 0x10ffff) {
1057 PyErr_SetString(PyExc_ValueError
,
1058 "unichr() arg not in range(0x110000) "
1059 "(wide Python build)");
1063 if (ordinal
< 0 || ordinal
> 0xffff) {
1064 PyErr_SetString(PyExc_ValueError
,
1065 "unichr() arg not in range(0x10000) "
1066 "(narrow Python build)");
1071 s
[0] = (Py_UNICODE
)ordinal
;
1072 return PyUnicode_FromUnicode(s
, 1);
1075 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
1077 /* XXX Perhaps we should make this API an alias of
1078 PyObject_Unicode() instead ?! */
1079 if (PyUnicode_CheckExact(obj
)) {
1083 if (PyUnicode_Check(obj
)) {
1084 /* For a Unicode subtype that's not a Unicode object,
1085 return a true Unicode object with the same data. */
1086 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
1087 PyUnicode_GET_SIZE(obj
));
1089 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
1092 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
1093 const char *encoding
,
1096 const char *s
= NULL
;
1101 PyErr_BadInternalCall();
1106 /* For b/w compatibility we also accept Unicode objects provided
1107 that no encoding is given and then redirect to
1108 PyObject_Unicode() which then applies the additional logic for
1111 NOTE: This API should really only be used for object which
1112 represent *encoded* Unicode !
1115 if (PyUnicode_Check(obj
)) {
1117 PyErr_SetString(PyExc_TypeError
,
1118 "decoding Unicode is not supported");
1121 return PyObject_Unicode(obj
);
1124 if (PyUnicode_Check(obj
)) {
1125 PyErr_SetString(PyExc_TypeError
,
1126 "decoding Unicode is not supported");
1132 if (PyString_Check(obj
)) {
1133 s
= PyString_AS_STRING(obj
);
1134 len
= PyString_GET_SIZE(obj
);
1136 else if (PyByteArray_Check(obj
)) {
1137 /* Python 2.x specific */
1138 PyErr_Format(PyExc_TypeError
,
1139 "decoding bytearray is not supported");
1142 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
1143 /* Overwrite the error message with something more useful in
1144 case of a TypeError. */
1145 if (PyErr_ExceptionMatches(PyExc_TypeError
))
1146 PyErr_Format(PyExc_TypeError
,
1147 "coercing to Unicode: need string or buffer, "
1149 Py_TYPE(obj
)->tp_name
);
1153 /* Convert to Unicode */
1155 Py_INCREF(unicode_empty
);
1156 v
= (PyObject
*)unicode_empty
;
1159 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
1167 PyObject
*PyUnicode_Decode(const char *s
,
1169 const char *encoding
,
1172 PyObject
*buffer
= NULL
, *unicode
;
1174 if (encoding
== NULL
)
1175 encoding
= PyUnicode_GetDefaultEncoding();
1177 /* Shortcuts for common default encodings */
1178 if (strcmp(encoding
, "utf-8") == 0)
1179 return PyUnicode_DecodeUTF8(s
, size
, errors
);
1180 else if (strcmp(encoding
, "latin-1") == 0)
1181 return PyUnicode_DecodeLatin1(s
, size
, errors
);
1182 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1183 else if (strcmp(encoding
, "mbcs") == 0)
1184 return PyUnicode_DecodeMBCS(s
, size
, errors
);
1186 else if (strcmp(encoding
, "ascii") == 0)
1187 return PyUnicode_DecodeASCII(s
, size
, errors
);
1189 /* Decode via the codec registry */
1190 buffer
= PyBuffer_FromMemory((void *)s
, size
);
1193 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
1194 if (unicode
== NULL
)
1196 if (!PyUnicode_Check(unicode
)) {
1197 PyErr_Format(PyExc_TypeError
,
1198 "decoder did not return an unicode object (type=%.400s)",
1199 Py_TYPE(unicode
)->tp_name
);
1211 PyObject
*PyUnicode_AsDecodedObject(PyObject
*unicode
,
1212 const char *encoding
,
1217 if (!PyUnicode_Check(unicode
)) {
1218 PyErr_BadArgument();
1222 if (encoding
== NULL
)
1223 encoding
= PyUnicode_GetDefaultEncoding();
1225 /* Decode via the codec registry */
1226 v
= PyCodec_Decode(unicode
, encoding
, errors
);
1235 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
1237 const char *encoding
,
1240 PyObject
*v
, *unicode
;
1242 unicode
= PyUnicode_FromUnicode(s
, size
);
1243 if (unicode
== NULL
)
1245 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
1250 PyObject
*PyUnicode_AsEncodedObject(PyObject
*unicode
,
1251 const char *encoding
,
1256 if (!PyUnicode_Check(unicode
)) {
1257 PyErr_BadArgument();
1261 if (encoding
== NULL
)
1262 encoding
= PyUnicode_GetDefaultEncoding();
1264 /* Encode via the codec registry */
1265 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1274 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
1275 const char *encoding
,
1280 if (!PyUnicode_Check(unicode
)) {
1281 PyErr_BadArgument();
1285 if (encoding
== NULL
)
1286 encoding
= PyUnicode_GetDefaultEncoding();
1288 /* Shortcuts for common default encodings */
1289 if (errors
== NULL
) {
1290 if (strcmp(encoding
, "utf-8") == 0)
1291 return PyUnicode_AsUTF8String(unicode
);
1292 else if (strcmp(encoding
, "latin-1") == 0)
1293 return PyUnicode_AsLatin1String(unicode
);
1294 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1295 else if (strcmp(encoding
, "mbcs") == 0)
1296 return PyUnicode_AsMBCSString(unicode
);
1298 else if (strcmp(encoding
, "ascii") == 0)
1299 return PyUnicode_AsASCIIString(unicode
);
1302 /* Encode via the codec registry */
1303 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1306 if (!PyString_Check(v
)) {
1307 PyErr_Format(PyExc_TypeError
,
1308 "encoder did not return a string object (type=%.400s)",
1309 Py_TYPE(v
)->tp_name
);
1319 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
1322 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
1326 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
1327 if (v
&& errors
== NULL
)
1328 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
1332 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
1334 if (!PyUnicode_Check(unicode
)) {
1335 PyErr_BadArgument();
1338 return PyUnicode_AS_UNICODE(unicode
);
1344 Py_ssize_t
PyUnicode_GetSize(PyObject
*unicode
)
1346 if (!PyUnicode_Check(unicode
)) {
1347 PyErr_BadArgument();
1350 return PyUnicode_GET_SIZE(unicode
);
1356 const char *PyUnicode_GetDefaultEncoding(void)
1358 return unicode_default_encoding
;
1361 int PyUnicode_SetDefaultEncoding(const char *encoding
)
1365 /* Make sure the encoding is valid. As side effect, this also
1366 loads the encoding into the codec registry cache. */
1367 v
= _PyCodec_Lookup(encoding
);
1371 strncpy(unicode_default_encoding
,
1373 sizeof(unicode_default_encoding
));
1380 /* error handling callback helper:
1381 build arguments, call the callback and check the arguments,
1382 if no exception occurred, copy the replacement to the output
1383 and adjust various state variables.
1384 return 0 on success, -1 on error
1388 int unicode_decode_call_errorhandler(const char *errors
, PyObject
**errorHandler
,
1389 const char *encoding
, const char *reason
,
1390 const char *input
, Py_ssize_t insize
, Py_ssize_t
*startinpos
,
1391 Py_ssize_t
*endinpos
, PyObject
**exceptionObject
, const char **inptr
,
1392 PyUnicodeObject
**output
, Py_ssize_t
*outpos
, Py_UNICODE
**outptr
)
1394 static char *argparse
= "O!n;decoding error handler must return (unicode, int) tuple";
1396 PyObject
*restuple
= NULL
;
1397 PyObject
*repunicode
= NULL
;
1398 Py_ssize_t outsize
= PyUnicode_GET_SIZE(*output
);
1399 Py_ssize_t requiredsize
;
1405 if (*errorHandler
== NULL
) {
1406 *errorHandler
= PyCodec_LookupError(errors
);
1407 if (*errorHandler
== NULL
)
1411 if (*exceptionObject
== NULL
) {
1412 *exceptionObject
= PyUnicodeDecodeError_Create(
1413 encoding
, input
, insize
, *startinpos
, *endinpos
, reason
);
1414 if (*exceptionObject
== NULL
)
1418 if (PyUnicodeDecodeError_SetStart(*exceptionObject
, *startinpos
))
1420 if (PyUnicodeDecodeError_SetEnd(*exceptionObject
, *endinpos
))
1422 if (PyUnicodeDecodeError_SetReason(*exceptionObject
, reason
))
1426 restuple
= PyObject_CallFunctionObjArgs(*errorHandler
, *exceptionObject
, NULL
);
1427 if (restuple
== NULL
)
1429 if (!PyTuple_Check(restuple
)) {
1430 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
1433 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
, &repunicode
, &newpos
))
1436 newpos
= insize
+newpos
;
1437 if (newpos
<0 || newpos
>insize
) {
1438 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", newpos
);
1442 /* need more space? (at least enough for what we
1443 have+the replacement+the rest of the string (starting
1444 at the new input position), so we won't have to check space
1445 when there are no errors in the rest of the string) */
1446 repptr
= PyUnicode_AS_UNICODE(repunicode
);
1447 repsize
= PyUnicode_GET_SIZE(repunicode
);
1448 requiredsize
= *outpos
+ repsize
+ insize
-newpos
;
1449 if (requiredsize
> outsize
) {
1450 if (requiredsize
<2*outsize
)
1451 requiredsize
= 2*outsize
;
1452 if (_PyUnicode_Resize(output
, requiredsize
) < 0)
1454 *outptr
= PyUnicode_AS_UNICODE(*output
) + *outpos
;
1457 *inptr
= input
+ newpos
;
1458 Py_UNICODE_COPY(*outptr
, repptr
, repsize
);
1465 Py_XDECREF(restuple
);
1469 /* --- UTF-7 Codec -------------------------------------------------------- */
1471 /* See RFC2152 for details. We encode conservatively and decode liberally. */
1473 /* Three simple macros defining base-64. */
1475 /* Is c a base-64 character? */
/* Implemented with explicit ASCII range checks instead of isalnum():
   isalnum() is only defined for arguments representable as unsigned char
   (or EOF) and its answer depends on the current locale, so it could
   misbehave on wider values or accept letters that are not part of the
   base-64 alphabet.  These comparisons are safe for any integer argument.
   `c` may be evaluated more than once. */
#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')
1480 /* given that c is a base-64 character, what is its base-64 value? */
/* Maps '+' to 62, digits to 52..61, lowercase letters to 26..51 and
   uppercase letters to 0..25; every other argument falls through to 63,
   the value of '/'.  `c` may be evaluated more than once. */
#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)
1488 /* What is the base-64 character of the bottom 6 bits of n? */
/* Indexes the 64-character base-64 alphabet with the low 6 bits of n;
   any higher bits of n are masked off. */
#define TO_BASE64(n)                            \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"              \
      "abcdefghijklmnopqrstuvwxyz"              \
      "0123456789+/")[(n) & 0x3f])
1493 /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
1494 * decoded as itself. We are permissive on decoding; the only ASCII
1495 * byte not decoding to itself is the + which begins a base64
1496 * section. */
/* Nonzero for any value up to 127 other than '+'. */

#define DECODE_DIRECT(c) \
    (!((c) > 127 || (c) == '+'))
1501 /* The UTF-7 encoder treats ASCII characters differently according to
1502 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1503 * the above). See RFC2152. This array identifies these different
1504 * sets:
1505 * 0 : "Set D"
1506 * alphanumeric and '(),-./:?
1507 * 1 : "Set O"
1508 * !"#$%&*;<=>@[]^_`{|}
1509 * 2 : "whitespace"
1510 * ht nl cr sp
1511 * 3 : special (must be base64 encoded)
1512 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1513 */
1516 char utf7_category
[128] = {
1517 /* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
1518 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
1519 /* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
1520 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1521 /* sp ! " # $ % & ' ( ) * + , - . / */
1522 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
1523 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
1524 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
1525 /* @ A B C D E F G H I J K L M N O */
1526 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1527 /* P Q R S T U V W X Y Z [ \ ] ^ _ */
1528 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
1529 /* ` a b c d e f g h i j k l m n o */
1530 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1531 /* p q r s t u v w x y z { | } ~ del */
1532 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
/* ENCODE_DIRECT: true when character c should be emitted as itself
 * instead of being base64-encoded.  Only non-NUL ASCII (0 < c < 128)
 * can be direct: category 0 (Set D) always is, category 2 (whitespace)
 * only when directWS is true, and category 1 (Set O) only when directO
 * is true.  RFC2152 makes it clear that the Set O / whitespace policy
 * varies between applications, hence the two flags.
 *
 * directO and directWS are parenthesized in the expansion so that a
 * compound flag expression (e.g. `a || b`) keeps its meaning instead of
 * being re-associated with the surrounding && / || operators. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
/* One-shot UTF-7 decode: forwards s, size and errors unchanged to
   PyUnicode_DecodeUTF7Stateful and passes NULL as the final (consumed)
   argument, so the stateful decoder treats the input as complete. */
1547 PyObject
*PyUnicode_DecodeUTF7(const char *s
,
1551 return PyUnicode_DecodeUTF7Stateful(s
, size
, errors
, NULL
);
1554 /* The decoder. The only state we preserve is our read position,
1555 * i.e. how many characters we have consumed. So if we end in the
1556 * middle of a shift sequence we have to back off the read position
1557 * and the output to the beginning of the sequence, otherwise we lose
1558 * all the shift state (seen bits, number of bits seen, high
1559 * surrogate). */
1561 PyObject
*PyUnicode_DecodeUTF7Stateful(const char *s
,
1564 Py_ssize_t
*consumed
)
1566 const char *starts
= s
;
1567 Py_ssize_t startinpos
;
1568 Py_ssize_t endinpos
;
1571 PyUnicodeObject
*unicode
;
1573 const char *errmsg
= "";
1575 Py_UNICODE
*shiftOutStart
;
1576 unsigned int base64bits
= 0;
1577 unsigned long base64buffer
= 0;
1578 Py_UNICODE surrogate
= 0;
1579 PyObject
*errorHandler
= NULL
;
1580 PyObject
*exc
= NULL
;
1582 unicode
= _PyUnicode_New(size
);
1588 return (PyObject
*)unicode
;
1596 Py_UNICODE ch
= (unsigned char) *s
;
1598 if (inShift
) { /* in a base-64 section */
1599 if (IS_BASE64(ch
)) { /* consume a base-64 character */
1600 base64buffer
= (base64buffer
<< 6) | FROM_BASE64(ch
);
1603 if (base64bits
>= 16) {
1604 /* we have enough bits for a UTF-16 value */
1605 Py_UNICODE outCh
= (Py_UNICODE
)
1606 (base64buffer
>> (base64bits
-16));
1608 base64buffer
&= (1 << base64bits
) - 1; /* clear high bits */
1610 /* expecting a second surrogate */
1611 if (outCh
>= 0xDC00 && outCh
<= 0xDFFF) {
1612 #ifdef Py_UNICODE_WIDE
1613 *p
++ = (((surrogate
& 0x3FF)<<10)
1614 | (outCh
& 0x3FF)) + 0x10000;
1623 errmsg
= "second surrogate missing";
1627 else if (outCh
>= 0xD800 && outCh
<= 0xDBFF) {
1628 /* first surrogate */
1631 else if (outCh
>= 0xDC00 && outCh
<= 0xDFFF) {
1632 errmsg
= "unexpected second surrogate";
1640 else { /* now leaving a base-64 section */
1644 errmsg
= "second surrogate missing at end of shift sequence";
1647 if (base64bits
> 0) { /* left-over bits */
1648 if (base64bits
>= 6) {
1649 /* We've seen at least one base-64 character */
1650 errmsg
= "partial character in shift sequence";
1654 /* Some bits remain; they should be zero */
1655 if (base64buffer
!= 0) {
1656 errmsg
= "non-zero padding bits in shift sequence";
1662 /* '-' is absorbed; other terminating
1663 characters are preserved */
1668 else if ( ch
== '+' ) {
1669 startinpos
= s
-starts
;
1670 s
++; /* consume '+' */
1671 if (s
< e
&& *s
== '-') { /* '+-' encodes '+' */
1675 else { /* begin base64-encoded section */
1681 else if (DECODE_DIRECT(ch
)) { /* character decodes as itself */
1686 startinpos
= s
-starts
;
1688 errmsg
= "unexpected special character";
1693 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1694 endinpos
= s
-starts
;
1695 if (unicode_decode_call_errorhandler(
1696 errors
, &errorHandler
,
1698 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1699 &unicode
, &outpos
, &p
))
1705 if (inShift
&& !consumed
) { /* in shift sequence, no more to follow */
1706 /* if we're in an inconsistent state, that's an error */
1708 (base64bits
>= 6) ||
1709 (base64bits
> 0 && base64buffer
!= 0)) {
1710 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1712 if (unicode_decode_call_errorhandler(
1713 errors
, &errorHandler
,
1714 "utf7", "unterminated shift sequence",
1715 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1716 &unicode
, &outpos
, &p
))
1724 p
= shiftOutStart
; /* back off output */
1725 *consumed
= startinpos
;
1728 *consumed
= s
-starts
;
1732 if (_PyUnicode_Resize(&unicode
, p
- PyUnicode_AS_UNICODE(unicode
)) < 0)
1735 Py_XDECREF(errorHandler
);
1737 return (PyObject
*)unicode
;
1740 Py_XDECREF(errorHandler
);
1747 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
1750 int base64WhiteSpace
,
1754 /* It might be possible to tighten this worst case */
1755 Py_ssize_t allocated
= 5 * size
;
1758 unsigned int base64bits
= 0;
1759 unsigned long base64buffer
= 0;
1763 if (allocated
/ 5 != size
)
1764 return PyErr_NoMemory();
1767 return PyString_FromStringAndSize(NULL
, 0);
1769 v
= PyString_FromStringAndSize(NULL
, allocated
);
1773 start
= out
= PyString_AS_STRING(v
);
1774 for (;i
< size
; ++i
) {
1775 Py_UNICODE ch
= s
[i
];
1778 if (ENCODE_DIRECT(ch
, !base64SetO
, !base64WhiteSpace
)) {
1780 if (base64bits
) { /* output remaining bits */
1781 *out
++ = TO_BASE64(base64buffer
<< (6-base64bits
));
1786 /* Characters not in the BASE64 set implicitly unshift the sequence
1787 so no '-' is required, except if the character is itself a '-' */
1788 if (IS_BASE64(ch
) || ch
== '-') {
1797 else { /* not in a shift sequence */
1802 else if (ENCODE_DIRECT(ch
, !base64SetO
, !base64WhiteSpace
)) {
1813 #ifdef Py_UNICODE_WIDE
1814 if (ch
>= 0x10000) {
1815 /* code first surrogate */
1817 base64buffer
= (base64buffer
<< 16) | 0xd800 | ((ch
-0x10000) >> 10);
1818 while (base64bits
>= 6) {
1819 *out
++ = TO_BASE64(base64buffer
>> (base64bits
-6));
1822 /* prepare second surrogate */
1823 ch
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
1827 base64buffer
= (base64buffer
<< 16) | ch
;
1828 while (base64bits
>= 6) {
1829 *out
++ = TO_BASE64(base64buffer
>> (base64bits
-6));
1834 *out
++= TO_BASE64(base64buffer
<< (6-base64bits
) );
1838 _PyString_Resize(&v
, out
- start
);
1845 #undef DECODE_DIRECT
1846 #undef ENCODE_DIRECT
1848 /* --- UTF-8 Codec -------------------------------------------------------- */
1851 char utf8_code_length
[256] = {
1852 /* Map UTF-8 encoded prefix byte to sequence length. zero means
1853 illegal prefix. see RFC 2279 for details */
1854 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1855 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1856 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1857 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1858 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1859 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1860 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1861 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1862 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1863 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1864 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1865 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1866 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1867 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1868 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1869 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
/* One-shot UTF-8 decode: forwards s, size and errors unchanged to
   PyUnicode_DecodeUTF8Stateful with NULL as the final (consumed)
   argument. */
1872 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1876 return PyUnicode_DecodeUTF8Stateful(s
, size
, errors
, NULL
);
1879 PyObject
*PyUnicode_DecodeUTF8Stateful(const char *s
,
1882 Py_ssize_t
*consumed
)
1884 const char *starts
= s
;
1886 Py_ssize_t startinpos
;
1887 Py_ssize_t endinpos
;
1890 PyUnicodeObject
*unicode
;
1892 const char *errmsg
= "";
1893 PyObject
*errorHandler
= NULL
;
1894 PyObject
*exc
= NULL
;
1896 /* Note: size will always be longer than the resulting Unicode
1898 unicode
= _PyUnicode_New(size
);
1904 return (PyObject
*)unicode
;
1907 /* Unpack UTF-8 encoded data */
1912 Py_UCS4 ch
= (unsigned char)*s
;
1915 *p
++ = (Py_UNICODE
)ch
;
1920 n
= utf8_code_length
[ch
];
1926 errmsg
= "unexpected end of data";
1927 startinpos
= s
-starts
;
1936 errmsg
= "unexpected code byte";
1937 startinpos
= s
-starts
;
1938 endinpos
= startinpos
+1;
1942 errmsg
= "internal error";
1943 startinpos
= s
-starts
;
1944 endinpos
= startinpos
+1;
1948 if ((s
[1] & 0xc0) != 0x80) {
1949 errmsg
= "invalid data";
1950 startinpos
= s
-starts
;
1951 endinpos
= startinpos
+2;
1954 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
1956 startinpos
= s
-starts
;
1957 endinpos
= startinpos
+2;
1958 errmsg
= "illegal encoding";
1962 *p
++ = (Py_UNICODE
)ch
;
1966 if ((s
[1] & 0xc0) != 0x80 ||
1967 (s
[2] & 0xc0) != 0x80) {
1968 errmsg
= "invalid data";
1969 startinpos
= s
-starts
;
1970 endinpos
= startinpos
+3;
1973 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
1975 /* Note: UTF-8 encodings of surrogates are considered
1976 legal UTF-8 sequences;
1978 XXX For wide builds (UCS-4) we should probably try
1979 to recombine the surrogates into a single code
1982 errmsg
= "illegal encoding";
1983 startinpos
= s
-starts
;
1984 endinpos
= startinpos
+3;
1988 *p
++ = (Py_UNICODE
)ch
;
1992 if ((s
[1] & 0xc0) != 0x80 ||
1993 (s
[2] & 0xc0) != 0x80 ||
1994 (s
[3] & 0xc0) != 0x80) {
1995 errmsg
= "invalid data";
1996 startinpos
= s
-starts
;
1997 endinpos
= startinpos
+4;
2000 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
2001 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
2002 /* validate and convert to UTF-16 */
2003 if ((ch
< 0x10000) /* minimum value allowed for 4
2005 || (ch
> 0x10ffff)) /* maximum value allowed for
2008 errmsg
= "illegal encoding";
2009 startinpos
= s
-starts
;
2010 endinpos
= startinpos
+4;
2013 #ifdef Py_UNICODE_WIDE
2014 *p
++ = (Py_UNICODE
)ch
;
2016 /* compute and append the two surrogates: */
2018 /* translate from 10000..10FFFF to 0..FFFF */
2021 /* high surrogate = top 10 bits added to D800 */
2022 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
2024 /* low surrogate = bottom 10 bits added to DC00 */
2025 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
2030 /* Other sizes are only needed for UCS-4 */
2031 errmsg
= "unsupported Unicode code range";
2032 startinpos
= s
-starts
;
2033 endinpos
= startinpos
+n
;
2040 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2041 if (unicode_decode_call_errorhandler(
2042 errors
, &errorHandler
,
2044 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2045 &unicode
, &outpos
, &p
))
2049 *consumed
= s
-starts
;
2052 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2055 Py_XDECREF(errorHandler
);
2057 return (PyObject
*)unicode
;
2060 Py_XDECREF(errorHandler
);
2066 /* Allocation strategy: if the string is short, convert into a stack buffer
2067 and allocate exactly as much space needed at the end. Else allocate the
2068 maximum possible needed (4 result bytes per Unicode character), and return
2069 the excess memory at the end.
2072 PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
2076 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2078 Py_ssize_t i
; /* index into s of next input byte */
2079 PyObject
*v
; /* result string object */
2080 char *p
; /* next free byte in output buffer */
2081 Py_ssize_t nallocated
; /* number of result bytes allocated */
2082 Py_ssize_t nneeded
; /* number of result bytes needed */
2083 char stackbuf
[MAX_SHORT_UNICHARS
* 4];
2088 if (size
<= MAX_SHORT_UNICHARS
) {
2089 /* Write into the stack buffer; nallocated can't overflow.
2090 * At the end, we'll allocate exactly as much heap space as it
2091 * turns out we need.
2093 nallocated
= Py_SAFE_DOWNCAST(sizeof(stackbuf
), size_t, int);
2094 v
= NULL
; /* will allocate after we're done */
2098 /* Overallocate on the heap, and give the excess back at the end. */
2099 nallocated
= size
* 4;
2100 if (nallocated
/ 4 != size
) /* overflow! */
2101 return PyErr_NoMemory();
2102 v
= PyString_FromStringAndSize(NULL
, nallocated
);
2105 p
= PyString_AS_STRING(v
);
2108 for (i
= 0; i
< size
;) {
2109 Py_UCS4 ch
= s
[i
++];
2115 else if (ch
< 0x0800) {
2116 /* Encode Latin-1 */
2117 *p
++ = (char)(0xc0 | (ch
>> 6));
2118 *p
++ = (char)(0x80 | (ch
& 0x3f));
2121 /* Encode UCS2 Unicode ordinals */
2123 /* Special case: check for high surrogate */
2124 if (0xD800 <= ch
&& ch
<= 0xDBFF && i
!= size
) {
2126 /* Check for low surrogate and combine the two to
2127 form a UCS4 value */
2128 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2129 ch
= ((ch
- 0xD800) << 10 | (ch2
- 0xDC00)) + 0x10000;
2133 /* Fall through: handles isolated high surrogates */
2135 *p
++ = (char)(0xe0 | (ch
>> 12));
2136 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2137 *p
++ = (char)(0x80 | (ch
& 0x3f));
2141 /* Encode UCS4 Unicode ordinals */
2142 *p
++ = (char)(0xf0 | (ch
>> 18));
2143 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
2144 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2145 *p
++ = (char)(0x80 | (ch
& 0x3f));
2150 /* This was stack allocated. */
2151 nneeded
= p
- stackbuf
;
2152 assert(nneeded
<= nallocated
);
2153 v
= PyString_FromStringAndSize(stackbuf
, nneeded
);
2156 /* Cut back to size actually needed. */
2157 nneeded
= p
- PyString_AS_STRING(v
);
2158 assert(nneeded
<= nallocated
);
2159 _PyString_Resize(&v
, nneeded
);
2163 #undef MAX_SHORT_UNICHARS
/* Encode a unicode object as UTF-8.  If `unicode` does not pass
   PyUnicode_Check, PyErr_BadArgument() is called; otherwise the object's
   Py_UNICODE buffer and length are handed to PyUnicode_EncodeUTF8.
   NOTE(review): the error-path return and the trailing call argument(s)
   are not visible in this extract - confirm against the full file. */
2166 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
2168 if (!PyUnicode_Check(unicode
)) {
2169 PyErr_BadArgument();
2172 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
2173 PyUnicode_GET_SIZE(unicode
),
2177 /* --- UTF-32 Codec ------------------------------------------------------- */
/* One-shot UTF-32 decode: forwards s, size, errors and byteorder
   unchanged to PyUnicode_DecodeUTF32Stateful with NULL as the final
   (consumed) argument. */
2180 PyUnicode_DecodeUTF32(const char *s
,
2185 return PyUnicode_DecodeUTF32Stateful(s
, size
, errors
, byteorder
, NULL
);
2189 PyUnicode_DecodeUTF32Stateful(const char *s
,
2193 Py_ssize_t
*consumed
)
2195 const char *starts
= s
;
2196 Py_ssize_t startinpos
;
2197 Py_ssize_t endinpos
;
2199 PyUnicodeObject
*unicode
;
2201 #ifndef Py_UNICODE_WIDE
2204 const int pairs
= 0;
2206 const unsigned char *q
, *e
;
2207 int bo
= 0; /* assume native ordering by default */
2208 const char *errmsg
= "";
2209 /* Offsets from q for retrieving bytes in the right order. */
2210 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2211 int iorder
[] = {0, 1, 2, 3};
2213 int iorder
[] = {3, 2, 1, 0};
2215 PyObject
*errorHandler
= NULL
;
2216 PyObject
*exc
= NULL
;
2217 /* On narrow builds we split characters outside the BMP into two
2218 codepoints => count how much extra space we need. */
2219 #ifndef Py_UNICODE_WIDE
2220 for (i
= pairs
= 0; i
< size
/4; i
++)
2221 if (((Py_UCS4
*)s
)[i
] >= 0x10000)
2225 /* This might be one too many, because of a BOM */
2226 unicode
= _PyUnicode_New((size
+3)/4+pairs
);
2230 return (PyObject
*)unicode
;
2232 /* Unpack UTF-32 encoded data */
2234 q
= (unsigned char *)s
;
2240 /* Check for BOM marks (U+FEFF) in the input and adjust current
2241 byte order setting accordingly. In native mode, the leading BOM
2242 mark is skipped, in all other modes, it is copied to the output
2243 stream as-is (giving a ZWNBSP character). */
2246 const Py_UCS4 bom
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2247 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2248 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2249 if (bom
== 0x0000FEFF) {
2253 else if (bom
== 0xFFFE0000) {
2258 if (bom
== 0x0000FEFF) {
2262 else if (bom
== 0xFFFE0000) {
2287 /* remaining bytes at the end? (size should be divisible by 4) */
2291 errmsg
= "truncated data";
2292 startinpos
= ((const char *)q
)-starts
;
2293 endinpos
= ((const char *)e
)-starts
;
2295 /* The remaining input chars are ignored if the callback
2296 chooses to skip the input */
2298 ch
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2299 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2303 errmsg
= "codepoint not in range(0x110000)";
2304 startinpos
= ((const char *)q
)-starts
;
2305 endinpos
= startinpos
+4;
2308 #ifndef Py_UNICODE_WIDE
2311 *p
++ = 0xD800 | ((ch
-0x10000) >> 10);
2312 *p
++ = 0xDC00 | ((ch
-0x10000) & 0x3FF);
2320 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2321 if (unicode_decode_call_errorhandler(
2322 errors
, &errorHandler
,
2324 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2325 &unicode
, &outpos
, &p
))
2333 *consumed
= (const char *)q
-starts
;
2336 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2339 Py_XDECREF(errorHandler
);
2341 return (PyObject
*)unicode
;
2345 Py_XDECREF(errorHandler
);
2351 PyUnicode_EncodeUTF32(const Py_UNICODE
*s
,
2358 Py_ssize_t nsize
, bytesize
;
2359 #ifndef Py_UNICODE_WIDE
2360 Py_ssize_t i
, pairs
;
2362 const int pairs
= 0;
2364 /* Offsets from p for storing byte pairs in the right order. */
2365 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2366 int iorder
[] = {0, 1, 2, 3};
2368 int iorder
[] = {3, 2, 1, 0};
2371 #define STORECHAR(CH) \
2373 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2374 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2375 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2376 p[iorder[0]] = (CH) & 0xff; \
2380 /* In narrow builds we can output surrogate pairs as one codepoint,
2381 so we need less space. */
2382 #ifndef Py_UNICODE_WIDE
2383 for (i
= pairs
= 0; i
< size
-1; i
++)
2384 if (0xD800 <= s
[i
] && s
[i
] <= 0xDBFF &&
2385 0xDC00 <= s
[i
+1] && s
[i
+1] <= 0xDFFF)
2388 nsize
= (size
- pairs
+ (byteorder
== 0));
2389 bytesize
= nsize
* 4;
2390 if (bytesize
/ 4 != nsize
)
2391 return PyErr_NoMemory();
2392 v
= PyString_FromStringAndSize(NULL
, bytesize
);
2396 p
= (unsigned char *)PyString_AS_STRING(v
);
2402 if (byteorder
== -1) {
2409 else if (byteorder
== 1) {
2417 while (size
-- > 0) {
2419 #ifndef Py_UNICODE_WIDE
2420 if (0xD800 <= ch
&& ch
<= 0xDBFF && size
> 0) {
2422 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2423 ch
= (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
/* Encode a unicode object as UTF-32.  If `unicode` does not pass
   PyUnicode_Check, PyErr_BadArgument() is called; otherwise the object's
   Py_UNICODE buffer and length are handed to PyUnicode_EncodeUTF32.
   NOTE(review): the error-path return and the trailing call arguments
   (errors / byteorder, presumably) are not visible in this extract -
   confirm against the full file. */
2435 PyObject
*PyUnicode_AsUTF32String(PyObject
*unicode
)
2437 if (!PyUnicode_Check(unicode
)) {
2438 PyErr_BadArgument();
2441 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode
),
2442 PyUnicode_GET_SIZE(unicode
),
2447 /* --- UTF-16 Codec ------------------------------------------------------- */
/* One-shot UTF-16 decode: forwards s, size, errors and byteorder
   unchanged to PyUnicode_DecodeUTF16Stateful with NULL as the final
   (consumed) argument. */
2450 PyUnicode_DecodeUTF16(const char *s
,
2455 return PyUnicode_DecodeUTF16Stateful(s
, size
, errors
, byteorder
, NULL
);
2459 PyUnicode_DecodeUTF16Stateful(const char *s
,
2463 Py_ssize_t
*consumed
)
2465 const char *starts
= s
;
2466 Py_ssize_t startinpos
;
2467 Py_ssize_t endinpos
;
2469 PyUnicodeObject
*unicode
;
2471 const unsigned char *q
, *e
;
2472 int bo
= 0; /* assume native ordering by default */
2473 const char *errmsg
= "";
2474 /* Offsets from q for retrieving byte pairs in the right order. */
2475 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2476 int ihi
= 1, ilo
= 0;
2478 int ihi
= 0, ilo
= 1;
2480 PyObject
*errorHandler
= NULL
;
2481 PyObject
*exc
= NULL
;
2483 /* Note: size will always be longer than the resulting Unicode
2485 unicode
= _PyUnicode_New(size
);
2489 return (PyObject
*)unicode
;
2491 /* Unpack UTF-16 encoded data */
2493 q
= (unsigned char *)s
;
2499 /* Check for BOM marks (U+FEFF) in the input and adjust current
2500 byte order setting accordingly. In native mode, the leading BOM
2501 mark is skipped, in all other modes, it is copied to the output
2502 stream as-is (giving a ZWNBSP character). */
2505 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
2506 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2507 if (bom
== 0xFEFF) {
2511 else if (bom
== 0xFFFE) {
2516 if (bom
== 0xFEFF) {
2520 else if (bom
== 0xFFFE) {
2541 /* remaining bytes at the end? (size should be even) */
2545 errmsg
= "truncated data";
2546 startinpos
= ((const char *)q
)-starts
;
2547 endinpos
= ((const char *)e
)-starts
;
2549 /* The remaining input chars are ignored if the callback
2550 chooses to skip the input */
2552 ch
= (q
[ihi
] << 8) | q
[ilo
];
2556 if (ch
< 0xD800 || ch
> 0xDFFF) {
2561 /* UTF-16 code pair: */
2563 errmsg
= "unexpected end of data";
2564 startinpos
= (((const char *)q
)-2)-starts
;
2565 endinpos
= ((const char *)e
)-starts
;
2568 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
2569 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
2571 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2572 #ifndef Py_UNICODE_WIDE
2576 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
2581 errmsg
= "illegal UTF-16 surrogate";
2582 startinpos
= (((const char *)q
)-4)-starts
;
2583 endinpos
= startinpos
+2;
2588 errmsg
= "illegal encoding";
2589 startinpos
= (((const char *)q
)-2)-starts
;
2590 endinpos
= startinpos
+2;
2591 /* Fall through to report the error */
2594 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2595 if (unicode_decode_call_errorhandler(
2596 errors
, &errorHandler
,
2598 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
2599 &unicode
, &outpos
, &p
))
2607 *consumed
= (const char *)q
-starts
;
2610 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2613 Py_XDECREF(errorHandler
);
2615 return (PyObject
*)unicode
;
2619 Py_XDECREF(errorHandler
);
2625 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
2632 Py_ssize_t nsize
, bytesize
;
2633 #ifdef Py_UNICODE_WIDE
2634 Py_ssize_t i
, pairs
;
2636 const int pairs
= 0;
2638 /* Offsets from p for storing byte pairs in the right order. */
2639 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2640 int ihi
= 1, ilo
= 0;
2642 int ihi
= 0, ilo
= 1;
2645 #define STORECHAR(CH) \
2647 p[ihi] = ((CH) >> 8) & 0xff; \
2648 p[ilo] = (CH) & 0xff; \
2652 #ifdef Py_UNICODE_WIDE
2653 for (i
= pairs
= 0; i
< size
; i
++)
2654 if (s
[i
] >= 0x10000)
2657 /* 2 * (size + pairs + (byteorder == 0)) */
2658 if (size
> PY_SSIZE_T_MAX
||
2659 size
> PY_SSIZE_T_MAX
- pairs
- (byteorder
== 0))
2660 return PyErr_NoMemory();
2661 nsize
= size
+ pairs
+ (byteorder
== 0);
2662 bytesize
= nsize
* 2;
2663 if (bytesize
/ 2 != nsize
)
2664 return PyErr_NoMemory();
2665 v
= PyString_FromStringAndSize(NULL
, bytesize
);
2669 p
= (unsigned char *)PyString_AS_STRING(v
);
2675 if (byteorder
== -1) {
2680 else if (byteorder
== 1) {
2686 while (size
-- > 0) {
2687 Py_UNICODE ch
= *s
++;
2689 #ifdef Py_UNICODE_WIDE
2690 if (ch
>= 0x10000) {
2691 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
2692 ch
= 0xD800 | ((ch
-0x10000) >> 10);
/* Encode a unicode object as UTF-16.  If `unicode` does not pass
   PyUnicode_Check, PyErr_BadArgument() is called; otherwise the object's
   Py_UNICODE buffer and length are handed to PyUnicode_EncodeUTF16.
   NOTE(review): the error-path return and the trailing call arguments
   (errors / byteorder, presumably) are not visible in this extract -
   confirm against the full file. */
2703 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
2705 if (!PyUnicode_Check(unicode
)) {
2706 PyErr_BadArgument();
2709 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
2710 PyUnicode_GET_SIZE(unicode
),
2715 /* --- Unicode Escape Codec ----------------------------------------------- */
2717 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
2719 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
2723 const char *starts
= s
;
2724 Py_ssize_t startinpos
;
2725 Py_ssize_t endinpos
;
2732 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
2733 PyObject
*errorHandler
= NULL
;
2734 PyObject
*exc
= NULL
;
2736 /* Escaped strings will always be longer than the resulting
2737 Unicode string, so we start with size here and then reduce the
2738 length after conversion to the true value.
2739 (but if the error callback returns a long replacement string
2740 we'll have to allocate more space) */
2741 v
= _PyUnicode_New(size
);
2745 return (PyObject
*)v
;
2747 p
= PyUnicode_AS_UNICODE(v
);
2755 /* Non-escape characters are interpreted as Unicode ordinals */
2757 *p
++ = (unsigned char) *s
++;
2761 startinpos
= s
-starts
;
2766 c
= '\0'; /* Invalid after \ */
2771 case '\\': *p
++ = '\\'; break;
2772 case '\'': *p
++ = '\''; break;
2773 case '\"': *p
++ = '\"'; break;
2774 case 'b': *p
++ = '\b'; break;
2775 case 'f': *p
++ = '\014'; break; /* FF */
2776 case 't': *p
++ = '\t'; break;
2777 case 'n': *p
++ = '\n'; break;
2778 case 'r': *p
++ = '\r'; break;
2779 case 'v': *p
++ = '\013'; break; /* VT */
2780 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
2782 /* \OOO (octal) escapes */
2783 case '0': case '1': case '2': case '3':
2784 case '4': case '5': case '6': case '7':
2786 if (s
< end
&& '0' <= *s
&& *s
<= '7') {
2787 x
= (x
<<3) + *s
++ - '0';
2788 if (s
< end
&& '0' <= *s
&& *s
<= '7')
2789 x
= (x
<<3) + *s
++ - '0';
2798 message
= "truncated \\xXX escape";
2804 message
= "truncated \\uXXXX escape";
2810 message
= "truncated \\UXXXXXXXX escape";
2813 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2816 if (unicode_decode_call_errorhandler(
2817 errors
, &errorHandler
,
2818 "unicodeescape", "end of string in escape sequence",
2819 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2824 for (i
= 0; i
< digits
; ++i
) {
2825 c
= (unsigned char) s
[i
];
2827 endinpos
= (s
+i
+1)-starts
;
2828 if (unicode_decode_call_errorhandler(
2829 errors
, &errorHandler
,
2830 "unicodeescape", message
,
2831 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2836 chr
= (chr
<<4) & ~0xF;
2837 if (c
>= '0' && c
<= '9')
2839 else if (c
>= 'a' && c
<= 'f')
2840 chr
+= 10 + c
- 'a';
2842 chr
+= 10 + c
- 'A';
2845 if (chr
== 0xffffffff && PyErr_Occurred())
2846 /* _decoding_error will have already written into the
2850 /* when we get here, chr is a 32-bit unicode character */
2852 /* UCS-2 character */
2853 *p
++ = (Py_UNICODE
) chr
;
2854 else if (chr
<= 0x10ffff) {
2855 /* UCS-4 character. Either store directly, or as
2857 #ifdef Py_UNICODE_WIDE
2861 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
2862 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
2865 endinpos
= s
-starts
;
2866 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2867 if (unicode_decode_call_errorhandler(
2868 errors
, &errorHandler
,
2869 "unicodeescape", "illegal Unicode character",
2870 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2878 message
= "malformed \\N character escape";
2879 if (ucnhash_CAPI
== NULL
) {
2880 /* load the unicode data module */
2882 m
= PyImport_ImportModuleNoBlock("unicodedata");
2885 api
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
2889 ucnhash_CAPI
= (_PyUnicode_Name_CAPI
*)PyCObject_AsVoidPtr(api
);
2891 if (ucnhash_CAPI
== NULL
)
2895 const char *start
= s
+1;
2896 /* look for the closing brace */
2897 while (*s
!= '}' && s
< end
)
2899 if (s
> start
&& s
< end
&& *s
== '}') {
2900 /* found a name. look it up in the unicode database */
2901 message
= "unknown Unicode character name";
2903 if (ucnhash_CAPI
->getcode(NULL
, start
, (int)(s
-start
-1), &chr
))
2907 endinpos
= s
-starts
;
2908 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2909 if (unicode_decode_call_errorhandler(
2910 errors
, &errorHandler
,
2911 "unicodeescape", message
,
2912 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2919 message
= "\\ at end of string";
2921 endinpos
= s
-starts
;
2922 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2923 if (unicode_decode_call_errorhandler(
2924 errors
, &errorHandler
,
2925 "unicodeescape", message
,
2926 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2932 *p
++ = (unsigned char)s
[-1];
2939 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2941 Py_XDECREF(errorHandler
);
2943 return (PyObject
*)v
;
2948 "\\N escapes not supported (can't load unicodedata module)"
2951 Py_XDECREF(errorHandler
);
2957 Py_XDECREF(errorHandler
);
2962 /* Return a Unicode-Escape string version of the Unicode object.
2964 If quotes is true, the string is enclosed in u"" or u'' quotes as
2965 appropriate.
2967 */
2969 Py_LOCAL_INLINE(const Py_UNICODE
*) findchar(const Py_UNICODE
*s
,
2973 /* like wcschr, but doesn't stop at NULL characters */
2975 while (size
-- > 0) {
2985 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
2992 static const char *hexdigit
= "0123456789abcdef";
2993 #ifdef Py_UNICODE_WIDE
2994 const Py_ssize_t expandsize
= 10;
2996 const Py_ssize_t expandsize
= 6;
2999 /* XXX(nnorwitz): rather than over-allocating, it would be
3000 better to choose a different scheme. Perhaps scan the
3001 first N-chars of the string and allocate based on that size.
3003 /* Initial allocation is based on the longest-possible unichr
3006 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007 unichr, so in this case it's the longest unichr escape. In
3008 narrow (UTF-16) builds this is five chars per source unichr
3009 since there are two unichrs in the surrogate pair, so in narrow
3010 (UTF-16) builds it's not the longest unichr escape.
3012 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013 so in the narrow (UTF-16) build case it's the longest unichr
3017 if (size
> (PY_SSIZE_T_MAX
- 2 - 1) / expandsize
)
3018 return PyErr_NoMemory();
3020 repr
= PyString_FromStringAndSize(NULL
,
3027 p
= PyString_AS_STRING(repr
);
3031 *p
++ = (findchar(s
, size
, '\'') &&
3032 !findchar(s
, size
, '"')) ? '"' : '\'';
3034 while (size
-- > 0) {
3035 Py_UNICODE ch
= *s
++;
3037 /* Escape quotes and backslashes */
3039 ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1]) || ch
== '\\') {
3045 #ifdef Py_UNICODE_WIDE
3046 /* Map 21-bit characters to '\U00xxxxxx' */
3047 else if (ch
>= 0x10000) {
3050 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
3051 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
3052 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
3053 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
3054 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
3055 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
3056 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
3057 *p
++ = hexdigit
[ch
& 0x0000000F];
3061 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062 else if (ch
>= 0xD800 && ch
< 0xDC00) {
3068 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
3069 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
3072 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
3073 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
3074 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
3075 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
3076 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
3077 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
3078 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
3079 *p
++ = hexdigit
[ucs
& 0x0000000F];
3082 /* Fall through: isolated surrogates are copied as-is */
3088 /* Map 16-bit characters to '\uxxxx' */
3092 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
3093 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
3094 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
3095 *p
++ = hexdigit
[ch
& 0x000F];
3098 /* Map special whitespace to '\t', \n', '\r' */
3099 else if (ch
== '\t') {
3103 else if (ch
== '\n') {
3107 else if (ch
== '\r') {
3112 /* Map non-printable US ASCII to '\xhh' */
3113 else if (ch
< ' ' || ch
>= 0x7F) {
3116 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
3117 *p
++ = hexdigit
[ch
& 0x000F];
3120 /* Copy everything else as-is */
3125 *p
++ = PyString_AS_STRING(repr
)[1];
3128 _PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
));
/* Public entry point for the Unicode-Escape encoder: delegates to
   unicodeescape_string with s and size unchanged and 0 as the third
   argument (presumably the `quotes` flag, i.e. no surrounding quotes -
   confirm against the helper's signature). */
3132 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
3135 return unicodeescape_string(s
, size
, 0);
/* Encode a unicode object with the Unicode-Escape codec.  If `unicode`
   does not pass PyUnicode_Check, PyErr_BadArgument() is called;
   otherwise the object's Py_UNICODE buffer and length are passed to
   PyUnicode_EncodeUnicodeEscape.
   NOTE(review): the error-path return is not visible in this extract -
   confirm against the full file. */
3138 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
3140 if (!PyUnicode_Check(unicode
)) {
3141 PyErr_BadArgument();
3144 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3145 PyUnicode_GET_SIZE(unicode
));
3148 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3150 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
3154 const char *starts
= s
;
3155 Py_ssize_t startinpos
;
3156 Py_ssize_t endinpos
;
3162 PyObject
*errorHandler
= NULL
;
3163 PyObject
*exc
= NULL
;
3165 /* Escaped strings will always be longer than the resulting
3166 Unicode string, so we start with size here and then reduce the
3167 length after conversion to the true value. (But decoding error
3168 handler might have to resize the string) */
3169 v
= _PyUnicode_New(size
);
3173 return (PyObject
*)v
;
3174 p
= PyUnicode_AS_UNICODE(v
);
3182 /* Non-escape characters are interpreted as Unicode ordinals */
3184 *p
++ = (unsigned char)*s
++;
3187 startinpos
= s
-starts
;
3189 /* \u-escapes are only interpreted iff the number of leading
3190 backslashes if odd */
3195 *p
++ = (unsigned char)*s
++;
3197 if (((s
- bs
) & 1) == 0 ||
3199 (*s
!= 'u' && *s
!= 'U')) {
3203 count
= *s
=='u' ? 4 : 8;
3206 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3207 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3208 for (x
= 0, i
= 0; i
< count
; ++i
, ++s
) {
3209 c
= (unsigned char)*s
;
3211 endinpos
= s
-starts
;
3212 if (unicode_decode_call_errorhandler(
3213 errors
, &errorHandler
,
3214 "rawunicodeescape", "truncated \\uXXXX",
3215 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3221 if (c
>= '0' && c
<= '9')
3223 else if (c
>= 'a' && c
<= 'f')
3229 /* UCS-2 character */
3230 *p
++ = (Py_UNICODE
) x
;
3231 else if (x
<= 0x10ffff) {
3232 /* UCS-4 character. Either store directly, or as
3234 #ifdef Py_UNICODE_WIDE
3235 *p
++ = (Py_UNICODE
) x
;
3238 *p
++ = 0xD800 + (Py_UNICODE
) (x
>> 10);
3239 *p
++ = 0xDC00 + (Py_UNICODE
) (x
& 0x03FF);
3242 endinpos
= s
-starts
;
3243 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3244 if (unicode_decode_call_errorhandler(
3245 errors
, &errorHandler
,
3246 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3247 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3254 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3256 Py_XDECREF(errorHandler
);
3258 return (PyObject
*)v
;
3262 Py_XDECREF(errorHandler
);
3267 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
3274 static const char *hexdigit
= "0123456789abcdef";
3275 #ifdef Py_UNICODE_WIDE
3276 const Py_ssize_t expandsize
= 10;
3278 const Py_ssize_t expandsize
= 6;
3281 if (size
> PY_SSIZE_T_MAX
/ expandsize
)
3282 return PyErr_NoMemory();
3284 repr
= PyString_FromStringAndSize(NULL
, expandsize
* size
);
3290 p
= q
= PyString_AS_STRING(repr
);
3291 while (size
-- > 0) {
3292 Py_UNICODE ch
= *s
++;
3293 #ifdef Py_UNICODE_WIDE
3294 /* Map 32-bit characters to '\Uxxxxxxxx' */
3295 if (ch
>= 0x10000) {
3298 *p
++ = hexdigit
[(ch
>> 28) & 0xf];
3299 *p
++ = hexdigit
[(ch
>> 24) & 0xf];
3300 *p
++ = hexdigit
[(ch
>> 20) & 0xf];
3301 *p
++ = hexdigit
[(ch
>> 16) & 0xf];
3302 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3303 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3304 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3305 *p
++ = hexdigit
[ch
& 15];
3309 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3310 if (ch
>= 0xD800 && ch
< 0xDC00) {
3316 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
3317 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
3320 *p
++ = hexdigit
[(ucs
>> 28) & 0xf];
3321 *p
++ = hexdigit
[(ucs
>> 24) & 0xf];
3322 *p
++ = hexdigit
[(ucs
>> 20) & 0xf];
3323 *p
++ = hexdigit
[(ucs
>> 16) & 0xf];
3324 *p
++ = hexdigit
[(ucs
>> 12) & 0xf];
3325 *p
++ = hexdigit
[(ucs
>> 8) & 0xf];
3326 *p
++ = hexdigit
[(ucs
>> 4) & 0xf];
3327 *p
++ = hexdigit
[ucs
& 0xf];
3330 /* Fall through: isolated surrogates are copied as-is */
3335 /* Map 16-bit characters to '\uxxxx' */
3339 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3340 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3341 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3342 *p
++ = hexdigit
[ch
& 15];
3344 /* Copy everything else as-is */
3349 _PyString_Resize(&repr
, p
- q
);
3353 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
3355 if (!PyUnicode_Check(unicode
)) {
3356 PyErr_BadArgument();
3359 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3360 PyUnicode_GET_SIZE(unicode
));
3363 /* --- Unicode Internal Codec ------------------------------------------- */
3365 PyObject
*_PyUnicode_DecodeUnicodeInternal(const char *s
,
3369 const char *starts
= s
;
3370 Py_ssize_t startinpos
;
3371 Py_ssize_t endinpos
;
3377 PyObject
*errorHandler
= NULL
;
3378 PyObject
*exc
= NULL
;
3380 #ifdef Py_UNICODE_WIDE
3381 Py_UNICODE unimax
= PyUnicode_GetMax();
3384 /* XXX overflow detection missing */
3385 v
= _PyUnicode_New((size
+Py_UNICODE_SIZE
-1)/ Py_UNICODE_SIZE
);
3388 if (PyUnicode_GetSize((PyObject
*)v
) == 0)
3389 return (PyObject
*)v
;
3390 p
= PyUnicode_AS_UNICODE(v
);
3394 memcpy(p
, s
, sizeof(Py_UNICODE
));
3395 /* We have to sanity check the raw data, otherwise doom looms for
3396 some malformed UCS-4 data. */
3398 #ifdef Py_UNICODE_WIDE
3399 *p
> unimax
|| *p
< 0 ||
3401 end
-s
< Py_UNICODE_SIZE
3404 startinpos
= s
- starts
;
3405 if (end
-s
< Py_UNICODE_SIZE
) {
3406 endinpos
= end
-starts
;
3407 reason
= "truncated input";
3410 endinpos
= s
- starts
+ Py_UNICODE_SIZE
;
3411 reason
= "illegal code point (> 0x10FFFF)";
3413 outpos
= p
- PyUnicode_AS_UNICODE(v
);
3414 if (unicode_decode_call_errorhandler(
3415 errors
, &errorHandler
,
3416 "unicode_internal", reason
,
3417 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3424 s
+= Py_UNICODE_SIZE
;
3428 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3430 Py_XDECREF(errorHandler
);
3432 return (PyObject
*)v
;
3436 Py_XDECREF(errorHandler
);
3441 /* --- Latin-1 Codec ------------------------------------------------------ */
3443 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
3450 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3452 Py_UNICODE r
= *(unsigned char*)s
;
3453 return PyUnicode_FromUnicode(&r
, 1);
3456 v
= _PyUnicode_New(size
);
3460 return (PyObject
*)v
;
3461 p
= PyUnicode_AS_UNICODE(v
);
3463 *p
++ = (unsigned char)*s
++;
3464 return (PyObject
*)v
;
3471 /* create or adjust a UnicodeEncodeError */
3472 static void make_encode_exception(PyObject
**exceptionObject
,
3473 const char *encoding
,
3474 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3475 Py_ssize_t startpos
, Py_ssize_t endpos
,
3478 if (*exceptionObject
== NULL
) {
3479 *exceptionObject
= PyUnicodeEncodeError_Create(
3480 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3483 if (PyUnicodeEncodeError_SetStart(*exceptionObject
, startpos
))
3485 if (PyUnicodeEncodeError_SetEnd(*exceptionObject
, endpos
))
3487 if (PyUnicodeEncodeError_SetReason(*exceptionObject
, reason
))
3491 Py_DECREF(*exceptionObject
);
3492 *exceptionObject
= NULL
;
3496 /* raises a UnicodeEncodeError */
3497 static void raise_encode_exception(PyObject
**exceptionObject
,
3498 const char *encoding
,
3499 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3500 Py_ssize_t startpos
, Py_ssize_t endpos
,
3503 make_encode_exception(exceptionObject
,
3504 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3505 if (*exceptionObject
!= NULL
)
3506 PyCodec_StrictErrors(*exceptionObject
);
3509 /* error handling callback helper:
3510 build arguments, call the callback and check the arguments,
3511 put the result into newpos and return the replacement string, which
3512 has to be freed by the caller */
3513 static PyObject
*unicode_encode_call_errorhandler(const char *errors
,
3514 PyObject
**errorHandler
,
3515 const char *encoding
, const char *reason
,
3516 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
3517 Py_ssize_t startpos
, Py_ssize_t endpos
,
3520 static char *argparse
= "O!n;encoding error handler must return (unicode, int) tuple";
3523 PyObject
*resunicode
;
3525 if (*errorHandler
== NULL
) {
3526 *errorHandler
= PyCodec_LookupError(errors
);
3527 if (*errorHandler
== NULL
)
3531 make_encode_exception(exceptionObject
,
3532 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3533 if (*exceptionObject
== NULL
)
3536 restuple
= PyObject_CallFunctionObjArgs(
3537 *errorHandler
, *exceptionObject
, NULL
);
3538 if (restuple
== NULL
)
3540 if (!PyTuple_Check(restuple
)) {
3541 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
3542 Py_DECREF(restuple
);
3545 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
3546 &resunicode
, newpos
)) {
3547 Py_DECREF(restuple
);
3551 *newpos
= size
+*newpos
;
3552 if (*newpos
<0 || *newpos
>size
) {
3553 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
3554 Py_DECREF(restuple
);
3557 Py_INCREF(resunicode
);
3558 Py_DECREF(restuple
);
3562 static PyObject
*unicode_encode_ucs1(const Py_UNICODE
*p
,
3569 /* pointers to the beginning and end+1 of input */
3570 const Py_UNICODE
*startp
= p
;
3571 const Py_UNICODE
*endp
= p
+ size
;
3572 /* pointer to the beginning of the unencodable characters */
3573 /* const Py_UNICODE *badp = NULL; */
3574 /* pointer into the output */
3576 /* current output position */
3577 Py_ssize_t respos
= 0;
3579 const char *encoding
= (limit
== 256) ? "latin-1" : "ascii";
3580 const char *reason
= (limit
== 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3581 PyObject
*errorHandler
= NULL
;
3582 PyObject
*exc
= NULL
;
3583 /* the following variable is used for caching string comparisons
3584 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3585 int known_errorHandler
= -1;
3587 /* allocate enough for a simple encoding without
3588 replacements, if we need more, we'll resize */
3589 res
= PyString_FromStringAndSize(NULL
, size
);
3594 str
= PyString_AS_STRING(res
);
3600 /* can we encode this? */
3602 /* no overflow check, because we know that the space is enough */
3607 Py_ssize_t unicodepos
= p
-startp
;
3608 Py_ssize_t requiredsize
;
3609 PyObject
*repunicode
;
3614 /* startpos for collecting unencodable chars */
3615 const Py_UNICODE
*collstart
= p
;
3616 const Py_UNICODE
*collend
= p
;
3617 /* find all unecodable characters */
3618 while ((collend
< endp
) && ((*collend
)>=limit
))
3620 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3621 if (known_errorHandler
==-1) {
3622 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3623 known_errorHandler
= 1;
3624 else if (!strcmp(errors
, "replace"))
3625 known_errorHandler
= 2;
3626 else if (!strcmp(errors
, "ignore"))
3627 known_errorHandler
= 3;
3628 else if (!strcmp(errors
, "xmlcharrefreplace"))
3629 known_errorHandler
= 4;
3631 known_errorHandler
= 0;
3633 switch (known_errorHandler
) {
3634 case 1: /* strict */
3635 raise_encode_exception(&exc
, encoding
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
3637 case 2: /* replace */
3638 while (collstart
++<collend
)
3639 *str
++ = '?'; /* fall through */
3640 case 3: /* ignore */
3643 case 4: /* xmlcharrefreplace */
3644 respos
= str
-PyString_AS_STRING(res
);
3645 /* determine replacement size (temporarily (mis)uses p) */
3646 for (p
= collstart
, repsize
= 0; p
< collend
; ++p
) {
3655 #ifndef Py_UNICODE_WIDE
3661 else if (*p
<1000000)
3667 requiredsize
= respos
+repsize
+(endp
-collend
);
3668 if (requiredsize
> ressize
) {
3669 if (requiredsize
<2*ressize
)
3670 requiredsize
= 2*ressize
;
3671 if (_PyString_Resize(&res
, requiredsize
))
3673 str
= PyString_AS_STRING(res
) + respos
;
3674 ressize
= requiredsize
;
3676 /* generate replacement (temporarily (mis)uses p) */
3677 for (p
= collstart
; p
< collend
; ++p
) {
3678 str
+= sprintf(str
, "&#%d;", (int)*p
);
3683 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
3684 encoding
, reason
, startp
, size
, &exc
,
3685 collstart
-startp
, collend
-startp
, &newpos
);
3686 if (repunicode
== NULL
)
3688 /* need more space? (at least enough for what we have+the
3689 replacement+the rest of the string, so we won't have to
3690 check space for encodable characters) */
3691 respos
= str
-PyString_AS_STRING(res
);
3692 repsize
= PyUnicode_GET_SIZE(repunicode
);
3693 requiredsize
= respos
+repsize
+(endp
-collend
);
3694 if (requiredsize
> ressize
) {
3695 if (requiredsize
<2*ressize
)
3696 requiredsize
= 2*ressize
;
3697 if (_PyString_Resize(&res
, requiredsize
)) {
3698 Py_DECREF(repunicode
);
3701 str
= PyString_AS_STRING(res
) + respos
;
3702 ressize
= requiredsize
;
3704 /* check if there is anything unencodable in the replacement
3705 and copy it to the output */
3706 for (uni2
= PyUnicode_AS_UNICODE(repunicode
);repsize
-->0; ++uni2
, ++str
) {
3709 raise_encode_exception(&exc
, encoding
, startp
, size
,
3710 unicodepos
, unicodepos
+1, reason
);
3711 Py_DECREF(repunicode
);
3716 p
= startp
+ newpos
;
3717 Py_DECREF(repunicode
);
3721 /* Resize if we allocated to much */
3722 respos
= str
-PyString_AS_STRING(res
);
3724 /* If this falls res will be NULL */
3725 _PyString_Resize(&res
, respos
);
3726 Py_XDECREF(errorHandler
);
3732 Py_XDECREF(errorHandler
);
3737 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
3741 return unicode_encode_ucs1(p
, size
, errors
, 256);
3744 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
3746 if (!PyUnicode_Check(unicode
)) {
3747 PyErr_BadArgument();
3750 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
3751 PyUnicode_GET_SIZE(unicode
),
3755 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3757 PyObject
*PyUnicode_DecodeASCII(const char *s
,
3761 const char *starts
= s
;
3764 Py_ssize_t startinpos
;
3765 Py_ssize_t endinpos
;
3768 PyObject
*errorHandler
= NULL
;
3769 PyObject
*exc
= NULL
;
3771 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3772 if (size
== 1 && *(unsigned char*)s
< 128) {
3773 Py_UNICODE r
= *(unsigned char*)s
;
3774 return PyUnicode_FromUnicode(&r
, 1);
3777 v
= _PyUnicode_New(size
);
3781 return (PyObject
*)v
;
3782 p
= PyUnicode_AS_UNICODE(v
);
3785 register unsigned char c
= (unsigned char)*s
;
3791 startinpos
= s
-starts
;
3792 endinpos
= startinpos
+ 1;
3793 outpos
= p
- (Py_UNICODE
*)PyUnicode_AS_UNICODE(v
);
3794 if (unicode_decode_call_errorhandler(
3795 errors
, &errorHandler
,
3796 "ascii", "ordinal not in range(128)",
3797 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3802 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
3803 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3805 Py_XDECREF(errorHandler
);
3807 return (PyObject
*)v
;
3811 Py_XDECREF(errorHandler
);
3816 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
3820 return unicode_encode_ucs1(p
, size
, errors
, 128);
3823 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
3825 if (!PyUnicode_Check(unicode
)) {
3826 PyErr_BadArgument();
3829 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
3830 PyUnicode_GET_SIZE(unicode
),
3834 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3836 /* --- MBCS codecs for Windows -------------------------------------------- */
3838 #if SIZEOF_INT < SIZEOF_SIZE_T
3842 /* XXX This code is limited to "true" double-byte encodings, as
3843 a) it assumes an incomplete character consists of a single byte, and
3844 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3845 encodings, see IsDBCSLeadByteEx documentation. */
3847 static int is_dbcs_lead_byte(const char *s
, int offset
)
3849 const char *curr
= s
+ offset
;
3851 if (IsDBCSLeadByte(*curr
)) {
3852 const char *prev
= CharPrev(s
, curr
);
3853 return (prev
== curr
) || !IsDBCSLeadByte(*prev
) || (curr
- prev
== 2);
3859 * Decode MBCS string into unicode object. If 'final' is set, converts
3860 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3862 static int decode_mbcs(PyUnicodeObject
**v
,
3863 const char *s
, /* MBCS string */
3864 int size
, /* sizeof MBCS string */
3873 /* Skip trailing lead-byte unless 'final' is set */
3874 if (!final
&& size
>= 1 && is_dbcs_lead_byte(s
, size
- 1))
3877 /* First get the size of the result */
3879 usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
3881 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3887 /* Create unicode object */
3888 *v
= _PyUnicode_New(usize
);
3893 /* Extend unicode object */
3894 n
= PyUnicode_GET_SIZE(*v
);
3895 if (_PyUnicode_Resize(v
, n
+ usize
) < 0)
3899 /* Do the conversion */
3901 p
= PyUnicode_AS_UNICODE(*v
) + n
;
3902 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
3903 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3911 PyObject
*PyUnicode_DecodeMBCSStateful(const char *s
,
3914 Py_ssize_t
*consumed
)
3916 PyUnicodeObject
*v
= NULL
;
3925 done
= decode_mbcs(&v
, s
, INT_MAX
, 0);
3928 done
= decode_mbcs(&v
, s
, (int)size
, !consumed
);
3939 if (size
> INT_MAX
) {
3946 return (PyObject
*)v
;
3949 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
3953 return PyUnicode_DecodeMBCSStateful(s
, size
, errors
, NULL
);
3957 * Convert unicode into string object (MBCS).
3958 * Returns 0 if succeed, -1 otherwise.
3960 static int encode_mbcs(PyObject
**repr
,
3961 const Py_UNICODE
*p
, /* unicode */
3962 int size
) /* size of unicode */
3969 /* First get the size of the result */
3971 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
3972 if (mbcssize
== 0) {
3973 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3978 if (*repr
== NULL
) {
3979 /* Create string object */
3980 *repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
3985 /* Extend string object */
3986 n
= PyString_Size(*repr
);
3987 if (_PyString_Resize(repr
, n
+ mbcssize
) < 0)
3991 /* Do the conversion */
3993 char *s
= PyString_AS_STRING(*repr
) + n
;
3994 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
3995 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
4003 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
4007 PyObject
*repr
= NULL
;
4013 ret
= encode_mbcs(&repr
, p
, INT_MAX
);
4016 ret
= encode_mbcs(&repr
, p
, (int)size
);
4024 if (size
> INT_MAX
) {
4034 PyObject
*PyUnicode_AsMBCSString(PyObject
*unicode
)
4036 if (!PyUnicode_Check(unicode
)) {
4037 PyErr_BadArgument();
4040 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode
),
4041 PyUnicode_GET_SIZE(unicode
),
4047 #endif /* MS_WINDOWS */
4049 /* --- Character Mapping Codec -------------------------------------------- */
4051 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
4056 const char *starts
= s
;
4057 Py_ssize_t startinpos
;
4058 Py_ssize_t endinpos
;
4063 Py_ssize_t extrachars
= 0;
4064 PyObject
*errorHandler
= NULL
;
4065 PyObject
*exc
= NULL
;
4066 Py_UNICODE
*mapstring
= NULL
;
4067 Py_ssize_t maplen
= 0;
4069 /* Default to Latin-1 */
4070 if (mapping
== NULL
)
4071 return PyUnicode_DecodeLatin1(s
, size
, errors
);
4073 v
= _PyUnicode_New(size
);
4077 return (PyObject
*)v
;
4078 p
= PyUnicode_AS_UNICODE(v
);
4080 if (PyUnicode_CheckExact(mapping
)) {
4081 mapstring
= PyUnicode_AS_UNICODE(mapping
);
4082 maplen
= PyUnicode_GET_SIZE(mapping
);
4084 unsigned char ch
= *s
;
4085 Py_UNICODE x
= 0xfffe; /* illegal value */
4091 /* undefined mapping */
4092 outpos
= p
-PyUnicode_AS_UNICODE(v
);
4093 startinpos
= s
-starts
;
4094 endinpos
= startinpos
+1;
4095 if (unicode_decode_call_errorhandler(
4096 errors
, &errorHandler
,
4097 "charmap", "character maps to <undefined>",
4098 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
4110 unsigned char ch
= *s
;
4113 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4114 w
= PyInt_FromLong((long)ch
);
4117 x
= PyObject_GetItem(mapping
, w
);
4120 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4121 /* No mapping found means: mapping is undefined. */
4130 if (PyInt_Check(x
)) {
4131 long value
= PyInt_AS_LONG(x
);
4132 if (value
< 0 || value
> 65535) {
4133 PyErr_SetString(PyExc_TypeError
,
4134 "character mapping must be in range(65536)");
4138 *p
++ = (Py_UNICODE
)value
;
4140 else if (x
== Py_None
) {
4141 /* undefined mapping */
4142 outpos
= p
-PyUnicode_AS_UNICODE(v
);
4143 startinpos
= s
-starts
;
4144 endinpos
= startinpos
+1;
4145 if (unicode_decode_call_errorhandler(
4146 errors
, &errorHandler
,
4147 "charmap", "character maps to <undefined>",
4148 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
4156 else if (PyUnicode_Check(x
)) {
4157 Py_ssize_t targetsize
= PyUnicode_GET_SIZE(x
);
4159 if (targetsize
== 1)
4161 *p
++ = *PyUnicode_AS_UNICODE(x
);
4163 else if (targetsize
> 1) {
4165 if (targetsize
> extrachars
) {
4167 Py_ssize_t oldpos
= p
- PyUnicode_AS_UNICODE(v
);
4168 Py_ssize_t needed
= (targetsize
- extrachars
) + \
4170 extrachars
+= needed
;
4171 /* XXX overflow detection missing */
4172 if (_PyUnicode_Resize(&v
,
4173 PyUnicode_GET_SIZE(v
) + needed
) < 0) {
4177 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
4180 PyUnicode_AS_UNICODE(x
),
4183 extrachars
-= targetsize
;
4185 /* 1-0 mapping: skip the character */
4188 /* wrong return value */
4189 PyErr_SetString(PyExc_TypeError
,
4190 "character mapping must return integer, None or unicode");
4198 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
4199 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
4201 Py_XDECREF(errorHandler
);
4203 return (PyObject
*)v
;
4206 Py_XDECREF(errorHandler
);
4212 /* Charmap encoding: the lookup table */
4214 struct encoding_map
{
4216 unsigned char level1
[32];
4218 unsigned char level23
[1];
4222 encoding_map_size(PyObject
*obj
, PyObject
* args
)
4224 struct encoding_map
*map
= (struct encoding_map
*)obj
;
4225 return PyInt_FromLong(sizeof(*map
) - 1 + 16*map
->count2
+
4229 static PyMethodDef encoding_map_methods
[] = {
4230 {"size", encoding_map_size
, METH_NOARGS
,
4231 PyDoc_STR("Return the size (in bytes) of this object") },
4236 encoding_map_dealloc(PyObject
* o
)
4241 static PyTypeObject EncodingMapType
= {
4242 PyVarObject_HEAD_INIT(NULL
, 0)
4243 "EncodingMap", /*tp_name*/
4244 sizeof(struct encoding_map
), /*tp_basicsize*/
4247 encoding_map_dealloc
, /*tp_dealloc*/
4254 0, /*tp_as_sequence*/
4255 0, /*tp_as_mapping*/
4262 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
4266 0, /*tp_richcompare*/
4267 0, /*tp_weaklistoffset*/
4270 encoding_map_methods
, /*tp_methods*/
4277 0, /*tp_dictoffset*/
4286 PyUnicode_BuildEncodingMap(PyObject
* string
)
4290 struct encoding_map
*mresult
;
4293 unsigned char level1
[32];
4294 unsigned char level2
[512];
4295 unsigned char *mlevel1
, *mlevel2
, *mlevel3
;
4296 int count2
= 0, count3
= 0;
4298 if (!PyUnicode_Check(string
) || PyUnicode_GetSize(string
) != 256) {
4299 PyErr_BadArgument();
4302 decode
= PyUnicode_AS_UNICODE(string
);
4303 memset(level1
, 0xFF, sizeof level1
);
4304 memset(level2
, 0xFF, sizeof level2
);
4306 /* If there isn't a one-to-one mapping of NULL to \0,
4307 or if there are non-BMP characters, we need to use
4308 a mapping dictionary. */
4311 for (i
= 1; i
< 256; i
++) {
4314 #ifdef Py_UNICODE_WIDE
4315 || decode
[i
] > 0xFFFF
4321 if (decode
[i
] == 0xFFFE)
4322 /* unmapped character */
4324 l1
= decode
[i
] >> 11;
4325 l2
= decode
[i
] >> 7;
4326 if (level1
[l1
] == 0xFF)
4327 level1
[l1
] = count2
++;
4328 if (level2
[l2
] == 0xFF)
4329 level2
[l2
] = count3
++;
4332 if (count2
>= 0xFF || count3
>= 0xFF)
4336 PyObject
*result
= PyDict_New();
4337 PyObject
*key
, *value
;
4340 for (i
= 0; i
< 256; i
++) {
4342 key
= PyInt_FromLong(decode
[i
]);
4343 value
= PyInt_FromLong(i
);
4346 if (PyDict_SetItem(result
, key
, value
) == -1)
4359 /* Create a three-level trie */
4360 result
= PyObject_MALLOC(sizeof(struct encoding_map
) +
4361 16*count2
+ 128*count3
- 1);
4363 return PyErr_NoMemory();
4364 PyObject_Init(result
, &EncodingMapType
);
4365 mresult
= (struct encoding_map
*)result
;
4366 mresult
->count2
= count2
;
4367 mresult
->count3
= count3
;
4368 mlevel1
= mresult
->level1
;
4369 mlevel2
= mresult
->level23
;
4370 mlevel3
= mresult
->level23
+ 16*count2
;
4371 memcpy(mlevel1
, level1
, 32);
4372 memset(mlevel2
, 0xFF, 16*count2
);
4373 memset(mlevel3
, 0, 128*count3
);
4375 for (i
= 1; i
< 256; i
++) {
4376 int o1
, o2
, o3
, i2
, i3
;
4377 if (decode
[i
] == 0xFFFE)
4378 /* unmapped character */
4381 o2
= (decode
[i
]>>7) & 0xF;
4382 i2
= 16*mlevel1
[o1
] + o2
;
4383 if (mlevel2
[i2
] == 0xFF)
4384 mlevel2
[i2
] = count3
++;
4385 o3
= decode
[i
] & 0x7F;
4386 i3
= 128*mlevel2
[i2
] + o3
;
4393 encoding_map_lookup(Py_UNICODE c
, PyObject
*mapping
)
4395 struct encoding_map
*map
= (struct encoding_map
*)mapping
;
4397 int l2
= (c
>>7) & 0xF;
4401 #ifdef Py_UNICODE_WIDE
4409 i
= map
->level1
[l1
];
4414 i
= map
->level23
[16*i
+l2
];
4419 i
= map
->level23
[16*map
->count2
+ 128*i
+ l3
];
4426 /* Lookup the character ch in the mapping. If the character
4427 can't be found, Py_None is returned (or NULL, if another
4429 static PyObject
*charmapencode_lookup(Py_UNICODE c
, PyObject
*mapping
)
4431 PyObject
*w
= PyInt_FromLong((long)c
);
4436 x
= PyObject_GetItem(mapping
, w
);
4439 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4440 /* No mapping found means: mapping is undefined. */
4448 else if (x
== Py_None
)
4450 else if (PyInt_Check(x
)) {
4451 long value
= PyInt_AS_LONG(x
);
4452 if (value
< 0 || value
> 255) {
4453 PyErr_SetString(PyExc_TypeError
,
4454 "character mapping must be in range(256)");
4460 else if (PyString_Check(x
))
4463 /* wrong return value */
4464 PyErr_SetString(PyExc_TypeError
,
4465 "character mapping must return integer, None or str");
4472 charmapencode_resize(PyObject
**outobj
, Py_ssize_t
*outpos
, Py_ssize_t requiredsize
)
4474 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4475 /* exponentially overallocate to minimize reallocations */
4476 if (requiredsize
< 2*outsize
)
4477 requiredsize
= 2*outsize
;
4478 if (_PyString_Resize(outobj
, requiredsize
)) {
4484 typedef enum charmapencode_result
{
4485 enc_SUCCESS
, enc_FAILED
, enc_EXCEPTION
4486 }charmapencode_result
;
4487 /* lookup the character, put the result in the output string and adjust
4488 various state variables. Reallocate the output string if not enough
4489 space is available. Return a new reference to the object that
4490 was put in the output buffer, or Py_None, if the mapping was undefined
4491 (in which case no character was written) or NULL, if a
4492 reallocation error occurred. The caller must decref the result */
4494 charmapencode_result
charmapencode_output(Py_UNICODE c
, PyObject
*mapping
,
4495 PyObject
**outobj
, Py_ssize_t
*outpos
)
4499 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4501 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4502 int res
= encoding_map_lookup(c
, mapping
);
4503 Py_ssize_t requiredsize
= *outpos
+1;
4506 if (outsize
<requiredsize
)
4507 if (!charmapencode_resize(outobj
, outpos
, requiredsize
))
4508 return enc_EXCEPTION
;
4509 outstart
= PyString_AS_STRING(*outobj
);
4510 outstart
[(*outpos
)++] = (char)res
;
4514 rep
= charmapencode_lookup(c
, mapping
);
4516 return enc_EXCEPTION
;
4517 else if (rep
==Py_None
) {
4521 if (PyInt_Check(rep
)) {
4522 Py_ssize_t requiredsize
= *outpos
+1;
4523 if (outsize
<requiredsize
)
4524 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4526 return enc_EXCEPTION
;
4528 outstart
= PyString_AS_STRING(*outobj
);
4529 outstart
[(*outpos
)++] = (char)PyInt_AS_LONG(rep
);
4532 const char *repchars
= PyString_AS_STRING(rep
);
4533 Py_ssize_t repsize
= PyString_GET_SIZE(rep
);
4534 Py_ssize_t requiredsize
= *outpos
+repsize
;
4535 if (outsize
<requiredsize
)
4536 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4538 return enc_EXCEPTION
;
4540 outstart
= PyString_AS_STRING(*outobj
);
4541 memcpy(outstart
+ *outpos
, repchars
, repsize
);
4549 /* handle an error in PyUnicode_EncodeCharmap
4550 Return 0 on success, -1 on error */
4552 int charmap_encoding_error(
4553 const Py_UNICODE
*p
, Py_ssize_t size
, Py_ssize_t
*inpos
, PyObject
*mapping
,
4554 PyObject
**exceptionObject
,
4555 int *known_errorHandler
, PyObject
**errorHandler
, const char *errors
,
4556 PyObject
**res
, Py_ssize_t
*respos
)
4558 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
4562 /* startpos for collecting unencodable chars */
4563 Py_ssize_t collstartpos
= *inpos
;
4564 Py_ssize_t collendpos
= *inpos
+1;
4566 char *encoding
= "charmap";
4567 char *reason
= "character maps to <undefined>";
4568 charmapencode_result x
;
4570 /* find all unencodable characters */
4571 while (collendpos
< size
) {
4573 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4574 int res
= encoding_map_lookup(p
[collendpos
], mapping
);
4581 rep
= charmapencode_lookup(p
[collendpos
], mapping
);
4584 else if (rep
!=Py_None
) {
4591 /* cache callback name lookup
4592 * (if not done yet, i.e. it's the first error) */
4593 if (*known_errorHandler
==-1) {
4594 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4595 *known_errorHandler
= 1;
4596 else if (!strcmp(errors
, "replace"))
4597 *known_errorHandler
= 2;
4598 else if (!strcmp(errors
, "ignore"))
4599 *known_errorHandler
= 3;
4600 else if (!strcmp(errors
, "xmlcharrefreplace"))
4601 *known_errorHandler
= 4;
4603 *known_errorHandler
= 0;
4605 switch (*known_errorHandler
) {
4606 case 1: /* strict */
4607 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4609 case 2: /* replace */
4610 for (collpos
= collstartpos
; collpos
<collendpos
; ++collpos
) {
4611 x
= charmapencode_output('?', mapping
, res
, respos
);
4612 if (x
==enc_EXCEPTION
) {
4615 else if (x
==enc_FAILED
) {
4616 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4621 case 3: /* ignore */
4622 *inpos
= collendpos
;
4624 case 4: /* xmlcharrefreplace */
4625 /* generate replacement (temporarily (mis)uses p) */
4626 for (collpos
= collstartpos
; collpos
< collendpos
; ++collpos
) {
4627 char buffer
[2+29+1+1];
4629 sprintf(buffer
, "&#%d;", (int)p
[collpos
]);
4630 for (cp
= buffer
; *cp
; ++cp
) {
4631 x
= charmapencode_output(*cp
, mapping
, res
, respos
);
4632 if (x
==enc_EXCEPTION
)
4634 else if (x
==enc_FAILED
) {
4635 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4640 *inpos
= collendpos
;
4643 repunicode
= unicode_encode_call_errorhandler(errors
, errorHandler
,
4644 encoding
, reason
, p
, size
, exceptionObject
,
4645 collstartpos
, collendpos
, &newpos
);
4646 if (repunicode
== NULL
)
4648 /* generate replacement */
4649 repsize
= PyUnicode_GET_SIZE(repunicode
);
4650 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
4651 x
= charmapencode_output(*uni2
, mapping
, res
, respos
);
4652 if (x
==enc_EXCEPTION
) {
4655 else if (x
==enc_FAILED
) {
4656 Py_DECREF(repunicode
);
4657 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4662 Py_DECREF(repunicode
);
4667 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
4673 PyObject
*res
= NULL
;
4674 /* current input position */
4675 Py_ssize_t inpos
= 0;
4676 /* current output position */
4677 Py_ssize_t respos
= 0;
4678 PyObject
*errorHandler
= NULL
;
4679 PyObject
*exc
= NULL
;
4680 /* the following variable is used for caching string comparisons
4681 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4682 * 3=ignore, 4=xmlcharrefreplace */
4683 int known_errorHandler
= -1;
4685 /* Default to Latin-1 */
4686 if (mapping
== NULL
)
4687 return PyUnicode_EncodeLatin1(p
, size
, errors
);
4689 /* allocate enough for a simple encoding without
4690 replacements, if we need more, we'll resize */
4691 res
= PyString_FromStringAndSize(NULL
, size
);
4697 while (inpos
<size
) {
4698 /* try to encode it */
4699 charmapencode_result x
= charmapencode_output(p
[inpos
], mapping
, &res
, &respos
);
4700 if (x
==enc_EXCEPTION
) /* error */
4702 if (x
==enc_FAILED
) { /* unencodable character */
4703 if (charmap_encoding_error(p
, size
, &inpos
, mapping
,
4705 &known_errorHandler
, &errorHandler
, errors
,
4711 /* done with this character => adjust input position */
4715 /* Resize if we allocated to much */
4716 if (respos
<PyString_GET_SIZE(res
)) {
4717 if (_PyString_Resize(&res
, respos
))
4721 Py_XDECREF(errorHandler
);
4727 Py_XDECREF(errorHandler
);
4731 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
4734 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
4735 PyErr_BadArgument();
4738 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
4739 PyUnicode_GET_SIZE(unicode
),
4744 /* create or adjust a UnicodeTranslateError */
4745 static void make_translate_exception(PyObject
**exceptionObject
,
4746 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4747 Py_ssize_t startpos
, Py_ssize_t endpos
,
4750 if (*exceptionObject
== NULL
) {
4751 *exceptionObject
= PyUnicodeTranslateError_Create(
4752 unicode
, size
, startpos
, endpos
, reason
);
4755 if (PyUnicodeTranslateError_SetStart(*exceptionObject
, startpos
))
4757 if (PyUnicodeTranslateError_SetEnd(*exceptionObject
, endpos
))
4759 if (PyUnicodeTranslateError_SetReason(*exceptionObject
, reason
))
4763 Py_DECREF(*exceptionObject
);
4764 *exceptionObject
= NULL
;
4768 /* raises a UnicodeTranslateError */
4769 static void raise_translate_exception(PyObject
**exceptionObject
,
4770 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4771 Py_ssize_t startpos
, Py_ssize_t endpos
,
4774 make_translate_exception(exceptionObject
,
4775 unicode
, size
, startpos
, endpos
, reason
);
4776 if (*exceptionObject
!= NULL
)
4777 PyCodec_StrictErrors(*exceptionObject
);
4780 /* error handling callback helper:
4781 build arguments, call the callback and check the arguments,
4782 put the result into newpos and return the replacement string, which
4783 has to be freed by the caller */
4784 static PyObject
*unicode_translate_call_errorhandler(const char *errors
,
4785 PyObject
**errorHandler
,
4787 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
4788 Py_ssize_t startpos
, Py_ssize_t endpos
,
4791 static char *argparse
= "O!n;translating error handler must return (unicode, int) tuple";
4793 Py_ssize_t i_newpos
;
4795 PyObject
*resunicode
;
4797 if (*errorHandler
== NULL
) {
4798 *errorHandler
= PyCodec_LookupError(errors
);
4799 if (*errorHandler
== NULL
)
4803 make_translate_exception(exceptionObject
,
4804 unicode
, size
, startpos
, endpos
, reason
);
4805 if (*exceptionObject
== NULL
)
4808 restuple
= PyObject_CallFunctionObjArgs(
4809 *errorHandler
, *exceptionObject
, NULL
);
4810 if (restuple
== NULL
)
4812 if (!PyTuple_Check(restuple
)) {
4813 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
4814 Py_DECREF(restuple
);
4817 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
4818 &resunicode
, &i_newpos
)) {
4819 Py_DECREF(restuple
);
4823 *newpos
= size
+i_newpos
;
4826 if (*newpos
<0 || *newpos
>size
) {
4827 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
4828 Py_DECREF(restuple
);
4831 Py_INCREF(resunicode
);
4832 Py_DECREF(restuple
);
4836 /* Lookup the character ch in the mapping and put the result in result,
4837 which must be decrefed by the caller.
4838 Return 0 on success, -1 on error */
4840 int charmaptranslate_lookup(Py_UNICODE c
, PyObject
*mapping
, PyObject
**result
)
4842 PyObject
*w
= PyInt_FromLong((long)c
);
4847 x
= PyObject_GetItem(mapping
, w
);
4850 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4851 /* No mapping found means: use 1:1 mapping. */
4858 else if (x
== Py_None
) {
4862 else if (PyInt_Check(x
)) {
4863 long value
= PyInt_AS_LONG(x
);
4864 long max
= PyUnicode_GetMax();
4865 if (value
< 0 || value
> max
) {
4866 PyErr_Format(PyExc_TypeError
,
4867 "character mapping must be in range(0x%lx)", max
+1);
4874 else if (PyUnicode_Check(x
)) {
4879 /* wrong return value */
4880 PyErr_SetString(PyExc_TypeError
,
4881 "character mapping must return integer, None or unicode");
4886 /* ensure that *outobj is at least requiredsize characters long,
4887 if not reallocate and adjust various state variables.
4888 Return 0 on success, -1 on error */
4890 int charmaptranslate_makespace(PyObject
**outobj
, Py_UNICODE
**outp
,
4891 Py_ssize_t requiredsize
)
4893 Py_ssize_t oldsize
= PyUnicode_GET_SIZE(*outobj
);
4894 if (requiredsize
> oldsize
) {
4895 /* remember old output position */
4896 Py_ssize_t outpos
= *outp
-PyUnicode_AS_UNICODE(*outobj
);
4897 /* exponentially overallocate to minimize reallocations */
4898 if (requiredsize
< 2 * oldsize
)
4899 requiredsize
= 2 * oldsize
;
4900 if (PyUnicode_Resize(outobj
, requiredsize
) < 0)
4902 *outp
= PyUnicode_AS_UNICODE(*outobj
) + outpos
;
4906 /* lookup the character, put the result in the output string and adjust
4907 various state variables. Return a new reference to the object that
4908 was put in the output buffer in *result, or Py_None, if the mapping was
4909 undefined (in which case no character was written).
4910 The called must decref result.
4911 Return 0 on success, -1 on error. */
4913 int charmaptranslate_output(const Py_UNICODE
*startinp
, const Py_UNICODE
*curinp
,
4914 Py_ssize_t insize
, PyObject
*mapping
, PyObject
**outobj
, Py_UNICODE
**outp
,
4917 if (charmaptranslate_lookup(*curinp
, mapping
, res
))
4920 /* not found => default to 1:1 mapping */
4921 *(*outp
)++ = *curinp
;
4923 else if (*res
==Py_None
)
4925 else if (PyInt_Check(*res
)) {
4926 /* no overflow check, because we know that the space is enough */
4927 *(*outp
)++ = (Py_UNICODE
)PyInt_AS_LONG(*res
);
4929 else if (PyUnicode_Check(*res
)) {
4930 Py_ssize_t repsize
= PyUnicode_GET_SIZE(*res
);
4932 /* no overflow check, because we know that the space is enough */
4933 *(*outp
)++ = *PyUnicode_AS_UNICODE(*res
);
4935 else if (repsize
!=0) {
4936 /* more than one character */
4937 Py_ssize_t requiredsize
= (*outp
-PyUnicode_AS_UNICODE(*outobj
)) +
4938 (insize
- (curinp
-startinp
)) +
4940 if (charmaptranslate_makespace(outobj
, outp
, requiredsize
))
4942 memcpy(*outp
, PyUnicode_AS_UNICODE(*res
), sizeof(Py_UNICODE
)*repsize
);
4951 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*p
,
4957 PyObject
*res
= NULL
;
4958 /* pointers to the beginning and end+1 of input */
4959 const Py_UNICODE
*startp
= p
;
4960 const Py_UNICODE
*endp
= p
+ size
;
4961 /* pointer into the output */
4963 /* current output position */
4964 Py_ssize_t respos
= 0;
4965 char *reason
= "character maps to <undefined>";
4966 PyObject
*errorHandler
= NULL
;
4967 PyObject
*exc
= NULL
;
4968 /* the following variable is used for caching string comparisons
4969 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4970 * 3=ignore, 4=xmlcharrefreplace */
4971 int known_errorHandler
= -1;
4973 if (mapping
== NULL
) {
4974 PyErr_BadArgument();
4978 /* allocate enough for a simple 1:1 translation without
4979 replacements, if we need more, we'll resize */
4980 res
= PyUnicode_FromUnicode(NULL
, size
);
4985 str
= PyUnicode_AS_UNICODE(res
);
4988 /* try to encode it */
4990 if (charmaptranslate_output(startp
, p
, size
, mapping
, &res
, &str
, &x
)) {
4995 if (x
!=Py_None
) /* it worked => adjust input pointer */
4997 else { /* untranslatable character */
4998 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
5002 /* startpos for collecting untranslatable chars */
5003 const Py_UNICODE
*collstart
= p
;
5004 const Py_UNICODE
*collend
= p
+1;
5005 const Py_UNICODE
*coll
;
5007 /* find all untranslatable characters */
5008 while (collend
< endp
) {
5009 if (charmaptranslate_lookup(*collend
, mapping
, &x
))
5016 /* cache callback name lookup
5017 * (if not done yet, i.e. it's the first error) */
5018 if (known_errorHandler
==-1) {
5019 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
5020 known_errorHandler
= 1;
5021 else if (!strcmp(errors
, "replace"))
5022 known_errorHandler
= 2;
5023 else if (!strcmp(errors
, "ignore"))
5024 known_errorHandler
= 3;
5025 else if (!strcmp(errors
, "xmlcharrefreplace"))
5026 known_errorHandler
= 4;
5028 known_errorHandler
= 0;
5030 switch (known_errorHandler
) {
5031 case 1: /* strict */
5032 raise_translate_exception(&exc
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
5034 case 2: /* replace */
5035 /* No need to check for space, this is a 1:1 replacement */
5036 for (coll
= collstart
; coll
<collend
; ++coll
)
5039 case 3: /* ignore */
5042 case 4: /* xmlcharrefreplace */
5043 /* generate replacement (temporarily (mis)uses p) */
5044 for (p
= collstart
; p
< collend
; ++p
) {
5045 char buffer
[2+29+1+1];
5047 sprintf(buffer
, "&#%d;", (int)*p
);
5048 if (charmaptranslate_makespace(&res
, &str
,
5049 (str
-PyUnicode_AS_UNICODE(res
))+strlen(buffer
)+(endp
-collend
)))
5051 for (cp
= buffer
; *cp
; ++cp
)
5057 repunicode
= unicode_translate_call_errorhandler(errors
, &errorHandler
,
5058 reason
, startp
, size
, &exc
,
5059 collstart
-startp
, collend
-startp
, &newpos
);
5060 if (repunicode
== NULL
)
5062 /* generate replacement */
5063 repsize
= PyUnicode_GET_SIZE(repunicode
);
5064 if (charmaptranslate_makespace(&res
, &str
,
5065 (str
-PyUnicode_AS_UNICODE(res
))+repsize
+(endp
-collend
))) {
5066 Py_DECREF(repunicode
);
5069 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
)
5071 p
= startp
+ newpos
;
5072 Py_DECREF(repunicode
);
5076 /* Resize if we allocated to much */
5077 respos
= str
-PyUnicode_AS_UNICODE(res
);
5078 if (respos
<PyUnicode_GET_SIZE(res
)) {
5079 if (PyUnicode_Resize(&res
, respos
) < 0)
5083 Py_XDECREF(errorHandler
);
5089 Py_XDECREF(errorHandler
);
5093 PyObject
*PyUnicode_Translate(PyObject
*str
,
5099 str
= PyUnicode_FromObject(str
);
5102 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
5103 PyUnicode_GET_SIZE(str
),
5114 /* --- Decimal Encoder ---------------------------------------------------- */
5116 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
5121 Py_UNICODE
*p
, *end
;
5122 PyObject
*errorHandler
= NULL
;
5123 PyObject
*exc
= NULL
;
5124 const char *encoding
= "decimal";
5125 const char *reason
= "invalid decimal Unicode string";
5126 /* the following variable is used for caching string comparisons
5127 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5128 int known_errorHandler
= -1;
5130 if (output
== NULL
) {
5131 PyErr_BadArgument();
5138 register Py_UNICODE ch
= *p
;
5140 PyObject
*repunicode
;
5144 Py_UNICODE
*collstart
;
5145 Py_UNICODE
*collend
;
5147 if (Py_UNICODE_ISSPACE(ch
)) {
5152 decimal
= Py_UNICODE_TODECIMAL(ch
);
5154 *output
++ = '0' + decimal
;
5158 if (0 < ch
&& ch
< 256) {
5159 *output
++ = (char)ch
;
5163 /* All other characters are considered unencodable */
5166 while (collend
< end
) {
5167 if ((0 < *collend
&& *collend
< 256) ||
5168 !Py_UNICODE_ISSPACE(*collend
) ||
5169 Py_UNICODE_TODECIMAL(*collend
))
5172 /* cache callback name lookup
5173 * (if not done yet, i.e. it's the first error) */
5174 if (known_errorHandler
==-1) {
5175 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
5176 known_errorHandler
= 1;
5177 else if (!strcmp(errors
, "replace"))
5178 known_errorHandler
= 2;
5179 else if (!strcmp(errors
, "ignore"))
5180 known_errorHandler
= 3;
5181 else if (!strcmp(errors
, "xmlcharrefreplace"))
5182 known_errorHandler
= 4;
5184 known_errorHandler
= 0;
5186 switch (known_errorHandler
) {
5187 case 1: /* strict */
5188 raise_encode_exception(&exc
, encoding
, s
, length
, collstart
-s
, collend
-s
, reason
);
5190 case 2: /* replace */
5191 for (p
= collstart
; p
< collend
; ++p
)
5194 case 3: /* ignore */
5197 case 4: /* xmlcharrefreplace */
5198 /* generate replacement (temporarily (mis)uses p) */
5199 for (p
= collstart
; p
< collend
; ++p
)
5200 output
+= sprintf(output
, "&#%d;", (int)*p
);
5204 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
5205 encoding
, reason
, s
, length
, &exc
,
5206 collstart
-s
, collend
-s
, &newpos
);
5207 if (repunicode
== NULL
)
5209 /* generate replacement */
5210 repsize
= PyUnicode_GET_SIZE(repunicode
);
5211 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
5212 Py_UNICODE ch
= *uni2
;
5213 if (Py_UNICODE_ISSPACE(ch
))
5216 decimal
= Py_UNICODE_TODECIMAL(ch
);
5218 *output
++ = '0' + decimal
;
5219 else if (0 < ch
&& ch
< 256)
5220 *output
++ = (char)ch
;
5222 Py_DECREF(repunicode
);
5223 raise_encode_exception(&exc
, encoding
,
5224 s
, length
, collstart
-s
, collend
-s
, reason
);
5230 Py_DECREF(repunicode
);
5233 /* 0-terminate the output string */
5236 Py_XDECREF(errorHandler
);
5241 Py_XDECREF(errorHandler
);
5245 /* --- Helpers ------------------------------------------------------------ */
5247 #include "stringlib/unicodedefs.h"
5249 #define FROM_UNICODE
5251 #include "stringlib/fastsearch.h"
5253 #include "stringlib/count.h"
5254 #include "stringlib/find.h"
5255 #include "stringlib/partition.h"
5257 /* helper macro to fixup start/end slice values */
5258 #define FIX_START_END(obj) \
5260 start += (obj)->length; \
5263 if (end > (obj)->length) \
5264 end = (obj)->length; \
5266 end += (obj)->length; \
5270 Py_ssize_t
PyUnicode_Count(PyObject
*str
,
5276 PyUnicodeObject
* str_obj
;
5277 PyUnicodeObject
* sub_obj
;
5279 str_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(str
);
5282 sub_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(substr
);
5288 FIX_START_END(str_obj
);
5290 result
= stringlib_count(
5291 str_obj
->str
+ start
, end
- start
, sub_obj
->str
, sub_obj
->length
5300 Py_ssize_t
PyUnicode_Find(PyObject
*str
,
5308 str
= PyUnicode_FromObject(str
);
5311 sub
= PyUnicode_FromObject(sub
);
5318 result
= stringlib_find_slice(
5319 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5320 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5324 result
= stringlib_rfind_slice(
5325 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5326 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5337 int tailmatch(PyUnicodeObject
*self
,
5338 PyUnicodeObject
*substring
,
5343 if (substring
->length
== 0)
5346 FIX_START_END(self
);
5348 end
-= substring
->length
;
5352 if (direction
> 0) {
5353 if (Py_UNICODE_MATCH(self
, end
, substring
))
5356 if (Py_UNICODE_MATCH(self
, start
, substring
))
5363 Py_ssize_t
PyUnicode_Tailmatch(PyObject
*str
,
5371 str
= PyUnicode_FromObject(str
);
5374 substr
= PyUnicode_FromObject(substr
);
5375 if (substr
== NULL
) {
5380 result
= tailmatch((PyUnicodeObject
*)str
,
5381 (PyUnicodeObject
*)substr
,
5382 start
, end
, direction
);
5388 /* Apply fixfct filter to the Unicode object self and return a
5389 reference to the modified object */
5392 PyObject
*fixup(PyUnicodeObject
*self
,
5393 int (*fixfct
)(PyUnicodeObject
*s
))
5398 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5402 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5404 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
5405 /* fixfct should return TRUE if it modified the buffer. If
5406 FALSE, return a reference to the original buffer instead
5407 (to save space, not time) */
5410 return (PyObject
*) self
;
5412 return (PyObject
*) u
;
5416 int fixupper(PyUnicodeObject
*self
)
5418 Py_ssize_t len
= self
->length
;
5419 Py_UNICODE
*s
= self
->str
;
5423 register Py_UNICODE ch
;
5425 ch
= Py_UNICODE_TOUPPER(*s
);
5437 int fixlower(PyUnicodeObject
*self
)
5439 Py_ssize_t len
= self
->length
;
5440 Py_UNICODE
*s
= self
->str
;
5444 register Py_UNICODE ch
;
5446 ch
= Py_UNICODE_TOLOWER(*s
);
5458 int fixswapcase(PyUnicodeObject
*self
)
5460 Py_ssize_t len
= self
->length
;
5461 Py_UNICODE
*s
= self
->str
;
5465 if (Py_UNICODE_ISUPPER(*s
)) {
5466 *s
= Py_UNICODE_TOLOWER(*s
);
5468 } else if (Py_UNICODE_ISLOWER(*s
)) {
5469 *s
= Py_UNICODE_TOUPPER(*s
);
5479 int fixcapitalize(PyUnicodeObject
*self
)
5481 Py_ssize_t len
= self
->length
;
5482 Py_UNICODE
*s
= self
->str
;
5487 if (Py_UNICODE_ISLOWER(*s
)) {
5488 *s
= Py_UNICODE_TOUPPER(*s
);
5493 if (Py_UNICODE_ISUPPER(*s
)) {
5494 *s
= Py_UNICODE_TOLOWER(*s
);
5503 int fixtitle(PyUnicodeObject
*self
)
5505 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5506 register Py_UNICODE
*e
;
5507 int previous_is_cased
;
5509 /* Shortcut for single character strings */
5510 if (PyUnicode_GET_SIZE(self
) == 1) {
5511 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
5520 e
= p
+ PyUnicode_GET_SIZE(self
);
5521 previous_is_cased
= 0;
5522 for (; p
< e
; p
++) {
5523 register const Py_UNICODE ch
= *p
;
5525 if (previous_is_cased
)
5526 *p
= Py_UNICODE_TOLOWER(ch
);
5528 *p
= Py_UNICODE_TOTITLE(ch
);
5530 if (Py_UNICODE_ISLOWER(ch
) ||
5531 Py_UNICODE_ISUPPER(ch
) ||
5532 Py_UNICODE_ISTITLE(ch
))
5533 previous_is_cased
= 1;
5535 previous_is_cased
= 0;
5541 PyUnicode_Join(PyObject
*separator
, PyObject
*seq
)
5543 PyObject
*internal_separator
= NULL
;
5544 const Py_UNICODE blank
= ' ';
5545 const Py_UNICODE
*sep
= &blank
;
5546 Py_ssize_t seplen
= 1;
5547 PyUnicodeObject
*res
= NULL
; /* the result */
5548 Py_ssize_t res_alloc
= 100; /* # allocated bytes for string in res */
5549 Py_ssize_t res_used
; /* # used bytes */
5550 Py_UNICODE
*res_p
; /* pointer to free byte in res's string area */
5551 PyObject
*fseq
; /* PySequence_Fast(seq) */
5552 Py_ssize_t seqlen
; /* len(fseq) -- number of items in sequence */
5556 fseq
= PySequence_Fast(seq
, "");
5561 /* Grrrr. A codec may be invoked to convert str objects to
5562 * Unicode, and so it's possible to call back into Python code
5563 * during PyUnicode_FromObject(), and so it's possible for a sick
5564 * codec to change the size of fseq (if seq is a list). Therefore
5565 * we have to keep refetching the size -- can't assume seqlen
5568 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5569 /* If empty sequence, return u"". */
5571 res
= _PyUnicode_New(0); /* empty sequence; return u"" */
5574 /* If singleton sequence with an exact Unicode, return that. */
5576 item
= PySequence_Fast_GET_ITEM(fseq
, 0);
5577 if (PyUnicode_CheckExact(item
)) {
5579 res
= (PyUnicodeObject
*)item
;
5584 /* At least two items to join, or one that isn't exact Unicode. */
5586 /* Set up sep and seplen -- they're needed. */
5587 if (separator
== NULL
) {
5592 internal_separator
= PyUnicode_FromObject(separator
);
5593 if (internal_separator
== NULL
)
5595 sep
= PyUnicode_AS_UNICODE(internal_separator
);
5596 seplen
= PyUnicode_GET_SIZE(internal_separator
);
5597 /* In case PyUnicode_FromObject() mutated seq. */
5598 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5603 res
= _PyUnicode_New(res_alloc
);
5606 res_p
= PyUnicode_AS_UNICODE(res
);
5609 for (i
= 0; i
< seqlen
; ++i
) {
5611 Py_ssize_t new_res_used
;
5613 item
= PySequence_Fast_GET_ITEM(fseq
, i
);
5614 /* Convert item to Unicode. */
5615 if (! PyUnicode_Check(item
) && ! PyString_Check(item
)) {
5616 PyErr_Format(PyExc_TypeError
,
5617 "sequence item %zd: expected string or Unicode,"
5619 i
, Py_TYPE(item
)->tp_name
);
5622 item
= PyUnicode_FromObject(item
);
5625 /* We own a reference to item from here on. */
5627 /* In case PyUnicode_FromObject() mutated seq. */
5628 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5630 /* Make sure we have enough space for the separator and the item. */
5631 itemlen
= PyUnicode_GET_SIZE(item
);
5632 new_res_used
= res_used
+ itemlen
;
5633 if (new_res_used
< 0)
5635 if (i
< seqlen
- 1) {
5636 new_res_used
+= seplen
;
5637 if (new_res_used
< 0)
5640 if (new_res_used
> res_alloc
) {
5641 /* double allocated size until it's big enough */
5643 res_alloc
+= res_alloc
;
5646 } while (new_res_used
> res_alloc
);
5647 if (_PyUnicode_Resize(&res
, res_alloc
) < 0) {
5651 res_p
= PyUnicode_AS_UNICODE(res
) + res_used
;
5654 /* Copy item, and maybe the separator. */
5655 Py_UNICODE_COPY(res_p
, PyUnicode_AS_UNICODE(item
), itemlen
);
5657 if (i
< seqlen
- 1) {
5658 Py_UNICODE_COPY(res_p
, sep
, seplen
);
5662 res_used
= new_res_used
;
5665 /* Shrink res to match the used area; this probably can't fail,
5666 * but it's cheap to check.
5668 if (_PyUnicode_Resize(&res
, res_used
) < 0)
5672 Py_XDECREF(internal_separator
);
5674 return (PyObject
*)res
;
5677 PyErr_SetString(PyExc_OverflowError
,
5678 "join() result is too long for a Python string");
5683 Py_XDECREF(internal_separator
);
5690 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
5702 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
5707 if (left
> PY_SSIZE_T_MAX
- self
->length
||
5708 right
> PY_SSIZE_T_MAX
- (left
+ self
->length
)) {
5709 PyErr_SetString(PyExc_OverflowError
, "padded string is too long");
5712 u
= _PyUnicode_New(left
+ self
->length
+ right
);
5715 Py_UNICODE_FILL(u
->str
, fill
, left
);
5716 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
5718 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
5724 #define SPLIT_APPEND(data, left, right) \
5725 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
5728 if (PyList_Append(list, str)) { \
5736 PyObject
*split_whitespace(PyUnicodeObject
*self
,
5738 Py_ssize_t maxcount
)
5740 register Py_ssize_t i
;
5741 register Py_ssize_t j
;
5742 Py_ssize_t len
= self
->length
;
5744 register const Py_UNICODE
*buf
= self
->str
;
5746 for (i
= j
= 0; i
< len
; ) {
5748 while (i
< len
&& Py_UNICODE_ISSPACE(buf
[i
]))
5751 while (i
< len
&& !Py_UNICODE_ISSPACE(buf
[i
]))
5754 if (maxcount
-- <= 0)
5756 SPLIT_APPEND(buf
, j
, i
);
5757 while (i
< len
&& Py_UNICODE_ISSPACE(buf
[i
]))
5763 SPLIT_APPEND(buf
, j
, len
);
5772 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
5775 register Py_ssize_t i
;
5776 register Py_ssize_t j
;
5782 string
= PyUnicode_FromObject(string
);
5785 data
= PyUnicode_AS_UNICODE(string
);
5786 len
= PyUnicode_GET_SIZE(string
);
5788 list
= PyList_New(0);
5792 for (i
= j
= 0; i
< len
; ) {
5795 /* Find a line and append it */
5796 while (i
< len
&& !BLOOM_LINEBREAK(data
[i
]))
5799 /* Skip the line break reading CRLF as one line break */
5802 if (data
[i
] == '\r' && i
+ 1 < len
&&
5810 SPLIT_APPEND(data
, j
, eol
);
5814 SPLIT_APPEND(data
, j
, len
);
5827 PyObject
*split_char(PyUnicodeObject
*self
,
5830 Py_ssize_t maxcount
)
5832 register Py_ssize_t i
;
5833 register Py_ssize_t j
;
5834 Py_ssize_t len
= self
->length
;
5836 register const Py_UNICODE
*buf
= self
->str
;
5838 for (i
= j
= 0; i
< len
; ) {
5840 if (maxcount
-- <= 0)
5842 SPLIT_APPEND(buf
, j
, i
);
5848 SPLIT_APPEND(buf
, j
, len
);
5858 PyObject
*split_substring(PyUnicodeObject
*self
,
5860 PyUnicodeObject
*substring
,
5861 Py_ssize_t maxcount
)
5863 register Py_ssize_t i
;
5864 register Py_ssize_t j
;
5865 Py_ssize_t len
= self
->length
;
5866 Py_ssize_t sublen
= substring
->length
;
5869 for (i
= j
= 0; i
<= len
- sublen
; ) {
5870 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
5871 if (maxcount
-- <= 0)
5873 SPLIT_APPEND(self
->str
, j
, i
);
5879 SPLIT_APPEND(self
->str
, j
, len
);
5889 PyObject
*rsplit_whitespace(PyUnicodeObject
*self
,
5891 Py_ssize_t maxcount
)
5893 register Py_ssize_t i
;
5894 register Py_ssize_t j
;
5895 Py_ssize_t len
= self
->length
;
5897 register const Py_UNICODE
*buf
= self
->str
;
5899 for (i
= j
= len
- 1; i
>= 0; ) {
5901 while (i
>= 0 && Py_UNICODE_ISSPACE(buf
[i
]))
5904 while (i
>= 0 && !Py_UNICODE_ISSPACE(buf
[i
]))
5907 if (maxcount
-- <= 0)
5909 SPLIT_APPEND(buf
, i
+ 1, j
+ 1);
5910 while (i
>= 0 && Py_UNICODE_ISSPACE(buf
[i
]))
5916 SPLIT_APPEND(buf
, 0, j
+ 1);
5918 if (PyList_Reverse(list
) < 0)
5928 PyObject
*rsplit_char(PyUnicodeObject
*self
,
5931 Py_ssize_t maxcount
)
5933 register Py_ssize_t i
;
5934 register Py_ssize_t j
;
5935 Py_ssize_t len
= self
->length
;
5937 register const Py_UNICODE
*buf
= self
->str
;
5939 for (i
= j
= len
- 1; i
>= 0; ) {
5941 if (maxcount
-- <= 0)
5943 SPLIT_APPEND(buf
, i
+ 1, j
+ 1);
5949 SPLIT_APPEND(buf
, 0, j
+ 1);
5951 if (PyList_Reverse(list
) < 0)
5961 PyObject
*rsplit_substring(PyUnicodeObject
*self
,
5963 PyUnicodeObject
*substring
,
5964 Py_ssize_t maxcount
)
5966 register Py_ssize_t i
;
5967 register Py_ssize_t j
;
5968 Py_ssize_t len
= self
->length
;
5969 Py_ssize_t sublen
= substring
->length
;
5972 for (i
= len
- sublen
, j
= len
; i
>= 0; ) {
5973 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
5974 if (maxcount
-- <= 0)
5976 SPLIT_APPEND(self
->str
, i
+ sublen
, j
);
5983 SPLIT_APPEND(self
->str
, 0, j
);
5985 if (PyList_Reverse(list
) < 0)
5997 PyObject
*split(PyUnicodeObject
*self
,
5998 PyUnicodeObject
*substring
,
5999 Py_ssize_t maxcount
)
6004 maxcount
= PY_SSIZE_T_MAX
;
6006 list
= PyList_New(0);
6010 if (substring
== NULL
)
6011 return split_whitespace(self
,list
,maxcount
);
6013 else if (substring
->length
== 1)
6014 return split_char(self
,list
,substring
->str
[0],maxcount
);
6016 else if (substring
->length
== 0) {
6018 PyErr_SetString(PyExc_ValueError
, "empty separator");
6022 return split_substring(self
,list
,substring
,maxcount
);
6026 PyObject
*rsplit(PyUnicodeObject
*self
,
6027 PyUnicodeObject
*substring
,
6028 Py_ssize_t maxcount
)
6033 maxcount
= PY_SSIZE_T_MAX
;
6035 list
= PyList_New(0);
6039 if (substring
== NULL
)
6040 return rsplit_whitespace(self
,list
,maxcount
);
6042 else if (substring
->length
== 1)
6043 return rsplit_char(self
,list
,substring
->str
[0],maxcount
);
6045 else if (substring
->length
== 0) {
6047 PyErr_SetString(PyExc_ValueError
, "empty separator");
6051 return rsplit_substring(self
,list
,substring
,maxcount
);
6055 PyObject
*replace(PyUnicodeObject
*self
,
6056 PyUnicodeObject
*str1
,
6057 PyUnicodeObject
*str2
,
6058 Py_ssize_t maxcount
)
6063 maxcount
= PY_SSIZE_T_MAX
;
6065 if (str1
->length
== str2
->length
) {
6068 if (str1
->length
== 1) {
6069 /* replace characters */
6071 if (!findchar(self
->str
, self
->length
, str1
->str
[0]))
6073 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
6076 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
6079 for (i
= 0; i
< u
->length
; i
++)
6080 if (u
->str
[i
] == u1
) {
6087 self
->str
, self
->length
, str1
->str
, str1
->length
, FAST_SEARCH
6091 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
6094 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
6095 while (i
<= self
->length
- str1
->length
)
6096 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
6099 Py_UNICODE_COPY(u
->str
+i
, str2
->str
, str2
->length
);
6106 Py_ssize_t n
, i
, j
, e
;
6107 Py_ssize_t product
, new_size
, delta
;
6110 /* replace strings */
6111 n
= stringlib_count(self
->str
, self
->length
, str1
->str
, str1
->length
);
6116 /* new_size = self->length + n * (str2->length - str1->length)); */
6117 delta
= (str2
->length
- str1
->length
);
6119 new_size
= self
->length
;
6121 product
= n
* (str2
->length
- str1
->length
);
6122 if ((product
/ (str2
->length
- str1
->length
)) != n
) {
6123 PyErr_SetString(PyExc_OverflowError
,
6124 "replace string is too long");
6127 new_size
= self
->length
+ product
;
6129 PyErr_SetString(PyExc_OverflowError
,
6130 "replace string is too long");
6134 u
= _PyUnicode_New(new_size
);
6139 e
= self
->length
- str1
->length
;
6140 if (str1
->length
> 0) {
6142 /* look for next match */
6145 if (Py_UNICODE_MATCH(self
, j
, str1
))
6152 /* copy unchanged part [i:j] */
6153 Py_UNICODE_COPY(p
, self
->str
+i
, j
-i
);
6156 /* copy substitution string */
6157 if (str2
->length
> 0) {
6158 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
6161 i
= j
+ str1
->length
;
6163 if (i
< self
->length
)
6164 /* copy tail [i:] */
6165 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
6169 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
6173 *p
++ = self
->str
[i
++];
6175 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
6178 return (PyObject
*) u
;
6181 /* nothing to replace; return original string (when possible) */
6182 if (PyUnicode_CheckExact(self
)) {
6184 return (PyObject
*) self
;
6186 return PyUnicode_FromUnicode(self
->str
, self
->length
);
6189 /* --- Unicode Object Methods --------------------------------------------- */
6191 PyDoc_STRVAR(title__doc__
,
6192 "S.title() -> unicode\n\
6194 Return a titlecased version of S, i.e. words start with title case\n\
6195 characters, all remaining cased characters have lower case.");
6198 unicode_title(PyUnicodeObject
*self
)
6200 return fixup(self
, fixtitle
);
6203 PyDoc_STRVAR(capitalize__doc__
,
6204 "S.capitalize() -> unicode\n\
6206 Return a capitalized version of S, i.e. make the first character\n\
6210 unicode_capitalize(PyUnicodeObject
*self
)
6212 return fixup(self
, fixcapitalize
);
6216 PyDoc_STRVAR(capwords__doc__
,
6217 "S.capwords() -> unicode\n\
6219 Apply .capitalize() to all words in S and return the result with\n\
6220 normalized whitespace (all whitespace strings are replaced by ' ').");
6223 unicode_capwords(PyUnicodeObject
*self
)
6229 /* Split into words */
6230 list
= split(self
, NULL
, -1);
6234 /* Capitalize each word */
6235 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
6236 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
6240 Py_DECREF(PyList_GET_ITEM(list
, i
));
6241 PyList_SET_ITEM(list
, i
, item
);
6244 /* Join the words to form a new string */
6245 item
= PyUnicode_Join(NULL
, list
);
6249 return (PyObject
*)item
;
6253 /* Argument converter. Coerces to a single unicode character */
6256 convert_uc(PyObject
*obj
, void *addr
)
6258 Py_UNICODE
*fillcharloc
= (Py_UNICODE
*)addr
;
6262 uniobj
= PyUnicode_FromObject(obj
);
6263 if (uniobj
== NULL
) {
6264 PyErr_SetString(PyExc_TypeError
,
6265 "The fill character cannot be converted to Unicode");
6268 if (PyUnicode_GET_SIZE(uniobj
) != 1) {
6269 PyErr_SetString(PyExc_TypeError
,
6270 "The fill character must be exactly one character long");
6274 unistr
= PyUnicode_AS_UNICODE(uniobj
);
6275 *fillcharloc
= unistr
[0];
6280 PyDoc_STRVAR(center__doc__
,
6281 "S.center(width[, fillchar]) -> unicode\n\
6283 Return S centered in a Unicode string of length width. Padding is\n\
6284 done using the specified fill character (default is a space)");
6287 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
6289 Py_ssize_t marg
, left
;
6291 Py_UNICODE fillchar
= ' ';
6293 if (!PyArg_ParseTuple(args
, "n|O&:center", &width
, convert_uc
, &fillchar
))
6296 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6298 return (PyObject
*) self
;
6301 marg
= width
- self
->length
;
6302 left
= marg
/ 2 + (marg
& width
& 1);
6304 return (PyObject
*) pad(self
, left
, marg
- left
, fillchar
);
6309 /* This code should go into some future Unicode collation support
6310 module. The basic comparison should compare ordinals on a naive
6311 basis (this is what Java does and thus JPython too). */
6313 /* speedy UTF-16 code point order comparison */
6315 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6317 static short utf16Fixup
[32] =
6319 0, 0, 0, 0, 0, 0, 0, 0,
6320 0, 0, 0, 0, 0, 0, 0, 0,
6321 0, 0, 0, 0, 0, 0, 0, 0,
6322 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6326 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6328 Py_ssize_t len1
, len2
;
6330 Py_UNICODE
*s1
= str1
->str
;
6331 Py_UNICODE
*s2
= str2
->str
;
6333 len1
= str1
->length
;
6334 len2
= str2
->length
;
6336 while (len1
> 0 && len2
> 0) {
6342 if (c1
> (1<<11) * 26)
6343 c1
+= utf16Fixup
[c1
>>11];
6344 if (c2
> (1<<11) * 26)
6345 c2
+= utf16Fixup
[c2
>>11];
6346 /* now c1 and c2 are in UTF-32-compatible order */
6349 return (c1
< c2
) ? -1 : 1;
6354 return (len1
< len2
) ? -1 : (len1
!= len2
);
6360 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6362 register Py_ssize_t len1
, len2
;
6364 Py_UNICODE
*s1
= str1
->str
;
6365 Py_UNICODE
*s2
= str2
->str
;
6367 len1
= str1
->length
;
6368 len2
= str2
->length
;
6370 while (len1
> 0 && len2
> 0) {
6377 return (c1
< c2
) ? -1 : 1;
6382 return (len1
< len2
) ? -1 : (len1
!= len2
);
6387 int PyUnicode_Compare(PyObject
*left
,
6390 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
6393 /* Coerce the two arguments */
6394 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6397 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6401 /* Shortcut for empty or interned objects */
6408 result
= unicode_compare(u
, v
);
6420 PyObject
*PyUnicode_RichCompare(PyObject
*left
,
6426 result
= PyUnicode_Compare(left
, right
);
6427 if (result
== -1 && PyErr_Occurred())
6430 /* Convert the return value to a Boolean */
6433 result
= (result
== 0);
6436 result
= (result
!= 0);
6439 result
= (result
<= 0);
6442 result
= (result
>= 0);
6445 result
= (result
== -1);
6448 result
= (result
== 1);
6451 return PyBool_FromLong(result
);
6457 Type errors mean that PyUnicode_FromObject() could not convert
6458 one of the arguments (usually the right hand side) to Unicode,
6459 i.e. we can't handle the comparison request. However, it is
6460 possible that the other object knows a comparison method, which
6461 is why we return Py_NotImplemented to give the other object a
6465 if (PyErr_ExceptionMatches(PyExc_TypeError
)) {
6467 Py_INCREF(Py_NotImplemented
);
6468 return Py_NotImplemented
;
6470 if (op
!= Py_EQ
&& op
!= Py_NE
)
6473 /* Equality comparison.
6475 This is a special case: we silence any PyExc_UnicodeDecodeError
6476 and instead turn it into a PyExc_UnicodeWarning.
6479 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError
))
6482 if (PyErr_Warn(PyExc_UnicodeWarning
,
6484 "Unicode equal comparison "
6485 "failed to convert both arguments to Unicode - "
6486 "interpreting them as being unequal" :
6487 "Unicode unequal comparison "
6488 "failed to convert both arguments to Unicode - "
6489 "interpreting them as being unequal"
6492 result
= (op
== Py_NE
);
6493 return PyBool_FromLong(result
);
6496 int PyUnicode_Contains(PyObject
*container
,
6499 PyObject
*str
, *sub
;
6502 /* Coerce the two arguments */
6503 sub
= PyUnicode_FromObject(element
);
6505 PyErr_SetString(PyExc_TypeError
,
6506 "'in <string>' requires string as left operand");
6510 str
= PyUnicode_FromObject(container
);
6516 result
= stringlib_contains_obj(str
, sub
);
6524 /* Concat to string or Unicode object giving a new Unicode object. */
6526 PyObject
*PyUnicode_Concat(PyObject
*left
,
6529 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
6531 /* Coerce the two arguments */
6532 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6535 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6540 if (v
== unicode_empty
) {
6542 return (PyObject
*)u
;
6544 if (u
== unicode_empty
) {
6546 return (PyObject
*)v
;
6549 /* Concat the two Unicode strings */
6550 w
= _PyUnicode_New(u
->length
+ v
->length
);
6553 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
6554 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
6558 return (PyObject
*)w
;
6566 PyDoc_STRVAR(count__doc__
,
6567 "S.count(sub[, start[, end]]) -> int\n\
6569 Return the number of non-overlapping occurrences of substring sub in\n\
6570 Unicode string S[start:end]. Optional arguments start and end are\n\
6571 interpreted as in slice notation.");
6574 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
6576 PyUnicodeObject
*substring
;
6577 Py_ssize_t start
= 0;
6578 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6581 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
6582 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6585 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
6586 (PyObject
*)substring
);
6587 if (substring
== NULL
)
6590 FIX_START_END(self
);
6592 result
= PyInt_FromSsize_t(
6593 stringlib_count(self
->str
+ start
, end
- start
,
6594 substring
->str
, substring
->length
)
6597 Py_DECREF(substring
);
6602 PyDoc_STRVAR(encode__doc__
,
6603 "S.encode([encoding[,errors]]) -> string or unicode\n\
6605 Encodes S using the codec registered for encoding. encoding defaults\n\
6606 to the default encoding. errors may be given to set a different error\n\
6607 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6608 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6609 'xmlcharrefreplace' as well as any other name registered with\n\
6610 codecs.register_error that can handle UnicodeEncodeErrors.");
6613 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
6615 char *encoding
= NULL
;
6616 char *errors
= NULL
;
6619 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
6621 v
= PyUnicode_AsEncodedObject((PyObject
*)self
, encoding
, errors
);
6624 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6625 PyErr_Format(PyExc_TypeError
,
6626 "encoder did not return a string/unicode object "
6628 Py_TYPE(v
)->tp_name
);
6638 PyDoc_STRVAR(decode__doc__
,
6639 "S.decode([encoding[,errors]]) -> string or unicode\n\
6641 Decodes S using the codec registered for encoding. encoding defaults\n\
6642 to the default encoding. errors may be given to set a different error\n\
6643 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6644 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6645 as well as any other name registered with codecs.register_error that is\n\
6646 able to handle UnicodeDecodeErrors.");
6649 unicode_decode(PyUnicodeObject
*self
, PyObject
*args
)
6651 char *encoding
= NULL
;
6652 char *errors
= NULL
;
6655 if (!PyArg_ParseTuple(args
, "|ss:decode", &encoding
, &errors
))
6657 v
= PyUnicode_AsDecodedObject((PyObject
*)self
, encoding
, errors
);
6660 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6661 PyErr_Format(PyExc_TypeError
,
6662 "decoder did not return a string/unicode object "
6664 Py_TYPE(v
)->tp_name
);
6674 PyDoc_STRVAR(expandtabs__doc__
,
6675 "S.expandtabs([tabsize]) -> unicode\n\
6677 Return a copy of S where all tab characters are expanded using spaces.\n\
6678 If tabsize is not given, a tab size of 8 characters is assumed.");
6681 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
6687 Py_ssize_t i
, j
, incr
;
6691 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
6694 /* First pass: determine size of output string */
6695 i
= 0; /* chars up to and including most recent \n or \r */
6696 j
= 0; /* chars since most recent \n or \r (use in tab calculations) */
6697 e
= self
->str
+ self
->length
; /* end of input */
6698 for (p
= self
->str
; p
< e
; p
++)
6701 incr
= tabsize
- (j
% tabsize
); /* cannot overflow */
6702 if (j
> PY_SSIZE_T_MAX
- incr
)
6708 if (j
> PY_SSIZE_T_MAX
- 1)
6711 if (*p
== '\n' || *p
== '\r') {
6712 if (i
> PY_SSIZE_T_MAX
- j
)
6719 if (i
> PY_SSIZE_T_MAX
- j
)
6722 /* Second pass: create output string and fill it */
6723 u
= _PyUnicode_New(i
+ j
);
6727 j
= 0; /* same as in first pass */
6728 q
= u
->str
; /* next output char */
6729 qe
= u
->str
+ u
->length
; /* end of output */
6731 for (p
= self
->str
; p
< e
; p
++)
6734 i
= tabsize
- (j
% tabsize
);
6748 if (*p
== '\n' || *p
== '\r')
6752 return (PyObject
*) u
;
6757 PyErr_SetString(PyExc_OverflowError
, "new string is too long");
6761 PyDoc_STRVAR(find__doc__
,
6762 "S.find(sub [,start [,end]]) -> int\n\
6764 Return the lowest index in S where substring sub is found,\n\
6765 such that sub is contained within s[start:end]. Optional\n\
6766 arguments start and end are interpreted as in slice notation.\n\
6768 Return -1 on failure.");
6771 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
6773 PyObject
*substring
;
6778 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
6781 result
= stringlib_find_slice(
6782 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6783 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6787 Py_DECREF(substring
);
6789 return PyInt_FromSsize_t(result
);
6793 unicode_getitem(PyUnicodeObject
*self
, Py_ssize_t index
)
6795 if (index
< 0 || index
>= self
->length
) {
6796 PyErr_SetString(PyExc_IndexError
, "string index out of range");
6800 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
6804 unicode_hash(PyUnicodeObject
*self
)
6806 /* Since Unicode objects compare equal to their ASCII string
6807 counterparts, they should use the individual character values
6808 as basis for their hash value. This is needed to assure that
6809 strings and Unicode objects behave in the same way as
6812 register Py_ssize_t len
;
6813 register Py_UNICODE
*p
;
6816 if (self
->hash
!= -1)
6818 len
= PyUnicode_GET_SIZE(self
);
6819 p
= PyUnicode_AS_UNICODE(self
);
6822 x
= (1000003*x
) ^ *p
++;
6823 x
^= PyUnicode_GET_SIZE(self
);
6830 PyDoc_STRVAR(index__doc__
,
6831 "S.index(sub [,start [,end]]) -> int\n\
6833 Like S.find() but raise ValueError when the substring is not found.");
6836 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
6839 PyObject
*substring
;
6843 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
6846 result
= stringlib_find_slice(
6847 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6848 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6852 Py_DECREF(substring
);
6855 PyErr_SetString(PyExc_ValueError
, "substring not found");
6859 return PyInt_FromSsize_t(result
);
6862 PyDoc_STRVAR(islower__doc__
,
6863 "S.islower() -> bool\n\
6865 Return True if all cased characters in S are lowercase and there is\n\
6866 at least one cased character in S, False otherwise.");
6869 unicode_islower(PyUnicodeObject
*self
)
6871 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6872 register const Py_UNICODE
*e
;
6875 /* Shortcut for single character strings */
6876 if (PyUnicode_GET_SIZE(self
) == 1)
6877 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p
));
6879 /* Special case for empty strings */
6880 if (PyUnicode_GET_SIZE(self
) == 0)
6881 return PyBool_FromLong(0);
6883 e
= p
+ PyUnicode_GET_SIZE(self
);
6885 for (; p
< e
; p
++) {
6886 register const Py_UNICODE ch
= *p
;
6888 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
6889 return PyBool_FromLong(0);
6890 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
6893 return PyBool_FromLong(cased
);
6896 PyDoc_STRVAR(isupper__doc__
,
6897 "S.isupper() -> bool\n\
6899 Return True if all cased characters in S are uppercase and there is\n\
6900 at least one cased character in S, False otherwise.");
6903 unicode_isupper(PyUnicodeObject
*self
)
6905 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6906 register const Py_UNICODE
*e
;
6909 /* Shortcut for single character strings */
6910 if (PyUnicode_GET_SIZE(self
) == 1)
6911 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
6913 /* Special case for empty strings */
6914 if (PyUnicode_GET_SIZE(self
) == 0)
6915 return PyBool_FromLong(0);
6917 e
= p
+ PyUnicode_GET_SIZE(self
);
6919 for (; p
< e
; p
++) {
6920 register const Py_UNICODE ch
= *p
;
6922 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
6923 return PyBool_FromLong(0);
6924 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
6927 return PyBool_FromLong(cased
);
6930 PyDoc_STRVAR(istitle__doc__
,
6931 "S.istitle() -> bool\n\
6933 Return True if S is a titlecased string and there is at least one\n\
6934 character in S, i.e. upper- and titlecase characters may only\n\
6935 follow uncased characters and lowercase characters only cased ones.\n\
6936 Return False otherwise.");
6939 unicode_istitle(PyUnicodeObject
*self
)
6941 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6942 register const Py_UNICODE
*e
;
6943 int cased
, previous_is_cased
;
6945 /* Shortcut for single character strings */
6946 if (PyUnicode_GET_SIZE(self
) == 1)
6947 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
6948 (Py_UNICODE_ISUPPER(*p
) != 0));
6950 /* Special case for empty strings */
6951 if (PyUnicode_GET_SIZE(self
) == 0)
6952 return PyBool_FromLong(0);
6954 e
= p
+ PyUnicode_GET_SIZE(self
);
6956 previous_is_cased
= 0;
6957 for (; p
< e
; p
++) {
6958 register const Py_UNICODE ch
= *p
;
6960 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
6961 if (previous_is_cased
)
6962 return PyBool_FromLong(0);
6963 previous_is_cased
= 1;
6966 else if (Py_UNICODE_ISLOWER(ch
)) {
6967 if (!previous_is_cased
)
6968 return PyBool_FromLong(0);
6969 previous_is_cased
= 1;
6973 previous_is_cased
= 0;
6975 return PyBool_FromLong(cased
);
6978 PyDoc_STRVAR(isspace__doc__
,
6979 "S.isspace() -> bool\n\
6981 Return True if all characters in S are whitespace\n\
6982 and there is at least one character in S, False otherwise.");
6985 unicode_isspace(PyUnicodeObject
*self
)
6987 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6988 register const Py_UNICODE
*e
;
6990 /* Shortcut for single character strings */
6991 if (PyUnicode_GET_SIZE(self
) == 1 &&
6992 Py_UNICODE_ISSPACE(*p
))
6993 return PyBool_FromLong(1);
6995 /* Special case for empty strings */
6996 if (PyUnicode_GET_SIZE(self
) == 0)
6997 return PyBool_FromLong(0);
6999 e
= p
+ PyUnicode_GET_SIZE(self
);
7000 for (; p
< e
; p
++) {
7001 if (!Py_UNICODE_ISSPACE(*p
))
7002 return PyBool_FromLong(0);
7004 return PyBool_FromLong(1);
7007 PyDoc_STRVAR(isalpha__doc__
,
7008 "S.isalpha() -> bool\n\
7010 Return True if all characters in S are alphabetic\n\
7011 and there is at least one character in S, False otherwise.");
7014 unicode_isalpha(PyUnicodeObject
*self
)
7016 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
7017 register const Py_UNICODE
*e
;
7019 /* Shortcut for single character strings */
7020 if (PyUnicode_GET_SIZE(self
) == 1 &&
7021 Py_UNICODE_ISALPHA(*p
))
7022 return PyBool_FromLong(1);
7024 /* Special case for empty strings */
7025 if (PyUnicode_GET_SIZE(self
) == 0)
7026 return PyBool_FromLong(0);
7028 e
= p
+ PyUnicode_GET_SIZE(self
);
7029 for (; p
< e
; p
++) {
7030 if (!Py_UNICODE_ISALPHA(*p
))
7031 return PyBool_FromLong(0);
7033 return PyBool_FromLong(1);
7036 PyDoc_STRVAR(isalnum__doc__
,
7037 "S.isalnum() -> bool\n\
7039 Return True if all characters in S are alphanumeric\n\
7040 and there is at least one character in S, False otherwise.");
7043 unicode_isalnum(PyUnicodeObject
*self
)
7045 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
7046 register const Py_UNICODE
*e
;
7048 /* Shortcut for single character strings */
7049 if (PyUnicode_GET_SIZE(self
) == 1 &&
7050 Py_UNICODE_ISALNUM(*p
))
7051 return PyBool_FromLong(1);
7053 /* Special case for empty strings */
7054 if (PyUnicode_GET_SIZE(self
) == 0)
7055 return PyBool_FromLong(0);
7057 e
= p
+ PyUnicode_GET_SIZE(self
);
7058 for (; p
< e
; p
++) {
7059 if (!Py_UNICODE_ISALNUM(*p
))
7060 return PyBool_FromLong(0);
7062 return PyBool_FromLong(1);
7065 PyDoc_STRVAR(isdecimal__doc__
,
7066 "S.isdecimal() -> bool\n\
7068 Return True if there are only decimal characters in S,\n\
7072 unicode_isdecimal(PyUnicodeObject
*self
)
7074 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
7075 register const Py_UNICODE
*e
;
7077 /* Shortcut for single character strings */
7078 if (PyUnicode_GET_SIZE(self
) == 1 &&
7079 Py_UNICODE_ISDECIMAL(*p
))
7080 return PyBool_FromLong(1);
7082 /* Special case for empty strings */
7083 if (PyUnicode_GET_SIZE(self
) == 0)
7084 return PyBool_FromLong(0);
7086 e
= p
+ PyUnicode_GET_SIZE(self
);
7087 for (; p
< e
; p
++) {
7088 if (!Py_UNICODE_ISDECIMAL(*p
))
7089 return PyBool_FromLong(0);
7091 return PyBool_FromLong(1);
7094 PyDoc_STRVAR(isdigit__doc__
,
7095 "S.isdigit() -> bool\n\
7097 Return True if all characters in S are digits\n\
7098 and there is at least one character in S, False otherwise.");
7101 unicode_isdigit(PyUnicodeObject
*self
)
7103 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
7104 register const Py_UNICODE
*e
;
7106 /* Shortcut for single character strings */
7107 if (PyUnicode_GET_SIZE(self
) == 1 &&
7108 Py_UNICODE_ISDIGIT(*p
))
7109 return PyBool_FromLong(1);
7111 /* Special case for empty strings */
7112 if (PyUnicode_GET_SIZE(self
) == 0)
7113 return PyBool_FromLong(0);
7115 e
= p
+ PyUnicode_GET_SIZE(self
);
7116 for (; p
< e
; p
++) {
7117 if (!Py_UNICODE_ISDIGIT(*p
))
7118 return PyBool_FromLong(0);
7120 return PyBool_FromLong(1);
7123 PyDoc_STRVAR(isnumeric__doc__
,
7124 "S.isnumeric() -> bool\n\
7126 Return True if there are only numeric characters in S,\n\
7130 unicode_isnumeric(PyUnicodeObject
*self
)
7132 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
7133 register const Py_UNICODE
*e
;
7135 /* Shortcut for single character strings */
7136 if (PyUnicode_GET_SIZE(self
) == 1 &&
7137 Py_UNICODE_ISNUMERIC(*p
))
7138 return PyBool_FromLong(1);
7140 /* Special case for empty strings */
7141 if (PyUnicode_GET_SIZE(self
) == 0)
7142 return PyBool_FromLong(0);
7144 e
= p
+ PyUnicode_GET_SIZE(self
);
7145 for (; p
< e
; p
++) {
7146 if (!Py_UNICODE_ISNUMERIC(*p
))
7147 return PyBool_FromLong(0);
7149 return PyBool_FromLong(1);
7152 PyDoc_STRVAR(join__doc__
,
7153 "S.join(sequence) -> unicode\n\
7155 Return a string which is the concatenation of the strings in the\n\
7156 sequence. The separator between elements is S.");
7159 unicode_join(PyObject
*self
, PyObject
*data
)
7161 return PyUnicode_Join(self
, data
);
7165 unicode_length(PyUnicodeObject
*self
)
7167 return self
->length
;
7170 PyDoc_STRVAR(ljust__doc__
,
7171 "S.ljust(width[, fillchar]) -> unicode\n\
7173 Return S left-justified in a Unicode string of length width. Padding is\n\
7174 done using the specified fill character (default is a space).");
7177 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
7180 Py_UNICODE fillchar
= ' ';
7182 if (!PyArg_ParseTuple(args
, "n|O&:ljust", &width
, convert_uc
, &fillchar
))
7185 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
7187 return (PyObject
*) self
;
7190 return (PyObject
*) pad(self
, 0, width
- self
->length
, fillchar
);
7193 PyDoc_STRVAR(lower__doc__
,
7194 "S.lower() -> unicode\n\
7196 Return a copy of the string S converted to lowercase.");
7199 unicode_lower(PyUnicodeObject
*self
)
7201 return fixup(self
, fixlower
);
7205 #define RIGHTSTRIP 1
7208 /* Arrays indexed by above */
7209 static const char *stripformat
[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
7211 #define STRIPNAME(i) (stripformat[i]+3)
7213 /* externally visible for str.strip(unicode) */
7215 _PyUnicode_XStrip(PyUnicodeObject
*self
, int striptype
, PyObject
*sepobj
)
7217 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
7218 Py_ssize_t len
= PyUnicode_GET_SIZE(self
);
7219 Py_UNICODE
*sep
= PyUnicode_AS_UNICODE(sepobj
);
7220 Py_ssize_t seplen
= PyUnicode_GET_SIZE(sepobj
);
7223 BLOOM_MASK sepmask
= make_bloom_mask(sep
, seplen
);
7226 if (striptype
!= RIGHTSTRIP
) {
7227 while (i
< len
&& BLOOM_MEMBER(sepmask
, s
[i
], sep
, seplen
)) {
7233 if (striptype
!= LEFTSTRIP
) {
7236 } while (j
>= i
&& BLOOM_MEMBER(sepmask
, s
[j
], sep
, seplen
));
7240 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
7242 return (PyObject
*)self
;
7245 return PyUnicode_FromUnicode(s
+i
, j
-i
);
7250 do_strip(PyUnicodeObject
*self
, int striptype
)
7252 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
7253 Py_ssize_t len
= PyUnicode_GET_SIZE(self
), i
, j
;
7256 if (striptype
!= RIGHTSTRIP
) {
7257 while (i
< len
&& Py_UNICODE_ISSPACE(s
[i
])) {
7263 if (striptype
!= LEFTSTRIP
) {
7266 } while (j
>= i
&& Py_UNICODE_ISSPACE(s
[j
]));
7270 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
7272 return (PyObject
*)self
;
7275 return PyUnicode_FromUnicode(s
+i
, j
-i
);
7280 do_argstrip(PyUnicodeObject
*self
, int striptype
, PyObject
*args
)
7282 PyObject
*sep
= NULL
;
7284 if (!PyArg_ParseTuple(args
, (char *)stripformat
[striptype
], &sep
))
7287 if (sep
!= NULL
&& sep
!= Py_None
) {
7288 if (PyUnicode_Check(sep
))
7289 return _PyUnicode_XStrip(self
, striptype
, sep
);
7290 else if (PyString_Check(sep
)) {
7292 sep
= PyUnicode_FromObject(sep
);
7295 res
= _PyUnicode_XStrip(self
, striptype
, sep
);
7300 PyErr_Format(PyExc_TypeError
,
7301 "%s arg must be None, unicode or str",
7302 STRIPNAME(striptype
));
7307 return do_strip(self
, striptype
);
7311 PyDoc_STRVAR(strip__doc__
,
7312 "S.strip([chars]) -> unicode\n\
7314 Return a copy of the string S with leading and trailing\n\
7315 whitespace removed.\n\
7316 If chars is given and not None, remove characters in chars instead.\n\
7317 If chars is a str, it will be converted to unicode before stripping");
7320 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
7322 if (PyTuple_GET_SIZE(args
) == 0)
7323 return do_strip(self
, BOTHSTRIP
); /* Common case */
7325 return do_argstrip(self
, BOTHSTRIP
, args
);
7329 PyDoc_STRVAR(lstrip__doc__
,
7330 "S.lstrip([chars]) -> unicode\n\
7332 Return a copy of the string S with leading whitespace removed.\n\
7333 If chars is given and not None, remove characters in chars instead.\n\
7334 If chars is a str, it will be converted to unicode before stripping");
7337 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
7339 if (PyTuple_GET_SIZE(args
) == 0)
7340 return do_strip(self
, LEFTSTRIP
); /* Common case */
7342 return do_argstrip(self
, LEFTSTRIP
, args
);
7346 PyDoc_STRVAR(rstrip__doc__
,
7347 "S.rstrip([chars]) -> unicode\n\
7349 Return a copy of the string S with trailing whitespace removed.\n\
7350 If chars is given and not None, remove characters in chars instead.\n\
7351 If chars is a str, it will be converted to unicode before stripping");
7354 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
7356 if (PyTuple_GET_SIZE(args
) == 0)
7357 return do_strip(self
, RIGHTSTRIP
); /* Common case */
7359 return do_argstrip(self
, RIGHTSTRIP
, args
);
7364 unicode_repeat(PyUnicodeObject
*str
, Py_ssize_t len
)
7374 if (len
== 1 && PyUnicode_CheckExact(str
)) {
7375 /* no repeat, return original string */
7377 return (PyObject
*) str
;
7380 /* ensure # of chars needed doesn't overflow int and # of bytes
7381 * needed doesn't overflow size_t
7383 nchars
= len
* str
->length
;
7384 if (len
&& nchars
/ len
!= str
->length
) {
7385 PyErr_SetString(PyExc_OverflowError
,
7386 "repeated string is too long");
7389 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
7390 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
7391 PyErr_SetString(PyExc_OverflowError
,
7392 "repeated string is too long");
7395 u
= _PyUnicode_New(nchars
);
7401 if (str
->length
== 1 && len
> 0) {
7402 Py_UNICODE_FILL(p
, str
->str
[0], len
);
7404 Py_ssize_t done
= 0; /* number of characters copied this far */
7405 if (done
< nchars
) {
7406 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
7409 while (done
< nchars
) {
7410 Py_ssize_t n
= (done
<= nchars
-done
) ? done
: nchars
-done
;
7411 Py_UNICODE_COPY(p
+done
, p
, n
);
7416 return (PyObject
*) u
;
7419 PyObject
*PyUnicode_Replace(PyObject
*obj
,
7422 Py_ssize_t maxcount
)
7429 self
= PyUnicode_FromObject(obj
);
7432 str1
= PyUnicode_FromObject(subobj
);
7437 str2
= PyUnicode_FromObject(replobj
);
7443 result
= replace((PyUnicodeObject
*)self
,
7444 (PyUnicodeObject
*)str1
,
7445 (PyUnicodeObject
*)str2
,
7453 PyDoc_STRVAR(replace__doc__
,
7454 "S.replace (old, new[, count]) -> unicode\n\
7456 Return a copy of S with all occurrences of substring\n\
7457 old replaced by new. If the optional argument count is\n\
7458 given, only the first count occurrences are replaced.");
7461 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
7463 PyUnicodeObject
*str1
;
7464 PyUnicodeObject
*str2
;
7465 Py_ssize_t maxcount
= -1;
7468 if (!PyArg_ParseTuple(args
, "OO|n:replace", &str1
, &str2
, &maxcount
))
7470 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
7473 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
7479 result
= replace(self
, str1
, str2
, maxcount
);
7487 PyObject
*unicode_repr(PyObject
*unicode
)
7489 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
7490 PyUnicode_GET_SIZE(unicode
),
7494 PyDoc_STRVAR(rfind__doc__
,
7495 "S.rfind(sub [,start [,end]]) -> int\n\
7497 Return the highest index in S where substring sub is found,\n\
7498 such that sub is contained within s[start:end]. Optional\n\
7499 arguments start and end are interpreted as in slice notation.\n\
7501 Return -1 on failure.");
7504 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
7506 PyObject
*substring
;
7511 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
7514 result
= stringlib_rfind_slice(
7515 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7516 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7520 Py_DECREF(substring
);
7522 return PyInt_FromSsize_t(result
);
7525 PyDoc_STRVAR(rindex__doc__
,
7526 "S.rindex(sub [,start [,end]]) -> int\n\
7528 Like S.rfind() but raise ValueError when the substring is not found.");
7531 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
7533 PyObject
*substring
;
7538 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
7541 result
= stringlib_rfind_slice(
7542 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7543 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7547 Py_DECREF(substring
);
7550 PyErr_SetString(PyExc_ValueError
, "substring not found");
7553 return PyInt_FromSsize_t(result
);
7556 PyDoc_STRVAR(rjust__doc__
,
7557 "S.rjust(width[, fillchar]) -> unicode\n\
7559 Return S right-justified in a Unicode string of length width. Padding is\n\
7560 done using the specified fill character (default is a space).");
7563 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
7566 Py_UNICODE fillchar
= ' ';
7568 if (!PyArg_ParseTuple(args
, "n|O&:rjust", &width
, convert_uc
, &fillchar
))
7571 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
7573 return (PyObject
*) self
;
7576 return (PyObject
*) pad(self
, width
- self
->length
, 0, fillchar
);
7580 unicode_slice(PyUnicodeObject
*self
, Py_ssize_t start
, Py_ssize_t end
)
7582 /* standard clamping */
7587 if (end
> self
->length
)
7589 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
7590 /* full slice, return original string */
7592 return (PyObject
*) self
;
7597 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
7601 PyObject
*PyUnicode_Split(PyObject
*s
,
7603 Py_ssize_t maxsplit
)
7607 s
= PyUnicode_FromObject(s
);
7611 sep
= PyUnicode_FromObject(sep
);
7618 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
7625 PyDoc_STRVAR(split__doc__
,
7626 "S.split([sep [,maxsplit]]) -> list of strings\n\
7628 Return a list of the words in S, using sep as the\n\
7629 delimiter string. If maxsplit is given, at most maxsplit\n\
7630 splits are done. If sep is not specified or is None, any\n\
7631 whitespace string is a separator and empty strings are\n\
7632 removed from the result.");
7635 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
7637 PyObject
*substring
= Py_None
;
7638 Py_ssize_t maxcount
= -1;
7640 if (!PyArg_ParseTuple(args
, "|On:split", &substring
, &maxcount
))
7643 if (substring
== Py_None
)
7644 return split(self
, NULL
, maxcount
);
7645 else if (PyUnicode_Check(substring
))
7646 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
7648 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
7652 PyUnicode_Partition(PyObject
*str_in
, PyObject
*sep_in
)
7658 str_obj
= PyUnicode_FromObject(str_in
);
7661 sep_obj
= PyUnicode_FromObject(sep_in
);
7667 out
= stringlib_partition(
7668 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7669 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
7680 PyUnicode_RPartition(PyObject
*str_in
, PyObject
*sep_in
)
7686 str_obj
= PyUnicode_FromObject(str_in
);
7689 sep_obj
= PyUnicode_FromObject(sep_in
);
7695 out
= stringlib_rpartition(
7696 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7697 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
7706 PyDoc_STRVAR(partition__doc__
,
7707 "S.partition(sep) -> (head, sep, tail)\n\
7709 Search for the separator sep in S, and return the part before it,\n\
7710 the separator itself, and the part after it. If the separator is not\n\
7711 found, return S and two empty strings.");
7714 unicode_partition(PyUnicodeObject
*self
, PyObject
*separator
)
7716 return PyUnicode_Partition((PyObject
*)self
, separator
);
7719 PyDoc_STRVAR(rpartition__doc__
,
7720 "S.rpartition(sep) -> (head, sep, tail)\n\
7722 Search for the separator sep in S, starting at the end of S, and return\n\
7723 the part before it, the separator itself, and the part after it. If the\n\
7724 separator is not found, return two empty strings and S.");
7727 unicode_rpartition(PyUnicodeObject
*self
, PyObject
*separator
)
7729 return PyUnicode_RPartition((PyObject
*)self
, separator
);
7732 PyObject
*PyUnicode_RSplit(PyObject
*s
,
7734 Py_ssize_t maxsplit
)
7738 s
= PyUnicode_FromObject(s
);
7742 sep
= PyUnicode_FromObject(sep
);
7749 result
= rsplit((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
7756 PyDoc_STRVAR(rsplit__doc__
,
7757 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7759 Return a list of the words in S, using sep as the\n\
7760 delimiter string, starting at the end of the string and\n\
7761 working to the front. If maxsplit is given, at most maxsplit\n\
7762 splits are done. If sep is not specified, any whitespace string\n\
7766 unicode_rsplit(PyUnicodeObject
*self
, PyObject
*args
)
7768 PyObject
*substring
= Py_None
;
7769 Py_ssize_t maxcount
= -1;
7771 if (!PyArg_ParseTuple(args
, "|On:rsplit", &substring
, &maxcount
))
7774 if (substring
== Py_None
)
7775 return rsplit(self
, NULL
, maxcount
);
7776 else if (PyUnicode_Check(substring
))
7777 return rsplit(self
, (PyUnicodeObject
*)substring
, maxcount
);
7779 return PyUnicode_RSplit((PyObject
*)self
, substring
, maxcount
);
7782 PyDoc_STRVAR(splitlines__doc__
,
7783 "S.splitlines([keepends]) -> list of strings\n\
7785 Return a list of the lines in S, breaking at line boundaries.\n\
7786 Line breaks are not included in the resulting list unless keepends\n\
7787 is given and true.");
7790 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
7794 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
7797 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
7801 PyObject
*unicode_str(PyUnicodeObject
*self
)
7803 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
7806 PyDoc_STRVAR(swapcase__doc__
,
7807 "S.swapcase() -> unicode\n\
7809 Return a copy of S with uppercase characters converted to lowercase\n\
7813 unicode_swapcase(PyUnicodeObject
*self
)
7815 return fixup(self
, fixswapcase
);
7818 PyDoc_STRVAR(translate__doc__
,
7819 "S.translate(table) -> unicode\n\
7821 Return a copy of the string S, where all characters have been mapped\n\
7822 through the given translation table, which must be a mapping of\n\
7823 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7824 Unmapped characters are left untouched. Characters mapped to None\n\
7828 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
7830 return PyUnicode_TranslateCharmap(self
->str
,
7836 PyDoc_STRVAR(upper__doc__
,
7837 "S.upper() -> unicode\n\
7839 Return a copy of S converted to uppercase.");
7842 unicode_upper(PyUnicodeObject
*self
)
7844 return fixup(self
, fixupper
);
7847 PyDoc_STRVAR(zfill__doc__
,
7848 "S.zfill(width) -> unicode\n\
7850 Pad a numeric string S with zeros on the left, to fill a field\n\
7851 of the specified width. The string S is never truncated.");
7854 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
7860 if (!PyArg_ParseTuple(args
, "n:zfill", &width
))
7863 if (self
->length
>= width
) {
7864 if (PyUnicode_CheckExact(self
)) {
7866 return (PyObject
*) self
;
7869 return PyUnicode_FromUnicode(
7870 PyUnicode_AS_UNICODE(self
),
7871 PyUnicode_GET_SIZE(self
)
7875 fill
= width
- self
->length
;
7877 u
= pad(self
, fill
, 0, '0');
7882 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
7883 /* move sign to beginning of string */
7884 u
->str
[0] = u
->str
[fill
];
7888 return (PyObject
*) u
;
7893 free_listsize(PyUnicodeObject
*self
)
7895 return PyInt_FromLong(numfree
);
7899 PyDoc_STRVAR(startswith__doc__
,
7900 "S.startswith(prefix[, start[, end]]) -> bool\n\
7902 Return True if S starts with the specified prefix, False otherwise.\n\
7903 With optional start, test S beginning at that position.\n\
7904 With optional end, stop comparing S at that position.\n\
7905 prefix can also be a tuple of strings to try.");
7908 unicode_startswith(PyUnicodeObject
*self
,
7912 PyUnicodeObject
*substring
;
7913 Py_ssize_t start
= 0;
7914 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7917 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &subobj
,
7918 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
7920 if (PyTuple_Check(subobj
)) {
7922 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7923 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7924 PyTuple_GET_ITEM(subobj
, i
));
7925 if (substring
== NULL
)
7927 result
= tailmatch(self
, substring
, start
, end
, -1);
7928 Py_DECREF(substring
);
7933 /* nothing matched */
7936 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7937 if (substring
== NULL
)
7939 result
= tailmatch(self
, substring
, start
, end
, -1);
7940 Py_DECREF(substring
);
7941 return PyBool_FromLong(result
);
7945 PyDoc_STRVAR(endswith__doc__
,
7946 "S.endswith(suffix[, start[, end]]) -> bool\n\
7948 Return True if S ends with the specified suffix, False otherwise.\n\
7949 With optional start, test S beginning at that position.\n\
7950 With optional end, stop comparing S at that position.\n\
7951 suffix can also be a tuple of strings to try.");
7954 unicode_endswith(PyUnicodeObject
*self
,
7958 PyUnicodeObject
*substring
;
7959 Py_ssize_t start
= 0;
7960 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7963 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &subobj
,
7964 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
7966 if (PyTuple_Check(subobj
)) {
7968 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7969 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7970 PyTuple_GET_ITEM(subobj
, i
));
7971 if (substring
== NULL
)
7973 result
= tailmatch(self
, substring
, start
, end
, +1);
7974 Py_DECREF(substring
);
7981 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7982 if (substring
== NULL
)
7985 result
= tailmatch(self
, substring
, start
, end
, +1);
7986 Py_DECREF(substring
);
7987 return PyBool_FromLong(result
);
7991 /* Implements do_string_format, which is unicode because of stringlib */
7992 #include "stringlib/string_format.h"
7994 PyDoc_STRVAR(format__doc__
,
7995 "S.format(*args, **kwargs) -> unicode\n\
8000 unicode__format__(PyObject
*self
, PyObject
*args
)
8002 PyObject
*format_spec
;
8003 PyObject
*result
= NULL
;
8004 PyObject
*tmp
= NULL
;
8006 /* If 2.x, convert format_spec to the same type as value */
8007 /* This is to allow things like u''.format('') */
8008 if (!PyArg_ParseTuple(args
, "O:__format__", &format_spec
))
8010 if (!(PyBytes_Check(format_spec
) || PyUnicode_Check(format_spec
))) {
8011 PyErr_Format(PyExc_TypeError
, "__format__ arg must be str "
8012 "or unicode, not %s", Py_TYPE(format_spec
)->tp_name
);
8015 tmp
= PyObject_Unicode(format_spec
);
8020 result
= _PyUnicode_FormatAdvanced(self
,
8021 PyUnicode_AS_UNICODE(format_spec
),
8022 PyUnicode_GET_SIZE(format_spec
));
8028 PyDoc_STRVAR(p_format__doc__
,
8029 "S.__format__(format_spec) -> unicode\n\
8034 unicode__sizeof__(PyUnicodeObject
*v
)
8036 return PyInt_FromSsize_t(sizeof(PyUnicodeObject
) +
8037 sizeof(Py_UNICODE
) * (v
->length
+ 1));
8040 PyDoc_STRVAR(sizeof__doc__
,
8041 "S.__sizeof__() -> size of S in memory, in bytes\n\
8046 unicode_getnewargs(PyUnicodeObject
*v
)
8048 return Py_BuildValue("(u#)", v
->str
, v
->length
);
8052 static PyMethodDef unicode_methods
[] = {
8054 /* Order is according to common usage: often used methods should
8055 appear first, since lookup is done sequentially. */
8057 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
, encode__doc__
},
8058 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
8059 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
8060 {"rsplit", (PyCFunction
) unicode_rsplit
, METH_VARARGS
, rsplit__doc__
},
8061 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
8062 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
8063 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
8064 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
8065 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
8066 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
8067 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
8068 {"partition", (PyCFunction
) unicode_partition
, METH_O
, partition__doc__
},
8069 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
8070 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
8071 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
8072 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_VARARGS
, lstrip__doc__
},
8073 {"decode", (PyCFunction
) unicode_decode
, METH_VARARGS
, decode__doc__
},
8074 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
8075 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
8076 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
8077 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
8078 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_VARARGS
, rstrip__doc__
},
8079 {"rpartition", (PyCFunction
) unicode_rpartition
, METH_O
, rpartition__doc__
},
8080 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
8081 {"strip", (PyCFunction
) unicode_strip
, METH_VARARGS
, strip__doc__
},
8082 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
8083 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
8084 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
8085 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
8086 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
8087 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
8088 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
8089 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
8090 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
8091 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
8092 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
8093 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
8094 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
8095 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
8096 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
8097 {"format", (PyCFunction
) do_string_format
, METH_VARARGS
| METH_KEYWORDS
, format__doc__
},
8098 {"__format__", (PyCFunction
) unicode__format__
, METH_VARARGS
, p_format__doc__
},
8099 {"_formatter_field_name_split", (PyCFunction
) formatter_field_name_split
, METH_NOARGS
},
8100 {"_formatter_parser", (PyCFunction
) formatter_parser
, METH_NOARGS
},
8101 {"__sizeof__", (PyCFunction
) unicode__sizeof__
, METH_NOARGS
, sizeof__doc__
},
8103 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
8107 /* This one is just used for debugging the implementation. */
8108 {"freelistsize", (PyCFunction
) free_listsize
, METH_NOARGS
},
8111 {"__getnewargs__", (PyCFunction
)unicode_getnewargs
, METH_NOARGS
},
8116 unicode_mod(PyObject
*v
, PyObject
*w
)
8118 if (!PyUnicode_Check(v
)) {
8119 Py_INCREF(Py_NotImplemented
);
8120 return Py_NotImplemented
;
8122 return PyUnicode_Format(v
, w
);
8125 static PyNumberMethods unicode_as_number
= {
8130 unicode_mod
, /*nb_remainder*/
8133 static PySequenceMethods unicode_as_sequence
= {
8134 (lenfunc
) unicode_length
, /* sq_length */
8135 PyUnicode_Concat
, /* sq_concat */
8136 (ssizeargfunc
) unicode_repeat
, /* sq_repeat */
8137 (ssizeargfunc
) unicode_getitem
, /* sq_item */
8138 (ssizessizeargfunc
) unicode_slice
, /* sq_slice */
8139 0, /* sq_ass_item */
8140 0, /* sq_ass_slice */
8141 PyUnicode_Contains
, /* sq_contains */
8145 unicode_subscript(PyUnicodeObject
* self
, PyObject
* item
)
8147 if (PyIndex_Check(item
)) {
8148 Py_ssize_t i
= PyNumber_AsSsize_t(item
, PyExc_IndexError
);
8149 if (i
== -1 && PyErr_Occurred())
8152 i
+= PyUnicode_GET_SIZE(self
);
8153 return unicode_getitem(self
, i
);
8154 } else if (PySlice_Check(item
)) {
8155 Py_ssize_t start
, stop
, step
, slicelength
, cur
, i
;
8156 Py_UNICODE
* source_buf
;
8157 Py_UNICODE
* result_buf
;
8160 if (PySlice_GetIndicesEx((PySliceObject
*)item
, PyUnicode_GET_SIZE(self
),
8161 &start
, &stop
, &step
, &slicelength
) < 0) {
8165 if (slicelength
<= 0) {
8166 return PyUnicode_FromUnicode(NULL
, 0);
8167 } else if (start
== 0 && step
== 1 && slicelength
== self
->length
&&
8168 PyUnicode_CheckExact(self
)) {
8170 return (PyObject
*)self
;
8171 } else if (step
== 1) {
8172 return PyUnicode_FromUnicode(self
->str
+ start
, slicelength
);
8174 source_buf
= PyUnicode_AS_UNICODE((PyObject
*)self
);
8175 result_buf
= (Py_UNICODE
*)PyObject_MALLOC(slicelength
*
8176 sizeof(Py_UNICODE
));
8178 if (result_buf
== NULL
)
8179 return PyErr_NoMemory();
8181 for (cur
= start
, i
= 0; i
< slicelength
; cur
+= step
, i
++) {
8182 result_buf
[i
] = source_buf
[cur
];
8185 result
= PyUnicode_FromUnicode(result_buf
, slicelength
);
8186 PyObject_FREE(result_buf
);
8190 PyErr_SetString(PyExc_TypeError
, "string indices must be integers");
8195 static PyMappingMethods unicode_as_mapping
= {
8196 (lenfunc
)unicode_length
, /* mp_length */
8197 (binaryfunc
)unicode_subscript
, /* mp_subscript */
8198 (objobjargproc
)0, /* mp_ass_subscript */
8202 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
8207 PyErr_SetString(PyExc_SystemError
,
8208 "accessing non-existent unicode segment");
8211 *ptr
= (void *) self
->str
;
8212 return PyUnicode_GET_DATA_SIZE(self
);
8216 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, Py_ssize_t index
,
8219 PyErr_SetString(PyExc_TypeError
,
8220 "cannot use unicode as modifiable buffer");
8225 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
8229 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
8234 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
8241 PyErr_SetString(PyExc_SystemError
,
8242 "accessing non-existent unicode segment");
8245 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
8248 *ptr
= (void *) PyString_AS_STRING(str
);
8249 return PyString_GET_SIZE(str
);
8252 /* Helpers for PyUnicode_Format() */
8255 getnextarg(PyObject
*args
, Py_ssize_t arglen
, Py_ssize_t
*p_argidx
)
8257 Py_ssize_t argidx
= *p_argidx
;
8258 if (argidx
< arglen
) {
8263 return PyTuple_GetItem(args
, argidx
);
8265 PyErr_SetString(PyExc_TypeError
,
8266 "not enough arguments for format string");
8270 #define F_LJUST (1<<0)
8271 #define F_SIGN (1<<1)
8272 #define F_BLANK (1<<2)
8273 #define F_ALT (1<<3)
8274 #define F_ZERO (1<<4)
8277 strtounicode(Py_UNICODE
*buffer
, const char *charbuffer
)
8279 register Py_ssize_t i
;
8280 Py_ssize_t len
= strlen(charbuffer
);
8281 for (i
= len
- 1; i
>= 0; i
--)
8282 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
8288 doubletounicode(Py_UNICODE
*buffer
, size_t len
, int format_code
,
8289 int precision
, int flags
, double x
)
8293 _PyOS_double_to_string((char *)buffer
, len
, x
, format_code
, precision
,
8295 result
= strtounicode(buffer
, (char *)buffer
);
8296 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
8300 longtounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, long x
)
8304 PyOS_snprintf((char *)buffer
, len
, format
, x
);
8305 result
= strtounicode(buffer
, (char *)buffer
);
8306 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
8309 /* XXX To save some code duplication, formatfloat/long/int could have been
8310 shared with stringobject.c, converting from 8-bit to Unicode after the
8311 formatting is done. */
8314 formatfloat(Py_UNICODE
*buf
,
8323 x
= PyFloat_AsDouble(v
);
8324 if (x
== -1.0 && PyErr_Occurred())
8328 /* make sure that the decimal representation of precision really does
8329 need at most 10 digits: platforms with sizeof(int) == 8 exist! */
8330 if (prec
> 0x7fffffffL
) {
8331 PyErr_SetString(PyExc_OverflowError
,
8332 "outrageously large precision "
8333 "for formatted float");
8337 if (type
== 'f' && fabs(x
) >= 1e50
)
8339 /* Worst case length calc to ensure no buffer overrun:
8343 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8344 for any double rep.)
8345 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8348 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8349 len = 1 + 50 + 1 + prec = 52 + prec
8351 If prec=0 the effective precision is 1 (the leading digit is
8352 always given), therefore increase the length by one.
8355 if (((type
== 'g' || type
== 'G') &&
8356 buflen
<= (size_t)10 + (size_t)prec
) ||
8357 (type
== 'f' && buflen
<= (size_t)53 + (size_t)prec
)) {
8358 PyErr_SetString(PyExc_OverflowError
,
8359 "formatted float is too long (precision too large?)");
8362 return doubletounicode(buf
, buflen
, type
, prec
,
8363 (flags
&F_ALT
)?Py_DTSF_ALT
:0, x
);
8367 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
8371 PyObject
*str
; /* temporary string object. */
8372 PyUnicodeObject
*result
;
8374 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
8377 result
= _PyUnicode_New(len
);
8382 for (i
= 0; i
< len
; i
++)
8383 result
->str
[i
] = buf
[i
];
8384 result
->str
[len
] = 0;
8386 return (PyObject
*)result
;
8390 formatint(Py_UNICODE
*buf
,
8397 /* fmt = '%#.' + `prec` + 'l' + `type`
8398 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8402 char fmt
[64]; /* plenty big enough! */
8406 x
= PyInt_AsLong(v
);
8407 if (x
== -1 && PyErr_Occurred())
8409 if (x
< 0 && type
== 'u') {
8412 if (x
< 0 && (type
== 'x' || type
== 'X' || type
== 'o'))
8419 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8420 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8422 if (buflen
<= 14 || buflen
<= (size_t)3 + (size_t)prec
) {
8423 PyErr_SetString(PyExc_OverflowError
,
8424 "formatted integer is too long (precision too large?)");
8428 if ((flags
& F_ALT
) &&
8429 (type
== 'x' || type
== 'X')) {
8430 /* When converting under %#x or %#X, there are a number
8431 * of issues that cause pain:
8432 * - when 0 is being converted, the C standard leaves off
8433 * the '0x' or '0X', which is inconsistent with other
8434 * %#x/%#X conversions and inconsistent with Python's
8436 * - there are platforms that violate the standard and
8437 * convert 0 with the '0x' or '0X'
8438 * (Metrowerks, Compaq Tru64)
8439 * - there are platforms that give '0x' when converting
8440 * under %#X, but convert 0 in accordance with the
8441 * standard (OS/2 EMX)
8443 * We can achieve the desired consistency by inserting our
8444 * own '0x' or '0X' prefix, and substituting %x/%X in place
8447 * Note that this is the same approach as used in
8448 * formatint() in stringobject.c
8450 PyOS_snprintf(fmt
, sizeof(fmt
), "%s0%c%%.%dl%c",
8451 sign
, type
, prec
, type
);
8454 PyOS_snprintf(fmt
, sizeof(fmt
), "%s%%%s.%dl%c",
8455 sign
, (flags
&F_ALT
) ? "#" : "",
8459 return longtounicode(buf
, buflen
, fmt
, -x
);
8461 return longtounicode(buf
, buflen
, fmt
, x
);
8465 formatchar(Py_UNICODE
*buf
,
8469 /* presume that the buffer is at least 2 characters long */
8470 if (PyUnicode_Check(v
)) {
8471 if (PyUnicode_GET_SIZE(v
) != 1)
8473 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
8476 else if (PyString_Check(v
)) {
8477 if (PyString_GET_SIZE(v
) != 1)
8479 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
8483 /* Integer input truncated to a character */
8485 x
= PyInt_AsLong(v
);
8486 if (x
== -1 && PyErr_Occurred())
8488 #ifdef Py_UNICODE_WIDE
8489 if (x
< 0 || x
> 0x10ffff) {
8490 PyErr_SetString(PyExc_OverflowError
,
8491 "%c arg not in range(0x110000) "
8492 "(wide Python build)");
8496 if (x
< 0 || x
> 0xffff) {
8497 PyErr_SetString(PyExc_OverflowError
,
8498 "%c arg not in range(0x10000) "
8499 "(narrow Python build)");
8503 buf
[0] = (Py_UNICODE
) x
;
8509 PyErr_SetString(PyExc_TypeError
,
8510 "%c requires int or char");
8514 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8516 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8517 chars are formatted. XXX This is a magic number. Each formatting
8518 routine does bounds checking to ensure no overflow, but a better
8519 solution may be to malloc a buffer of appropriate size for each
8520 format. For now, the current solution is sufficient.
8522 #define FORMATBUFLEN (size_t)120
8524 PyObject
*PyUnicode_Format(PyObject
*format
,
8527 Py_UNICODE
*fmt
, *res
;
8528 Py_ssize_t fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
8530 PyUnicodeObject
*result
= NULL
;
8531 PyObject
*dict
= NULL
;
8534 if (format
== NULL
|| args
== NULL
) {
8535 PyErr_BadInternalCall();
8538 uformat
= PyUnicode_FromObject(format
);
8539 if (uformat
== NULL
)
8541 fmt
= PyUnicode_AS_UNICODE(uformat
);
8542 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
8544 reslen
= rescnt
= fmtcnt
+ 100;
8545 result
= _PyUnicode_New(reslen
);
8548 res
= PyUnicode_AS_UNICODE(result
);
8550 if (PyTuple_Check(args
)) {
8551 arglen
= PyTuple_Size(args
);
8558 if (Py_TYPE(args
)->tp_as_mapping
&& !PyTuple_Check(args
) &&
8559 !PyObject_TypeCheck(args
, &PyBaseString_Type
))
8562 while (--fmtcnt
>= 0) {
8565 rescnt
= fmtcnt
+ 100;
8567 if (_PyUnicode_Resize(&result
, reslen
) < 0)
8569 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
8575 /* Got a format specifier */
8577 Py_ssize_t width
= -1;
8579 Py_UNICODE c
= '\0';
8583 PyObject
*temp
= NULL
;
8587 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
8591 Py_UNICODE
*keystart
;
8597 PyErr_SetString(PyExc_TypeError
,
8598 "format requires a mapping");
8604 /* Skip over balanced parentheses */
8605 while (pcount
> 0 && --fmtcnt
>= 0) {
8608 else if (*fmt
== '(')
8612 keylen
= fmt
- keystart
- 1;
8613 if (fmtcnt
< 0 || pcount
> 0) {
8614 PyErr_SetString(PyExc_ValueError
,
8615 "incomplete format key");
8619 /* keys are converted to strings using UTF-8 and
8620 then looked up since Python uses strings to hold
8621 variables names etc. in its namespaces and we
8622 wouldn't want to break common idioms. */
8623 key
= PyUnicode_EncodeUTF8(keystart
,
8627 key
= PyUnicode_FromUnicode(keystart
, keylen
);
8635 args
= PyObject_GetItem(dict
, key
);
8644 while (--fmtcnt
>= 0) {
8645 switch (c
= *fmt
++) {
8646 case '-': flags
|= F_LJUST
; continue;
8647 case '+': flags
|= F_SIGN
; continue;
8648 case ' ': flags
|= F_BLANK
; continue;
8649 case '#': flags
|= F_ALT
; continue;
8650 case '0': flags
|= F_ZERO
; continue;
8655 v
= getnextarg(args
, arglen
, &argidx
);
8658 if (!PyInt_Check(v
)) {
8659 PyErr_SetString(PyExc_TypeError
,
8663 width
= PyInt_AsLong(v
);
8671 else if (c
>= '0' && c
<= '9') {
8673 while (--fmtcnt
>= 0) {
8675 if (c
< '0' || c
> '9')
8677 if ((width
*10) / 10 != width
) {
8678 PyErr_SetString(PyExc_ValueError
,
8682 width
= width
*10 + (c
- '0');
8690 v
= getnextarg(args
, arglen
, &argidx
);
8693 if (!PyInt_Check(v
)) {
8694 PyErr_SetString(PyExc_TypeError
,
8698 prec
= PyInt_AsLong(v
);
8704 else if (c
>= '0' && c
<= '9') {
8706 while (--fmtcnt
>= 0) {
8707 c
= Py_CHARMASK(*fmt
++);
8708 if (c
< '0' || c
> '9')
8710 if ((prec
*10) / 10 != prec
) {
8711 PyErr_SetString(PyExc_ValueError
,
8715 prec
= prec
*10 + (c
- '0');
8720 if (c
== 'h' || c
== 'l' || c
== 'L') {
8726 PyErr_SetString(PyExc_ValueError
,
8727 "incomplete format");
8731 v
= getnextarg(args
, arglen
, &argidx
);
8741 /* presume that buffer length is at least 1 */
8748 if (PyUnicode_Check(v
) && c
== 's') {
8755 temp
= PyObject_Unicode(v
);
8757 temp
= PyObject_Repr(v
);
8760 if (PyUnicode_Check(temp
))
8761 /* nothing to do */;
8762 else if (PyString_Check(temp
)) {
8763 /* convert string to Unicode */
8764 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
8765 PyString_GET_SIZE(temp
),
8775 PyErr_SetString(PyExc_TypeError
,
8776 "%s argument has non-string str()");
8780 pbuf
= PyUnicode_AS_UNICODE(temp
);
8781 len
= PyUnicode_GET_SIZE(temp
);
8782 if (prec
>= 0 && len
> prec
)
8795 if (PyNumber_Check(v
)) {
8796 PyObject
*iobj
=NULL
;
8798 if (PyInt_Check(v
) || (PyLong_Check(v
))) {
8803 iobj
= PyNumber_Int(v
);
8804 if (iobj
==NULL
) iobj
= PyNumber_Long(v
);
8807 if (PyInt_Check(iobj
)) {
8810 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
8811 flags
, prec
, c
, iobj
);
8817 else if (PyLong_Check(iobj
)) {
8819 temp
= formatlong(iobj
, flags
, prec
, c
);
8823 pbuf
= PyUnicode_AS_UNICODE(temp
);
8824 len
= PyUnicode_GET_SIZE(temp
);
8833 PyErr_Format(PyExc_TypeError
,
8834 "%%%c format: a number is required, "
8835 "not %.200s", (char)c
, Py_TYPE(v
)->tp_name
);
8851 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
8862 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
8868 PyErr_Format(PyExc_ValueError
,
8869 "unsupported format character '%c' (0x%x) "
8871 (31<=c
&& c
<=126) ? (char)c
: '?',
8873 (Py_ssize_t
)(fmt
- 1 -
8874 PyUnicode_AS_UNICODE(uformat
)));
8878 if (*pbuf
== '-' || *pbuf
== '+') {
8882 else if (flags
& F_SIGN
)
8884 else if (flags
& F_BLANK
)
8891 if (rescnt
- (sign
!= 0) < width
) {
8893 rescnt
= width
+ fmtcnt
+ 100;
8900 if (_PyUnicode_Resize(&result
, reslen
) < 0) {
8904 res
= PyUnicode_AS_UNICODE(result
)
8914 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8915 assert(pbuf
[0] == '0');
8916 assert(pbuf
[1] == c
);
8927 if (width
> len
&& !(flags
& F_LJUST
)) {
8931 } while (--width
> len
);
8936 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8937 assert(pbuf
[0] == '0');
8938 assert(pbuf
[1] == c
);
8943 Py_UNICODE_COPY(res
, pbuf
, len
);
8946 while (--width
>= len
) {
8950 if (dict
&& (argidx
< arglen
) && c
!= '%') {
8951 PyErr_SetString(PyExc_TypeError
,
8952 "not all arguments converted during string formatting");
8959 if (argidx
< arglen
&& !dict
) {
8960 PyErr_SetString(PyExc_TypeError
,
8961 "not all arguments converted during string formatting");
8965 if (_PyUnicode_Resize(&result
, reslen
- rescnt
) < 0)
8971 return (PyObject
*)result
;
8982 static PyBufferProcs unicode_as_buffer
= {
8983 (readbufferproc
) unicode_buffer_getreadbuf
,
8984 (writebufferproc
) unicode_buffer_getwritebuf
,
8985 (segcountproc
) unicode_buffer_getsegcount
,
8986 (charbufferproc
) unicode_buffer_getcharbuf
,
8990 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
8993 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8996 static char *kwlist
[] = {"string", "encoding", "errors", 0};
8997 char *encoding
= NULL
;
8998 char *errors
= NULL
;
9000 if (type
!= &PyUnicode_Type
)
9001 return unicode_subtype_new(type
, args
, kwds
);
9002 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
9003 kwlist
, &x
, &encoding
, &errors
))
9006 return (PyObject
*)_PyUnicode_New(0);
9007 if (encoding
== NULL
&& errors
== NULL
)
9008 return PyObject_Unicode(x
);
9010 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
9014 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
9016 PyUnicodeObject
*tmp
, *pnew
;
9019 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
9020 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
9023 assert(PyUnicode_Check(tmp
));
9024 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
9029 pnew
->str
= (Py_UNICODE
*) PyObject_MALLOC(sizeof(Py_UNICODE
) * (n
+1));
9030 if (pnew
->str
== NULL
) {
9031 _Py_ForgetReference((PyObject
*)pnew
);
9034 return PyErr_NoMemory();
9036 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
9038 pnew
->hash
= tmp
->hash
;
9040 return (PyObject
*)pnew
;
9043 PyDoc_STRVAR(unicode_doc
,
9044 "unicode(string [, encoding[, errors]]) -> object\n\
9046 Create a new Unicode object from the given encoded string.\n\
9047 encoding defaults to the current default string encoding.\n\
9048 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9050 PyTypeObject PyUnicode_Type
= {
9051 PyVarObject_HEAD_INIT(&PyType_Type
, 0)
9052 "unicode", /* tp_name */
9053 sizeof(PyUnicodeObject
), /* tp_size */
9054 0, /* tp_itemsize */
9056 (destructor
)unicode_dealloc
, /* tp_dealloc */
9061 unicode_repr
, /* tp_repr */
9062 &unicode_as_number
, /* tp_as_number */
9063 &unicode_as_sequence
, /* tp_as_sequence */
9064 &unicode_as_mapping
, /* tp_as_mapping */
9065 (hashfunc
) unicode_hash
, /* tp_hash*/
9067 (reprfunc
) unicode_str
, /* tp_str */
9068 PyObject_GenericGetAttr
, /* tp_getattro */
9069 0, /* tp_setattro */
9070 &unicode_as_buffer
, /* tp_as_buffer */
9071 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_CHECKTYPES
|
9072 Py_TPFLAGS_BASETYPE
| Py_TPFLAGS_UNICODE_SUBCLASS
, /* tp_flags */
9073 unicode_doc
, /* tp_doc */
9074 0, /* tp_traverse */
9076 PyUnicode_RichCompare
, /* tp_richcompare */
9077 0, /* tp_weaklistoffset */
9079 0, /* tp_iternext */
9080 unicode_methods
, /* tp_methods */
9083 &PyBaseString_Type
, /* tp_base */
9085 0, /* tp_descr_get */
9086 0, /* tp_descr_set */
9087 0, /* tp_dictoffset */
9090 unicode_new
, /* tp_new */
9091 PyObject_Del
, /* tp_free */
9094 /* Initialize the Unicode implementation */
9096 void _PyUnicode_Init(void)
9100 /* XXX - move this array to unicodectype.c ? */
9101 Py_UNICODE linebreak
[] = {
9102 0x000A, /* LINE FEED */
9103 0x000D, /* CARRIAGE RETURN */
9104 0x001C, /* FILE SEPARATOR */
9105 0x001D, /* GROUP SEPARATOR */
9106 0x001E, /* RECORD SEPARATOR */
9107 0x0085, /* NEXT LINE */
9108 0x2028, /* LINE SEPARATOR */
9109 0x2029, /* PARAGRAPH SEPARATOR */
9112 /* Init the implementation */
9115 unicode_empty
= _PyUnicode_New(0);
9119 strcpy(unicode_default_encoding
, "ascii");
9120 for (i
= 0; i
< 256; i
++)
9121 unicode_latin1
[i
] = NULL
;
9122 if (PyType_Ready(&PyUnicode_Type
) < 0)
9123 Py_FatalError("Can't initialize 'unicode'");
9125 /* initialize the linebreak bloom filter */
9126 bloom_linebreak
= make_bloom_mask(
9127 linebreak
, sizeof(linebreak
) / sizeof(linebreak
[0])
9130 PyType_Ready(&EncodingMapType
);
9133 /* Finalize the Unicode implementation */
9136 PyUnicode_ClearFreeList(void)
9138 int freelist_size
= numfree
;
9141 for (u
= free_list
; u
!= NULL
;) {
9142 PyUnicodeObject
*v
= u
;
9143 u
= *(PyUnicodeObject
**)u
;
9145 PyObject_DEL(v
->str
);
9146 Py_XDECREF(v
->defenc
);
9151 assert(numfree
== 0);
9152 return freelist_size
;
9156 _PyUnicode_Fini(void)
9160 Py_XDECREF(unicode_empty
);
9161 unicode_empty
= NULL
;
9163 for (i
= 0; i
< 256; i
++) {
9164 if (unicode_latin1
[i
]) {
9165 Py_DECREF(unicode_latin1
[i
]);
9166 unicode_latin1
[i
] = NULL
;
9169 (void)PyUnicode_ClearFreeList();
9180 indent-tabs-mode: nil