3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
45 #include "unicodeobject.h"
52 /* Limit for the Unicode object free list */
54 #define PyUnicode_MAXFREELIST 1024
56 /* Limit for the Unicode object free list stay alive optimization.
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
62 At worst this will result in PyUnicode_MAXFREELIST *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
66 Setting the limit to 0 effectively turns the feature off.
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
80 # define BYTEORDER_IS_LITTLE_ENDIAN
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
95 /* Free list for Unicode objects */
96 static PyUnicodeObject
*free_list
;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject
*unicode_empty
;
102 /* Single character Unicode strings in the Latin-1 range are being
104 static PyUnicodeObject
*unicode_latin1
[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding
[100];
115 /* Fast detection of the most frequent whitespace characters */
116 const unsigned char _Py_ascii_whitespace
[] = {
117 0, 0, 0, 0, 0, 0, 0, 0,
118 /* case 0x0009: * HORIZONTAL TABULATION */
119 /* case 0x000A: * LINE FEED */
120 /* case 0x000B: * VERTICAL TABULATION */
121 /* case 0x000C: * FORM FEED */
122 /* case 0x000D: * CARRIAGE RETURN */
123 0, 1, 1, 1, 1, 1, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0,
125 /* case 0x001C: * FILE SEPARATOR */
126 /* case 0x001D: * GROUP SEPARATOR */
127 /* case 0x001E: * RECORD SEPARATOR */
128 /* case 0x001F: * UNIT SEPARATOR */
129 0, 0, 0, 0, 1, 1, 1, 1,
130 /* case 0x0020: * SPACE */
131 1, 0, 0, 0, 0, 0, 0, 0,
132 0, 0, 0, 0, 0, 0, 0, 0,
133 0, 0, 0, 0, 0, 0, 0, 0,
134 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, 0, 0, 0, 0,
137 0, 0, 0, 0, 0, 0, 0, 0,
138 0, 0, 0, 0, 0, 0, 0, 0,
139 0, 0, 0, 0, 0, 0, 0, 0,
140 0, 0, 0, 0, 0, 0, 0, 0,
141 0, 0, 0, 0, 0, 0, 0, 0,
142 0, 0, 0, 0, 0, 0, 0, 0,
143 0, 0, 0, 0, 0, 0, 0, 0
146 /* Same for linebreaks */
147 static unsigned char ascii_linebreak
[] = {
148 0, 0, 0, 0, 0, 0, 0, 0,
149 /* 0x000A, * LINE FEED */
150 /* 0x000D, * CARRIAGE RETURN */
151 0, 0, 1, 0, 0, 1, 0, 0,
152 0, 0, 0, 0, 0, 0, 0, 0,
153 /* 0x001C, * FILE SEPARATOR */
154 /* 0x001D, * GROUP SEPARATOR */
155 /* 0x001E, * RECORD SEPARATOR */
156 0, 0, 0, 0, 1, 1, 1, 0,
157 0, 0, 0, 0, 0, 0, 0, 0,
158 0, 0, 0, 0, 0, 0, 0, 0,
159 0, 0, 0, 0, 0, 0, 0, 0,
160 0, 0, 0, 0, 0, 0, 0, 0,
162 0, 0, 0, 0, 0, 0, 0, 0,
163 0, 0, 0, 0, 0, 0, 0, 0,
164 0, 0, 0, 0, 0, 0, 0, 0,
165 0, 0, 0, 0, 0, 0, 0, 0,
166 0, 0, 0, 0, 0, 0, 0, 0,
167 0, 0, 0, 0, 0, 0, 0, 0,
168 0, 0, 0, 0, 0, 0, 0, 0,
169 0, 0, 0, 0, 0, 0, 0, 0
174 PyUnicode_GetMax(void)
176 #ifdef Py_UNICODE_WIDE
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
185 /* --- Bloom Filters ----------------------------------------------------- */
187 /* Stuff to implement simple "bloom filters" for Unicode characters.
188 To keep things simple, we use a single bitmask, using the least
189 significant 5 bits of each Unicode character as the bit index. */
191 /* the linebreak mask is set up by Unicode_Init below */
193 #define BLOOM_MASK unsigned long
195 static BLOOM_MASK bloom_linebreak
;
/* Test whether the bit selected by the low 5 bits of `ch` is set in `mask`.
   Evaluates to a non-zero value when the bit is set and to 0 when it is not.
   The shifted constant is an unsigned long so that bit 31 can be selected
   without overflowing a signed int, and `mask` is parenthesized so that a
   compound expression (e.g. `a | b`) can be passed safely. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))
199 #define BLOOM_LINEBREAK(ch) \
200 ((ch) < 128U ? ascii_linebreak[(ch)] : \
201 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
203 Py_LOCAL_INLINE(BLOOM_MASK
) make_bloom_mask(Py_UNICODE
* ptr
, Py_ssize_t len
)
205 /* calculate simple bloom-style bitmask for a given unicode string */
211 for (i
= 0; i
< len
; i
++)
212 mask
|= (1 << (ptr
[i
] & 0x1F));
217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr
, Py_UNICODE
* set
, Py_ssize_t setlen
)
221 for (i
= 0; i
< setlen
; i
++)
/* Non-zero iff both BLOOM(mask, chr) and unicode_member(chr, set, setlen)
   are non-zero.  The bloom mask is consulted first, so unicode_member() is
   only called when the mask does not already rule the character out.
   The whole expansion and every argument are parenthesized so that the
   macro can be negated or combined with other operators without
   precedence surprises. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
231 /* --- Unicode Object ----------------------------------------------------- */
234 int unicode_resize(register PyUnicodeObject
*unicode
,
239 /* Shortcut if there's nothing much to do. */
240 if (unicode
->length
== length
)
243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
247 if (unicode
== unicode_empty
||
248 (unicode
->length
== 1 &&
249 unicode
->str
[0] < 256U &&
250 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
251 PyErr_SetString(PyExc_SystemError
,
252 "can't resize shared unicode objects");
256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
258 safe to look at str[length] (without making any assumptions about what
261 oldstr
= unicode
->str
;
262 unicode
->str
= PyObject_REALLOC(unicode
->str
,
263 sizeof(Py_UNICODE
) * (length
+ 1));
265 unicode
->str
= (Py_UNICODE
*)oldstr
;
269 unicode
->str
[length
] = 0;
270 unicode
->length
= length
;
273 /* Reset the object caches */
274 if (unicode
->defenc
) {
275 Py_DECREF(unicode
->defenc
);
276 unicode
->defenc
= NULL
;
283 /* We allocate one more byte to make sure the string is
284 Ux0000 terminated -- XXX is this needed ?
286 XXX This allocator could further be enhanced by assuring that the
287 free list never reduces its size below 1.
292 PyUnicodeObject
*_PyUnicode_New(Py_ssize_t length
)
294 register PyUnicodeObject
*unicode
;
296 /* Optimization for empty strings */
297 if (length
== 0 && unicode_empty
!= NULL
) {
298 Py_INCREF(unicode_empty
);
299 return unicode_empty
;
302 /* Ensure we won't overflow the size. */
303 if (length
> ((PY_SSIZE_T_MAX
/ sizeof(Py_UNICODE
)) - 1)) {
304 return (PyUnicodeObject
*)PyErr_NoMemory();
307 /* Unicode freelist & memory allocation */
310 free_list
= *(PyUnicodeObject
**)unicode
;
313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
315 if ((unicode
->length
< length
) &&
316 unicode_resize(unicode
, length
) < 0) {
317 PyObject_DEL(unicode
->str
);
322 size_t new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
323 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
325 PyObject_INIT(unicode
, &PyUnicode_Type
);
329 unicode
= PyObject_New(PyUnicodeObject
, &PyUnicode_Type
);
332 new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
333 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
340 /* Initialize the first element to guard against cases where
341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
348 unicode
->str
[length
] = 0;
349 unicode
->length
= length
;
351 unicode
->defenc
= NULL
;
355 /* XXX UNREF/NEWREF interface should be more symmetrical */
357 _Py_ForgetReference((PyObject
*)unicode
);
358 PyObject_Del(unicode
);
363 void unicode_dealloc(register PyUnicodeObject
*unicode
)
365 if (PyUnicode_CheckExact(unicode
) &&
366 numfree
< PyUnicode_MAXFREELIST
) {
367 /* Keep-Alive optimization */
368 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
369 PyObject_DEL(unicode
->str
);
373 if (unicode
->defenc
) {
374 Py_DECREF(unicode
->defenc
);
375 unicode
->defenc
= NULL
;
377 /* Add to free list */
378 *(PyUnicodeObject
**)unicode
= free_list
;
383 PyObject_DEL(unicode
->str
);
384 Py_XDECREF(unicode
->defenc
);
385 Py_TYPE(unicode
)->tp_free((PyObject
*)unicode
);
390 int _PyUnicode_Resize(PyUnicodeObject
**unicode
, Py_ssize_t length
)
392 register PyUnicodeObject
*v
;
394 /* Argument checks */
395 if (unicode
== NULL
) {
396 PyErr_BadInternalCall();
400 if (v
== NULL
|| !PyUnicode_Check(v
) || Py_REFCNT(v
) != 1 || length
< 0) {
401 PyErr_BadInternalCall();
405 /* Resizing unicode_empty and single character objects is not
406 possible since these are being shared. We simply return a fresh
407 copy with the same Unicode content. */
408 if (v
->length
!= length
&&
409 (v
== unicode_empty
|| v
->length
== 1)) {
410 PyUnicodeObject
*w
= _PyUnicode_New(length
);
413 Py_UNICODE_COPY(w
->str
, v
->str
,
414 length
< v
->length
? length
: v
->length
);
420 /* Note that we don't have to modify *unicode for unshared Unicode
421 objects, since we can modify them in-place. */
422 return unicode_resize(v
, length
);
425 int PyUnicode_Resize(PyObject
**unicode
, Py_ssize_t length
)
427 return _PyUnicode_Resize((PyUnicodeObject
**)unicode
, length
);
430 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
433 PyUnicodeObject
*unicode
;
435 /* If the Unicode data is known at construction time, we can apply
436 some optimizations which share commonly used objects. */
439 /* Optimization for empty strings */
440 if (size
== 0 && unicode_empty
!= NULL
) {
441 Py_INCREF(unicode_empty
);
442 return (PyObject
*)unicode_empty
;
445 /* Single character Unicode objects in the Latin-1 range are
446 shared when using this constructor */
447 if (size
== 1 && *u
< 256) {
448 unicode
= unicode_latin1
[*u
];
450 unicode
= _PyUnicode_New(1);
453 unicode
->str
[0] = *u
;
454 unicode_latin1
[*u
] = unicode
;
457 return (PyObject
*)unicode
;
461 unicode
= _PyUnicode_New(size
);
465 /* Copy the Unicode data into the new object */
467 Py_UNICODE_COPY(unicode
->str
, u
, size
);
469 return (PyObject
*)unicode
;
472 PyObject
*PyUnicode_FromStringAndSize(const char *u
, Py_ssize_t size
)
474 PyUnicodeObject
*unicode
;
477 PyErr_SetString(PyExc_SystemError
,
478 "Negative size passed to PyUnicode_FromStringAndSize");
482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects.
484 Also, this means the input must be UTF-8, so fall back to the
485 UTF-8 decoder at the end. */
488 /* Optimization for empty strings */
489 if (size
== 0 && unicode_empty
!= NULL
) {
490 Py_INCREF(unicode_empty
);
491 return (PyObject
*)unicode_empty
;
494 /* Single characters are shared when using this constructor.
495 Restrict to ASCII, since the input must be UTF-8. */
496 if (size
== 1 && Py_CHARMASK(*u
) < 128) {
497 unicode
= unicode_latin1
[Py_CHARMASK(*u
)];
499 unicode
= _PyUnicode_New(1);
502 unicode
->str
[0] = Py_CHARMASK(*u
);
503 unicode_latin1
[Py_CHARMASK(*u
)] = unicode
;
506 return (PyObject
*)unicode
;
509 return PyUnicode_DecodeUTF8(u
, size
, NULL
);
512 unicode
= _PyUnicode_New(size
);
516 return (PyObject
*)unicode
;
519 PyObject
*PyUnicode_FromString(const char *u
)
521 size_t size
= strlen(u
);
522 if (size
> PY_SSIZE_T_MAX
) {
523 PyErr_SetString(PyExc_OverflowError
, "input too long");
527 return PyUnicode_FromStringAndSize(u
, size
);
532 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
533 # define CONVERT_WCHAR_TO_SURROGATES
536 #ifdef CONVERT_WCHAR_TO_SURROGATES
538 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
539 to convert from UTF32 to UTF16. */
541 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
544 PyUnicodeObject
*unicode
;
545 register Py_ssize_t i
;
547 const wchar_t *orig_w
;
550 PyErr_BadInternalCall();
556 for (i
= size
; i
> 0; i
--) {
562 unicode
= _PyUnicode_New(alloc
);
566 /* Copy the wchar_t data into the new object */
568 register Py_UNICODE
*u
;
569 u
= PyUnicode_AS_UNICODE(unicode
);
570 for (i
= size
; i
> 0; i
--) {
572 wchar_t ordinal
= *w
++;
574 *u
++ = 0xD800 | (ordinal
>> 10);
575 *u
++ = 0xDC00 | (ordinal
& 0x3FF);
581 return (PyObject
*)unicode
;
586 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
589 PyUnicodeObject
*unicode
;
592 PyErr_BadInternalCall();
596 unicode
= _PyUnicode_New(size
);
600 /* Copy the wchar_t data into the new object */
601 #ifdef HAVE_USABLE_WCHAR_T
602 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
605 register Py_UNICODE
*u
;
606 register Py_ssize_t i
;
607 u
= PyUnicode_AS_UNICODE(unicode
);
608 for (i
= size
; i
> 0; i
--)
613 return (PyObject
*)unicode
;
616 #endif /* CONVERT_WCHAR_TO_SURROGATES */
618 #undef CONVERT_WCHAR_TO_SURROGATES
621 makefmt(char *fmt
, int longflag
, int size_tflag
, int zeropad
, int width
, int precision
, char c
)
627 fmt
+= sprintf(fmt
, "%d", width
);
630 fmt
+= sprintf(fmt
, ".%d", precision
);
633 else if (size_tflag
) {
634 char *f
= PY_FORMAT_SIZE_T
;
/* Append the NUL-terminated C string `string` to the output, one character
   at a time.  Relies on two variables being in scope at the point of use:
   `copy` (a scratch character pointer) and `s` (the output cursor, which
   is left pointing just past the last character written).
   Wrapped in do { } while (0) so that `appendstring(x);` is a single
   statement and can be used as the unbraced body of an if/else. */
#define appendstring(string)                    \
    do {                                        \
        for (copy = (string); *copy;)           \
            *s++ = *copy++;                     \
    } while (0)
645 PyUnicode_FromFormatV(const char *format
, va_list vargs
)
648 Py_ssize_t callcount
= 0;
649 PyObject
**callresults
= NULL
;
650 PyObject
**callresult
= NULL
;
658 /* used by sprintf */
660 /* use abuffer instead of buffer, if we need more space
661 * (which can happen if there's a format specifier with width). */
662 char *abuffer
= NULL
;
664 Py_ssize_t abuffersize
= 0;
665 char fmt
[60]; /* should be enough for %0width.precisionld */
668 #ifdef VA_LIST_IS_ARRAY
669 Py_MEMCPY(count
, vargs
, sizeof(va_list));
672 __va_copy(count
, vargs
);
677 /* step 1: count the number of %S/%R/%s format specifications
678 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
679 * objects once during step 3 and put the result in an array) */
680 for (f
= format
; *f
; f
++) {
684 if (*(f
+1)=='S' || *(f
+1)=='R')
686 while (isdigit((unsigned)*f
))
687 width
= (width
*10) + *f
++ - '0';
688 while (*++f
&& *f
!= '%' && !isalpha((unsigned)*f
))
694 /* step 2: allocate memory for the results of
695 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
697 callresults
= PyObject_Malloc(sizeof(PyObject
*)*callcount
);
702 callresult
= callresults
;
704 /* step 3: figure out how large a buffer we need */
705 for (f
= format
; *f
; f
++) {
709 while (isdigit((unsigned)*f
))
710 width
= (width
*10) + *f
++ - '0';
711 while (*++f
&& *f
!= '%' && !isalpha((unsigned)*f
))
714 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
715 * they don't affect the amount of space we reserve.
717 if ((*f
== 'l' || *f
== 'z') &&
718 (f
[1] == 'd' || f
[1] == 'u'))
723 (void)va_arg(count
, int);
724 /* fall through... */
728 case 'd': case 'u': case 'i': case 'x':
729 (void) va_arg(count
, int);
730 /* 20 bytes is enough to hold a 64-bit
731 integer. Decimal takes the most space.
732 This isn't enough for octal.
733 If a width is specified we need more
734 (which we allocate later). */
738 if (abuffersize
< width
)
744 const char *s
= va_arg(count
, const char*);
745 PyObject
*str
= PyUnicode_DecodeUTF8(s
, strlen(s
), "replace");
748 n
+= PyUnicode_GET_SIZE(str
);
749 /* Remember the str and switch to the next slot */
755 PyObject
*obj
= va_arg(count
, PyObject
*);
756 assert(obj
&& PyUnicode_Check(obj
));
757 n
+= PyUnicode_GET_SIZE(obj
);
762 PyObject
*obj
= va_arg(count
, PyObject
*);
763 const char *str
= va_arg(count
, const char *);
765 assert(!obj
|| PyUnicode_Check(obj
));
767 n
+= PyUnicode_GET_SIZE(obj
);
774 PyObject
*obj
= va_arg(count
, PyObject
*);
777 str
= PyObject_Str(obj
);
780 n
+= PyUnicode_GET_SIZE(str
);
781 /* Remember the str and switch to the next slot */
787 PyObject
*obj
= va_arg(count
, PyObject
*);
790 repr
= PyObject_Repr(obj
);
793 n
+= PyUnicode_GET_SIZE(repr
);
794 /* Remember the repr and switch to the next slot */
795 *callresult
++ = repr
;
799 (void) va_arg(count
, int);
800 /* maximum 64-bit pointer representation:
802 * so 19 characters is enough.
803 * XXX I count 18 -- what's the extra for?
808 /* if we stumble upon an unknown
809 formatting code, copy the rest of
810 the format string to the output
811 string. (we cannot just skip the
812 code, since there's no way to know
813 what's in the argument list) */
821 if (abuffersize
> 20) {
822 abuffer
= PyObject_Malloc(abuffersize
);
827 realbuffer
= abuffer
;
831 /* step 4: fill the buffer */
832 /* Since we've analyzed how much space we need for the worst case,
833 we don't have to resize the string.
834 There can be no errors beyond this point. */
835 string
= PyUnicode_FromUnicode(NULL
, n
);
839 s
= PyUnicode_AS_UNICODE(string
);
840 callresult
= callresults
;
842 for (f
= format
; *f
; f
++) {
847 zeropad
= (*f
== '0');
848 /* parse the width.precision part */
850 while (isdigit((unsigned)*f
))
851 width
= (width
*10) + *f
++ - '0';
855 while (isdigit((unsigned)*f
))
856 precision
= (precision
*10) + *f
++ - '0';
858 /* handle the long flag, but only for %ld and %lu.
859 others can be added when necessary. */
860 if (*f
== 'l' && (f
[1] == 'd' || f
[1] == 'u')) {
864 /* handle the size_t flag. */
865 if (*f
== 'z' && (f
[1] == 'd' || f
[1] == 'u')) {
872 *s
++ = va_arg(vargs
, int);
875 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'd');
877 sprintf(realbuffer
, fmt
, va_arg(vargs
, long));
879 sprintf(realbuffer
, fmt
, va_arg(vargs
, Py_ssize_t
));
881 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
882 appendstring(realbuffer
);
885 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'u');
887 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned long));
889 sprintf(realbuffer
, fmt
, va_arg(vargs
, size_t));
891 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned int));
892 appendstring(realbuffer
);
895 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'i');
896 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
897 appendstring(realbuffer
);
900 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'x');
901 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
902 appendstring(realbuffer
);
906 /* unused, since we already have the result */
907 (void) va_arg(vargs
, char *);
908 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(*callresult
),
909 PyUnicode_GET_SIZE(*callresult
));
910 s
+= PyUnicode_GET_SIZE(*callresult
);
911 /* We're done with the unicode()/repr() => forget it */
912 Py_DECREF(*callresult
);
913 /* switch to next unicode()/repr() result */
919 PyObject
*obj
= va_arg(vargs
, PyObject
*);
920 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
921 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
927 PyObject
*obj
= va_arg(vargs
, PyObject
*);
928 const char *str
= va_arg(vargs
, const char *);
930 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
931 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
944 /* unused, since we already have the result */
945 (void) va_arg(vargs
, PyObject
*);
946 ucopy
= PyUnicode_AS_UNICODE(*callresult
);
947 usize
= PyUnicode_GET_SIZE(*callresult
);
948 for (upos
= 0; upos
<usize
;)
949 *s
++ = ucopy
[upos
++];
950 /* We're done with the unicode()/repr() => forget it */
951 Py_DECREF(*callresult
);
952 /* switch to next unicode()/repr() result */
957 sprintf(buffer
, "%p", va_arg(vargs
, void*));
958 /* %p is ill-defined: ensure leading 0x. */
959 if (buffer
[1] == 'X')
961 else if (buffer
[1] != 'x') {
962 memmove(buffer
+2, buffer
, strlen(buffer
)+1);
966 appendstring(buffer
);
981 PyObject_Free(callresults
);
983 PyObject_Free(abuffer
);
984 PyUnicode_Resize(&string
, s
- PyUnicode_AS_UNICODE(string
));
988 PyObject
**callresult2
= callresults
;
989 while (callresult2
< callresult
) {
990 Py_DECREF(*callresult2
);
993 PyObject_Free(callresults
);
996 PyObject_Free(abuffer
);
1003 PyUnicode_FromFormat(const char *format
, ...)
1008 #ifdef HAVE_STDARG_PROTOTYPES
1009 va_start(vargs
, format
);
1013 ret
= PyUnicode_FromFormatV(format
, vargs
);
1018 Py_ssize_t
PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
1022 if (unicode
== NULL
) {
1023 PyErr_BadInternalCall();
1027 /* If possible, try to copy the 0-termination as well */
1028 if (size
> PyUnicode_GET_SIZE(unicode
))
1029 size
= PyUnicode_GET_SIZE(unicode
) + 1;
1031 #ifdef HAVE_USABLE_WCHAR_T
1032 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
1035 register Py_UNICODE
*u
;
1036 register Py_ssize_t i
;
1037 u
= PyUnicode_AS_UNICODE(unicode
);
1038 for (i
= size
; i
> 0; i
--)
1043 if (size
> PyUnicode_GET_SIZE(unicode
))
1044 return PyUnicode_GET_SIZE(unicode
);
1051 PyObject
*PyUnicode_FromOrdinal(int ordinal
)
1055 #ifdef Py_UNICODE_WIDE
1056 if (ordinal
< 0 || ordinal
> 0x10ffff) {
1057 PyErr_SetString(PyExc_ValueError
,
1058 "unichr() arg not in range(0x110000) "
1059 "(wide Python build)");
1063 if (ordinal
< 0 || ordinal
> 0xffff) {
1064 PyErr_SetString(PyExc_ValueError
,
1065 "unichr() arg not in range(0x10000) "
1066 "(narrow Python build)");
1071 s
[0] = (Py_UNICODE
)ordinal
;
1072 return PyUnicode_FromUnicode(s
, 1);
1075 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
1077 /* XXX Perhaps we should make this API an alias of
1078 PyObject_Unicode() instead ?! */
1079 if (PyUnicode_CheckExact(obj
)) {
1083 if (PyUnicode_Check(obj
)) {
1084 /* For a Unicode subtype that's not a Unicode object,
1085 return a true Unicode object with the same data. */
1086 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
1087 PyUnicode_GET_SIZE(obj
));
1089 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
1092 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
1093 const char *encoding
,
1096 const char *s
= NULL
;
1101 PyErr_BadInternalCall();
1106 /* For b/w compatibility we also accept Unicode objects provided
1107 that no encoding is given and then redirect to
1108 PyObject_Unicode() which then applies the additional logic for
1111 NOTE: This API should really only be used for object which
1112 represent *encoded* Unicode !
1115 if (PyUnicode_Check(obj
)) {
1117 PyErr_SetString(PyExc_TypeError
,
1118 "decoding Unicode is not supported");
1121 return PyObject_Unicode(obj
);
1124 if (PyUnicode_Check(obj
)) {
1125 PyErr_SetString(PyExc_TypeError
,
1126 "decoding Unicode is not supported");
1132 if (PyString_Check(obj
)) {
1133 s
= PyString_AS_STRING(obj
);
1134 len
= PyString_GET_SIZE(obj
);
1136 else if (PyByteArray_Check(obj
)) {
1137 /* Python 2.x specific */
1138 PyErr_Format(PyExc_TypeError
,
1139 "decoding bytearray is not supported");
1142 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
1143 /* Overwrite the error message with something more useful in
1144 case of a TypeError. */
1145 if (PyErr_ExceptionMatches(PyExc_TypeError
))
1146 PyErr_Format(PyExc_TypeError
,
1147 "coercing to Unicode: need string or buffer, "
1149 Py_TYPE(obj
)->tp_name
);
1153 /* Convert to Unicode */
1155 Py_INCREF(unicode_empty
);
1156 v
= (PyObject
*)unicode_empty
;
1159 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
1167 PyObject
*PyUnicode_Decode(const char *s
,
1169 const char *encoding
,
1172 PyObject
*buffer
= NULL
, *unicode
;
1174 if (encoding
== NULL
)
1175 encoding
= PyUnicode_GetDefaultEncoding();
1177 /* Shortcuts for common default encodings */
1178 if (strcmp(encoding
, "utf-8") == 0)
1179 return PyUnicode_DecodeUTF8(s
, size
, errors
);
1180 else if (strcmp(encoding
, "latin-1") == 0)
1181 return PyUnicode_DecodeLatin1(s
, size
, errors
);
1182 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1183 else if (strcmp(encoding
, "mbcs") == 0)
1184 return PyUnicode_DecodeMBCS(s
, size
, errors
);
1186 else if (strcmp(encoding
, "ascii") == 0)
1187 return PyUnicode_DecodeASCII(s
, size
, errors
);
1189 /* Decode via the codec registry */
1190 buffer
= PyBuffer_FromMemory((void *)s
, size
);
1193 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
1194 if (unicode
== NULL
)
1196 if (!PyUnicode_Check(unicode
)) {
1197 PyErr_Format(PyExc_TypeError
,
1198 "decoder did not return an unicode object (type=%.400s)",
1199 Py_TYPE(unicode
)->tp_name
);
1211 PyObject
*PyUnicode_AsDecodedObject(PyObject
*unicode
,
1212 const char *encoding
,
1217 if (!PyUnicode_Check(unicode
)) {
1218 PyErr_BadArgument();
1222 if (encoding
== NULL
)
1223 encoding
= PyUnicode_GetDefaultEncoding();
1225 /* Decode via the codec registry */
1226 v
= PyCodec_Decode(unicode
, encoding
, errors
);
1235 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
1237 const char *encoding
,
1240 PyObject
*v
, *unicode
;
1242 unicode
= PyUnicode_FromUnicode(s
, size
);
1243 if (unicode
== NULL
)
1245 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
1250 PyObject
*PyUnicode_AsEncodedObject(PyObject
*unicode
,
1251 const char *encoding
,
1256 if (!PyUnicode_Check(unicode
)) {
1257 PyErr_BadArgument();
1261 if (encoding
== NULL
)
1262 encoding
= PyUnicode_GetDefaultEncoding();
1264 /* Encode via the codec registry */
1265 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1274 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
1275 const char *encoding
,
1280 if (!PyUnicode_Check(unicode
)) {
1281 PyErr_BadArgument();
1285 if (encoding
== NULL
)
1286 encoding
= PyUnicode_GetDefaultEncoding();
1288 /* Shortcuts for common default encodings */
1289 if (errors
== NULL
) {
1290 if (strcmp(encoding
, "utf-8") == 0)
1291 return PyUnicode_AsUTF8String(unicode
);
1292 else if (strcmp(encoding
, "latin-1") == 0)
1293 return PyUnicode_AsLatin1String(unicode
);
1294 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1295 else if (strcmp(encoding
, "mbcs") == 0)
1296 return PyUnicode_AsMBCSString(unicode
);
1298 else if (strcmp(encoding
, "ascii") == 0)
1299 return PyUnicode_AsASCIIString(unicode
);
1302 /* Encode via the codec registry */
1303 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1306 if (!PyString_Check(v
)) {
1307 PyErr_Format(PyExc_TypeError
,
1308 "encoder did not return a string object (type=%.400s)",
1309 Py_TYPE(v
)->tp_name
);
1319 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
1322 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
1326 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
1327 if (v
&& errors
== NULL
)
1328 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
1332 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
1334 if (!PyUnicode_Check(unicode
)) {
1335 PyErr_BadArgument();
1338 return PyUnicode_AS_UNICODE(unicode
);
1344 Py_ssize_t
PyUnicode_GetSize(PyObject
*unicode
)
1346 if (!PyUnicode_Check(unicode
)) {
1347 PyErr_BadArgument();
1350 return PyUnicode_GET_SIZE(unicode
);
1356 const char *PyUnicode_GetDefaultEncoding(void)
1358 return unicode_default_encoding
;
1361 int PyUnicode_SetDefaultEncoding(const char *encoding
)
1365 /* Make sure the encoding is valid. As a side effect, this also
1366 loads the encoding into the codec registry cache. */
1367 v
= _PyCodec_Lookup(encoding
);
1371 strncpy(unicode_default_encoding
,
1373 sizeof(unicode_default_encoding
));
1380 /* error handling callback helper:
1381 build arguments, call the callback and check the arguments,
1382 if no exception occurred, copy the replacement to the output
1383 and adjust various state variables.
1384 return 0 on success, -1 on error
1388 int unicode_decode_call_errorhandler(const char *errors
, PyObject
**errorHandler
,
1389 const char *encoding
, const char *reason
,
1390 const char *input
, Py_ssize_t insize
, Py_ssize_t
*startinpos
,
1391 Py_ssize_t
*endinpos
, PyObject
**exceptionObject
, const char **inptr
,
1392 PyUnicodeObject
**output
, Py_ssize_t
*outpos
, Py_UNICODE
**outptr
)
1394 static char *argparse
= "O!n;decoding error handler must return (unicode, int) tuple";
1396 PyObject
*restuple
= NULL
;
1397 PyObject
*repunicode
= NULL
;
1398 Py_ssize_t outsize
= PyUnicode_GET_SIZE(*output
);
1399 Py_ssize_t requiredsize
;
1405 if (*errorHandler
== NULL
) {
1406 *errorHandler
= PyCodec_LookupError(errors
);
1407 if (*errorHandler
== NULL
)
1411 if (*exceptionObject
== NULL
) {
1412 *exceptionObject
= PyUnicodeDecodeError_Create(
1413 encoding
, input
, insize
, *startinpos
, *endinpos
, reason
);
1414 if (*exceptionObject
== NULL
)
1418 if (PyUnicodeDecodeError_SetStart(*exceptionObject
, *startinpos
))
1420 if (PyUnicodeDecodeError_SetEnd(*exceptionObject
, *endinpos
))
1422 if (PyUnicodeDecodeError_SetReason(*exceptionObject
, reason
))
1426 restuple
= PyObject_CallFunctionObjArgs(*errorHandler
, *exceptionObject
, NULL
);
1427 if (restuple
== NULL
)
1429 if (!PyTuple_Check(restuple
)) {
1430 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
1433 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
, &repunicode
, &newpos
))
1436 newpos
= insize
+newpos
;
1437 if (newpos
<0 || newpos
>insize
) {
1438 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", newpos
);
1442 /* need more space? (at least enough for what we
1443 have+the replacement+the rest of the string (starting
1444 at the new input position), so we won't have to check space
1445 when there are no errors in the rest of the string) */
1446 repptr
= PyUnicode_AS_UNICODE(repunicode
);
1447 repsize
= PyUnicode_GET_SIZE(repunicode
);
1448 requiredsize
= *outpos
+ repsize
+ insize
-newpos
;
1449 if (requiredsize
> outsize
) {
1450 if (requiredsize
<2*outsize
)
1451 requiredsize
= 2*outsize
;
1452 if (_PyUnicode_Resize(output
, requiredsize
) < 0)
1454 *outptr
= PyUnicode_AS_UNICODE(*output
) + *outpos
;
1457 *inptr
= input
+ newpos
;
1458 Py_UNICODE_COPY(*outptr
, repptr
, repsize
);
1465 Py_XDECREF(restuple
);
1469 /* --- UTF-7 Codec -------------------------------------------------------- */
1471 /* See RFC2152 for details. We encode conservatively and decode liberally. */
1473 /* Three simple macros defining base-64. */
/* Is c a base-64 character?
   Written as explicit ASCII range checks instead of isalnum(): the answer
   must not vary with the current locale, and isalnum() has undefined
   behavior for arguments outside the unsigned char range (other than EOF).
   Like FROM_BASE64, this assumes an ASCII-compatible execution character
   set in which letters and digits are contiguous. */
#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')
/* Translate a base-64 alphabet character into its 6-bit value (0..63).
   Only meaningful when c is already known to be a base-64 character:
   anything that is not a letter, a digit or '+' yields 63, the value
   of '/'. */
#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? ((c) - 'A') :         \
     ((c) >= 'a' && (c) <= 'z') ? ((c) - 'a' + 26) :    \
     ((c) >= '0' && (c) <= '9') ? ((c) - '0' + 52) :    \
     ((c) != '+') ? 63 : 62)
/* Return the base-64 alphabet character that encodes the low 6 bits of n;
   any higher bits of n are masked off and ignored. */
#define TO_BASE64(n)                            \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"               \
     "abcdefghijklmnopqrstuvwxyz"               \
     "0123456789+/"[(n) & 0x3f])
/* DECODE_DIRECT: true when byte c, met in UTF-7 input, decodes as itself.
 * Decoding is permissive: every ASCII byte (value <= 127) is taken
 * literally except '+', which begins a base-64 section. */
#define DECODE_DIRECT(c) \
    (!((c) > 127 || (c) == '+'))
1501 /* The UTF-7 encoder treats ASCII characters differently according to
1502 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1503 * the above). See RFC2152. This array identifies these different
1506 * alphanumeric and '(),-./:?
1508 * !"#$%&*;<=>@[]^_`{|}
1511 * 3 : special (must be base64 encoded)
1512 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1516 char utf7_category
[128] = {
1517 /* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
1518 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
1519 /* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
1520 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1521 /* sp ! " # $ % & ' ( ) * + , - . / */
1522 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
1523 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
1524 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
1525 /* @ A B C D E F G H I J K L M N O */
1526 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1527 /* P Q R S T U V W X Y Z [ \ ] ^ _ */
1528 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
1529 /* ` a b c d e f g h i j k l m n o */
1530 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1531 /* p q r s t u v w x y z { | } ~ del */
1532 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
/* ENCODE_DIRECT: should character c be written to the output as itself?
 *
 * Only characters in the range 1..127 are candidates; the range test comes
 * first so that utf7_category[] is never indexed out of bounds.  Within
 * that range the answer depends on the character's utf7_category[] entry:
 *   0 - always encoded directly
 *   1 - encoded directly only when directO is true
 *   2 - encoded directly only when directWS is true
 *   anything else - never encoded directly
 * RFC2152 makes it clear that the treatment of set O and of whitespace
 * varies between applications, hence the two flags.
 *
 * directO and directWS are parenthesized in the expansion so that compound
 * argument expressions (e.g. "a || b") keep their meaning.  c is evaluated
 * several times and must be free of side effects. */
#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
1547 PyObject
*PyUnicode_DecodeUTF7(const char *s
,
1551 return PyUnicode_DecodeUTF7Stateful(s
, size
, errors
, NULL
);
1554 /* The decoder. The only state we preserve is our read position,
1555 * i.e. how many characters we have consumed. So if we end in the
1556 * middle of a shift sequence we have to back off the read position
1557 * and the output to the beginning of the sequence, otherwise we lose
1558 * all the shift state (seen bits, number of bits seen, high
1561 PyObject
*PyUnicode_DecodeUTF7Stateful(const char *s
,
1564 Py_ssize_t
*consumed
)
1566 const char *starts
= s
;
1567 Py_ssize_t startinpos
;
1568 Py_ssize_t endinpos
;
1571 PyUnicodeObject
*unicode
;
1573 const char *errmsg
= "";
1575 Py_UNICODE
*shiftOutStart
;
1576 unsigned int base64bits
= 0;
1577 unsigned long base64buffer
= 0;
1578 Py_UNICODE surrogate
= 0;
1579 PyObject
*errorHandler
= NULL
;
1580 PyObject
*exc
= NULL
;
1582 unicode
= _PyUnicode_New(size
);
1588 return (PyObject
*)unicode
;
1596 Py_UNICODE ch
= (unsigned char) *s
;
1598 if (inShift
) { /* in a base-64 section */
1599 if (IS_BASE64(ch
)) { /* consume a base-64 character */
1600 base64buffer
= (base64buffer
<< 6) | FROM_BASE64(ch
);
1603 if (base64bits
>= 16) {
1604 /* we have enough bits for a UTF-16 value */
1605 Py_UNICODE outCh
= (Py_UNICODE
)
1606 (base64buffer
>> (base64bits
-16));
1608 base64buffer
&= (1 << base64bits
) - 1; /* clear high bits */
1610 /* expecting a second surrogate */
1611 if (outCh
>= 0xDC00 && outCh
<= 0xDFFF) {
1612 #ifdef Py_UNICODE_WIDE
1613 *p
++ = (((surrogate
& 0x3FF)<<10)
1614 | (outCh
& 0x3FF)) + 0x10000;
1623 errmsg
= "second surrogate missing";
1627 else if (outCh
>= 0xD800 && outCh
<= 0xDBFF) {
1628 /* first surrogate */
1631 else if (outCh
>= 0xDC00 && outCh
<= 0xDFFF) {
1632 errmsg
= "unexpected second surrogate";
1640 else { /* now leaving a base-64 section */
1644 errmsg
= "second surrogate missing at end of shift sequence";
1647 if (base64bits
> 0) { /* left-over bits */
1648 if (base64bits
>= 6) {
1649 /* We've seen at least one base-64 character */
1650 errmsg
= "partial character in shift sequence";
1654 /* Some bits remain; they should be zero */
1655 if (base64buffer
!= 0) {
1656 errmsg
= "non-zero padding bits in shift sequence";
1662 /* '-' is absorbed; other terminating
1663 characters are preserved */
1668 else if ( ch
== '+' ) {
1669 startinpos
= s
-starts
;
1670 s
++; /* consume '+' */
1671 if (s
< e
&& *s
== '-') { /* '+-' encodes '+' */
1675 else { /* begin base64-encoded section */
1681 else if (DECODE_DIRECT(ch
)) { /* character decodes as itself */
1686 startinpos
= s
-starts
;
1688 errmsg
= "unexpected special character";
1693 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1694 endinpos
= s
-starts
;
1695 if (unicode_decode_call_errorhandler(
1696 errors
, &errorHandler
,
1698 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1699 &unicode
, &outpos
, &p
))
1705 if (inShift
&& !consumed
) { /* in shift sequence, no more to follow */
1706 /* if we're in an inconsistent state, that's an error */
1708 (base64bits
>= 6) ||
1709 (base64bits
> 0 && base64buffer
!= 0)) {
1710 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1712 if (unicode_decode_call_errorhandler(
1713 errors
, &errorHandler
,
1714 "utf7", "unterminated shift sequence",
1715 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1716 &unicode
, &outpos
, &p
))
1724 p
= shiftOutStart
; /* back off output */
1725 *consumed
= startinpos
;
1728 *consumed
= s
-starts
;
1732 if (_PyUnicode_Resize(&unicode
, p
- PyUnicode_AS_UNICODE(unicode
)) < 0)
1735 Py_XDECREF(errorHandler
);
1737 return (PyObject
*)unicode
;
1740 Py_XDECREF(errorHandler
);
1747 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
1750 int base64WhiteSpace
,
1754 /* It might be possible to tighten this worst case */
1755 Py_ssize_t allocated
= 8 * size
;
1758 unsigned int base64bits
= 0;
1759 unsigned long base64buffer
= 0;
1763 if (allocated
/ 8 != size
)
1764 return PyErr_NoMemory();
1767 return PyString_FromStringAndSize(NULL
, 0);
1769 v
= PyString_FromStringAndSize(NULL
, allocated
);
1773 start
= out
= PyString_AS_STRING(v
);
1774 for (;i
< size
; ++i
) {
1775 Py_UNICODE ch
= s
[i
];
1778 if (ENCODE_DIRECT(ch
, !base64SetO
, !base64WhiteSpace
)) {
1780 if (base64bits
) { /* output remaining bits */
1781 *out
++ = TO_BASE64(base64buffer
<< (6-base64bits
));
1786 /* Characters not in the BASE64 set implicitly unshift the sequence
1787 so no '-' is required, except if the character is itself a '-' */
1788 if (IS_BASE64(ch
) || ch
== '-') {
1797 else { /* not in a shift sequence */
1802 else if (ENCODE_DIRECT(ch
, !base64SetO
, !base64WhiteSpace
)) {
1813 #ifdef Py_UNICODE_WIDE
1814 if (ch
>= 0x10000) {
1815 /* code first surrogate */
1817 base64buffer
= (base64buffer
<< 16) | 0xd800 | ((ch
-0x10000) >> 10);
1818 while (base64bits
>= 6) {
1819 *out
++ = TO_BASE64(base64buffer
>> (base64bits
-6));
1822 /* prepare second surrogate */
1823 ch
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
1827 base64buffer
= (base64buffer
<< 16) | ch
;
1828 while (base64bits
>= 6) {
1829 *out
++ = TO_BASE64(base64buffer
>> (base64bits
-6));
1834 *out
++= TO_BASE64(base64buffer
<< (6-base64bits
) );
1838 _PyString_Resize(&v
, out
- start
);
1845 #undef DECODE_DIRECT
1846 #undef ENCODE_DIRECT
1848 /* --- UTF-8 Codec -------------------------------------------------------- */
1851 char utf8_code_length
[256] = {
1852 /* Map UTF-8 encoded prefix byte to sequence length. zero means
1853 illegal prefix. see RFC 2279 for details */
1854 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1855 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1856 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1857 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1858 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1859 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1860 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1861 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1862 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1863 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1864 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1865 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1866 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1867 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1868 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1869 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
1872 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1876 return PyUnicode_DecodeUTF8Stateful(s
, size
, errors
, NULL
);
1879 PyObject
*PyUnicode_DecodeUTF8Stateful(const char *s
,
1882 Py_ssize_t
*consumed
)
1884 const char *starts
= s
;
1886 Py_ssize_t startinpos
;
1887 Py_ssize_t endinpos
;
1890 PyUnicodeObject
*unicode
;
1892 const char *errmsg
= "";
1893 PyObject
*errorHandler
= NULL
;
1894 PyObject
*exc
= NULL
;
1896 /* Note: size will always be longer than the resulting Unicode
1898 unicode
= _PyUnicode_New(size
);
1904 return (PyObject
*)unicode
;
1907 /* Unpack UTF-8 encoded data */
1912 Py_UCS4 ch
= (unsigned char)*s
;
1915 *p
++ = (Py_UNICODE
)ch
;
1920 n
= utf8_code_length
[ch
];
1926 errmsg
= "unexpected end of data";
1927 startinpos
= s
-starts
;
1936 errmsg
= "unexpected code byte";
1937 startinpos
= s
-starts
;
1938 endinpos
= startinpos
+1;
1942 errmsg
= "internal error";
1943 startinpos
= s
-starts
;
1944 endinpos
= startinpos
+1;
1948 if ((s
[1] & 0xc0) != 0x80) {
1949 errmsg
= "invalid data";
1950 startinpos
= s
-starts
;
1951 endinpos
= startinpos
+2;
1954 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
1956 startinpos
= s
-starts
;
1957 endinpos
= startinpos
+2;
1958 errmsg
= "illegal encoding";
1962 *p
++ = (Py_UNICODE
)ch
;
1966 if ((s
[1] & 0xc0) != 0x80 ||
1967 (s
[2] & 0xc0) != 0x80) {
1968 errmsg
= "invalid data";
1969 startinpos
= s
-starts
;
1970 endinpos
= startinpos
+3;
1973 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
1975 /* Note: UTF-8 encodings of surrogates are considered
1976 legal UTF-8 sequences;
1978 XXX For wide builds (UCS-4) we should probably try
1979 to recombine the surrogates into a single code
1982 errmsg
= "illegal encoding";
1983 startinpos
= s
-starts
;
1984 endinpos
= startinpos
+3;
1988 *p
++ = (Py_UNICODE
)ch
;
1992 if ((s
[1] & 0xc0) != 0x80 ||
1993 (s
[2] & 0xc0) != 0x80 ||
1994 (s
[3] & 0xc0) != 0x80) {
1995 errmsg
= "invalid data";
1996 startinpos
= s
-starts
;
1997 endinpos
= startinpos
+4;
2000 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
2001 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
2002 /* validate and convert to UTF-16 */
2003 if ((ch
< 0x10000) /* minimum value allowed for 4
2005 || (ch
> 0x10ffff)) /* maximum value allowed for
2008 errmsg
= "illegal encoding";
2009 startinpos
= s
-starts
;
2010 endinpos
= startinpos
+4;
2013 #ifdef Py_UNICODE_WIDE
2014 *p
++ = (Py_UNICODE
)ch
;
2016 /* compute and append the two surrogates: */
2018 /* translate from 10000..10FFFF to 0..FFFF */
2021 /* high surrogate = top 10 bits added to D800 */
2022 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
2024 /* low surrogate = bottom 10 bits added to DC00 */
2025 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
2030 /* Other sizes are only needed for UCS-4 */
2031 errmsg
= "unsupported Unicode code range";
2032 startinpos
= s
-starts
;
2033 endinpos
= startinpos
+n
;
2040 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2041 if (unicode_decode_call_errorhandler(
2042 errors
, &errorHandler
,
2044 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2045 &unicode
, &outpos
, &p
))
2049 *consumed
= s
-starts
;
2052 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2055 Py_XDECREF(errorHandler
);
2057 return (PyObject
*)unicode
;
2060 Py_XDECREF(errorHandler
);
2066 /* Allocation strategy: if the string is short, convert into a stack buffer
2067 and allocate exactly as much space needed at the end. Else allocate the
2068 maximum possible needed (4 result bytes per Unicode character), and return
2069 the excess memory at the end.
2072 PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
2076 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2078 Py_ssize_t i
; /* index into s of next input byte */
2079 PyObject
*v
; /* result string object */
2080 char *p
; /* next free byte in output buffer */
2081 Py_ssize_t nallocated
; /* number of result bytes allocated */
2082 Py_ssize_t nneeded
; /* number of result bytes needed */
2083 char stackbuf
[MAX_SHORT_UNICHARS
* 4];
2088 if (size
<= MAX_SHORT_UNICHARS
) {
2089 /* Write into the stack buffer; nallocated can't overflow.
2090 * At the end, we'll allocate exactly as much heap space as it
2091 * turns out we need.
2093 nallocated
= Py_SAFE_DOWNCAST(sizeof(stackbuf
), size_t, int);
2094 v
= NULL
; /* will allocate after we're done */
2098 /* Overallocate on the heap, and give the excess back at the end. */
2099 nallocated
= size
* 4;
2100 if (nallocated
/ 4 != size
) /* overflow! */
2101 return PyErr_NoMemory();
2102 v
= PyString_FromStringAndSize(NULL
, nallocated
);
2105 p
= PyString_AS_STRING(v
);
2108 for (i
= 0; i
< size
;) {
2109 Py_UCS4 ch
= s
[i
++];
2115 else if (ch
< 0x0800) {
2116 /* Encode Latin-1 */
2117 *p
++ = (char)(0xc0 | (ch
>> 6));
2118 *p
++ = (char)(0x80 | (ch
& 0x3f));
2121 /* Encode UCS2 Unicode ordinals */
2123 /* Special case: check for high surrogate */
2124 if (0xD800 <= ch
&& ch
<= 0xDBFF && i
!= size
) {
2126 /* Check for low surrogate and combine the two to
2127 form a UCS4 value */
2128 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2129 ch
= ((ch
- 0xD800) << 10 | (ch2
- 0xDC00)) + 0x10000;
2133 /* Fall through: handles isolated high surrogates */
2135 *p
++ = (char)(0xe0 | (ch
>> 12));
2136 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2137 *p
++ = (char)(0x80 | (ch
& 0x3f));
2141 /* Encode UCS4 Unicode ordinals */
2142 *p
++ = (char)(0xf0 | (ch
>> 18));
2143 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
2144 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2145 *p
++ = (char)(0x80 | (ch
& 0x3f));
2150 /* This was stack allocated. */
2151 nneeded
= p
- stackbuf
;
2152 assert(nneeded
<= nallocated
);
2153 v
= PyString_FromStringAndSize(stackbuf
, nneeded
);
2156 /* Cut back to size actually needed. */
2157 nneeded
= p
- PyString_AS_STRING(v
);
2158 assert(nneeded
<= nallocated
);
2159 _PyString_Resize(&v
, nneeded
);
2163 #undef MAX_SHORT_UNICHARS
2166 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
2168 if (!PyUnicode_Check(unicode
)) {
2169 PyErr_BadArgument();
2172 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
2173 PyUnicode_GET_SIZE(unicode
),
2177 /* --- UTF-32 Codec ------------------------------------------------------- */
2180 PyUnicode_DecodeUTF32(const char *s
,
2185 return PyUnicode_DecodeUTF32Stateful(s
, size
, errors
, byteorder
, NULL
);
2189 PyUnicode_DecodeUTF32Stateful(const char *s
,
2193 Py_ssize_t
*consumed
)
2195 const char *starts
= s
;
2196 Py_ssize_t startinpos
;
2197 Py_ssize_t endinpos
;
2199 PyUnicodeObject
*unicode
;
2201 #ifndef Py_UNICODE_WIDE
2204 const int pairs
= 0;
2206 const unsigned char *q
, *e
;
2207 int bo
= 0; /* assume native ordering by default */
2208 const char *errmsg
= "";
2209 /* Offsets from q for retrieving bytes in the right order. */
2210 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2211 int iorder
[] = {0, 1, 2, 3};
2213 int iorder
[] = {3, 2, 1, 0};
2215 PyObject
*errorHandler
= NULL
;
2216 PyObject
*exc
= NULL
;
2217 /* On narrow builds we split characters outside the BMP into two
2218 codepoints => count how much extra space we need. */
2219 #ifndef Py_UNICODE_WIDE
2220 for (i
= pairs
= 0; i
< size
/4; i
++)
2221 if (((Py_UCS4
*)s
)[i
] >= 0x10000)
2225 /* This might be one too many, because of a BOM */
2226 unicode
= _PyUnicode_New((size
+3)/4+pairs
);
2230 return (PyObject
*)unicode
;
2232 /* Unpack UTF-32 encoded data */
2234 q
= (unsigned char *)s
;
2240 /* Check for BOM marks (U+FEFF) in the input and adjust current
2241 byte order setting accordingly. In native mode, the leading BOM
2242 mark is skipped, in all other modes, it is copied to the output
2243 stream as-is (giving a ZWNBSP character). */
2246 const Py_UCS4 bom
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2247 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2248 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2249 if (bom
== 0x0000FEFF) {
2253 else if (bom
== 0xFFFE0000) {
2258 if (bom
== 0x0000FEFF) {
2262 else if (bom
== 0xFFFE0000) {
2287 /* remaining bytes at the end? (size should be divisible by 4) */
2291 errmsg
= "truncated data";
2292 startinpos
= ((const char *)q
)-starts
;
2293 endinpos
= ((const char *)e
)-starts
;
2295 /* The remaining input chars are ignored if the callback
2296 chooses to skip the input */
2298 ch
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2299 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2303 errmsg
= "codepoint not in range(0x110000)";
2304 startinpos
= ((const char *)q
)-starts
;
2305 endinpos
= startinpos
+4;
2308 #ifndef Py_UNICODE_WIDE
2311 *p
++ = 0xD800 | ((ch
-0x10000) >> 10);
2312 *p
++ = 0xDC00 | ((ch
-0x10000) & 0x3FF);
2320 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2321 if (unicode_decode_call_errorhandler(
2322 errors
, &errorHandler
,
2324 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
2325 &unicode
, &outpos
, &p
))
2333 *consumed
= (const char *)q
-starts
;
2336 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2339 Py_XDECREF(errorHandler
);
2341 return (PyObject
*)unicode
;
2345 Py_XDECREF(errorHandler
);
2351 PyUnicode_EncodeUTF32(const Py_UNICODE
*s
,
2358 Py_ssize_t nsize
, bytesize
;
2359 #ifndef Py_UNICODE_WIDE
2360 Py_ssize_t i
, pairs
;
2362 const int pairs
= 0;
2364 /* Offsets from p for storing byte pairs in the right order. */
2365 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2366 int iorder
[] = {0, 1, 2, 3};
2368 int iorder
[] = {3, 2, 1, 0};
2371 #define STORECHAR(CH) \
2373 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2374 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2375 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2376 p[iorder[0]] = (CH) & 0xff; \
2380 /* In narrow builds we can output surrogate pairs as one codepoint,
2381 so we need less space. */
2382 #ifndef Py_UNICODE_WIDE
2383 for (i
= pairs
= 0; i
< size
-1; i
++)
2384 if (0xD800 <= s
[i
] && s
[i
] <= 0xDBFF &&
2385 0xDC00 <= s
[i
+1] && s
[i
+1] <= 0xDFFF)
2388 nsize
= (size
- pairs
+ (byteorder
== 0));
2389 bytesize
= nsize
* 4;
2390 if (bytesize
/ 4 != nsize
)
2391 return PyErr_NoMemory();
2392 v
= PyString_FromStringAndSize(NULL
, bytesize
);
2396 p
= (unsigned char *)PyString_AS_STRING(v
);
2402 if (byteorder
== -1) {
2409 else if (byteorder
== 1) {
2417 while (size
-- > 0) {
2419 #ifndef Py_UNICODE_WIDE
2420 if (0xD800 <= ch
&& ch
<= 0xDBFF && size
> 0) {
2422 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2423 ch
= (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
2435 PyObject
*PyUnicode_AsUTF32String(PyObject
*unicode
)
2437 if (!PyUnicode_Check(unicode
)) {
2438 PyErr_BadArgument();
2441 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode
),
2442 PyUnicode_GET_SIZE(unicode
),
2447 /* --- UTF-16 Codec ------------------------------------------------------- */
2450 PyUnicode_DecodeUTF16(const char *s
,
2455 return PyUnicode_DecodeUTF16Stateful(s
, size
, errors
, byteorder
, NULL
);
2459 PyUnicode_DecodeUTF16Stateful(const char *s
,
2463 Py_ssize_t
*consumed
)
2465 const char *starts
= s
;
2466 Py_ssize_t startinpos
;
2467 Py_ssize_t endinpos
;
2469 PyUnicodeObject
*unicode
;
2471 const unsigned char *q
, *e
;
2472 int bo
= 0; /* assume native ordering by default */
2473 const char *errmsg
= "";
2474 /* Offsets from q for retrieving byte pairs in the right order. */
2475 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2476 int ihi
= 1, ilo
= 0;
2478 int ihi
= 0, ilo
= 1;
2480 PyObject
*errorHandler
= NULL
;
2481 PyObject
*exc
= NULL
;
2483 /* Note: size will always be longer than the resulting Unicode
2485 unicode
= _PyUnicode_New(size
);
2489 return (PyObject
*)unicode
;
2491 /* Unpack UTF-16 encoded data */
2493 q
= (unsigned char *)s
;
2499 /* Check for BOM marks (U+FEFF) in the input and adjust current
2500 byte order setting accordingly. In native mode, the leading BOM
2501 mark is skipped, in all other modes, it is copied to the output
2502 stream as-is (giving a ZWNBSP character). */
2505 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
2506 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2507 if (bom
== 0xFEFF) {
2511 else if (bom
== 0xFFFE) {
2516 if (bom
== 0xFEFF) {
2520 else if (bom
== 0xFFFE) {
2541 /* remaining bytes at the end? (size should be even) */
2545 errmsg
= "truncated data";
2546 startinpos
= ((const char *)q
)-starts
;
2547 endinpos
= ((const char *)e
)-starts
;
2549 /* The remaining input chars are ignored if the callback
2550 chooses to skip the input */
2552 ch
= (q
[ihi
] << 8) | q
[ilo
];
2556 if (ch
< 0xD800 || ch
> 0xDFFF) {
2561 /* UTF-16 code pair: */
2563 errmsg
= "unexpected end of data";
2564 startinpos
= (((const char *)q
)-2)-starts
;
2565 endinpos
= ((const char *)e
)-starts
;
2568 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
2569 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
2571 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2572 #ifndef Py_UNICODE_WIDE
2576 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
2581 errmsg
= "illegal UTF-16 surrogate";
2582 startinpos
= (((const char *)q
)-4)-starts
;
2583 endinpos
= startinpos
+2;
2588 errmsg
= "illegal encoding";
2589 startinpos
= (((const char *)q
)-2)-starts
;
2590 endinpos
= startinpos
+2;
2591 /* Fall through to report the error */
2594 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2595 if (unicode_decode_call_errorhandler(
2596 errors
, &errorHandler
,
2598 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
2599 &unicode
, &outpos
, &p
))
2607 *consumed
= (const char *)q
-starts
;
2610 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2613 Py_XDECREF(errorHandler
);
2615 return (PyObject
*)unicode
;
2619 Py_XDECREF(errorHandler
);
2625 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
2632 Py_ssize_t nsize
, bytesize
;
2633 #ifdef Py_UNICODE_WIDE
2634 Py_ssize_t i
, pairs
;
2636 const int pairs
= 0;
2638 /* Offsets from p for storing byte pairs in the right order. */
2639 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2640 int ihi
= 1, ilo
= 0;
2642 int ihi
= 0, ilo
= 1;
2645 #define STORECHAR(CH) \
2647 p[ihi] = ((CH) >> 8) & 0xff; \
2648 p[ilo] = (CH) & 0xff; \
2652 #ifdef Py_UNICODE_WIDE
2653 for (i
= pairs
= 0; i
< size
; i
++)
2654 if (s
[i
] >= 0x10000)
2657 /* 2 * (size + pairs + (byteorder == 0)) */
2658 if (size
> PY_SSIZE_T_MAX
||
2659 size
> PY_SSIZE_T_MAX
- pairs
- (byteorder
== 0))
2660 return PyErr_NoMemory();
2661 nsize
= size
+ pairs
+ (byteorder
== 0);
2662 bytesize
= nsize
* 2;
2663 if (bytesize
/ 2 != nsize
)
2664 return PyErr_NoMemory();
2665 v
= PyString_FromStringAndSize(NULL
, bytesize
);
2669 p
= (unsigned char *)PyString_AS_STRING(v
);
2675 if (byteorder
== -1) {
2680 else if (byteorder
== 1) {
2686 while (size
-- > 0) {
2687 Py_UNICODE ch
= *s
++;
2689 #ifdef Py_UNICODE_WIDE
2690 if (ch
>= 0x10000) {
2691 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
2692 ch
= 0xD800 | ((ch
-0x10000) >> 10);
2703 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
2705 if (!PyUnicode_Check(unicode
)) {
2706 PyErr_BadArgument();
2709 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
2710 PyUnicode_GET_SIZE(unicode
),
2715 /* --- Unicode Escape Codec ----------------------------------------------- */
2717 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
2719 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
2723 const char *starts
= s
;
2724 Py_ssize_t startinpos
;
2725 Py_ssize_t endinpos
;
2732 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
2733 PyObject
*errorHandler
= NULL
;
2734 PyObject
*exc
= NULL
;
2736 /* Escaped strings will always be longer than the resulting
2737 Unicode string, so we start with size here and then reduce the
2738 length after conversion to the true value.
2739 (but if the error callback returns a long replacement string
2740 we'll have to allocate more space) */
2741 v
= _PyUnicode_New(size
);
2745 return (PyObject
*)v
;
2747 p
= PyUnicode_AS_UNICODE(v
);
2755 /* Non-escape characters are interpreted as Unicode ordinals */
2757 *p
++ = (unsigned char) *s
++;
2761 startinpos
= s
-starts
;
2766 c
= '\0'; /* Invalid after \ */
2771 case '\\': *p
++ = '\\'; break;
2772 case '\'': *p
++ = '\''; break;
2773 case '\"': *p
++ = '\"'; break;
2774 case 'b': *p
++ = '\b'; break;
2775 case 'f': *p
++ = '\014'; break; /* FF */
2776 case 't': *p
++ = '\t'; break;
2777 case 'n': *p
++ = '\n'; break;
2778 case 'r': *p
++ = '\r'; break;
2779 case 'v': *p
++ = '\013'; break; /* VT */
2780 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
2782 /* \OOO (octal) escapes */
2783 case '0': case '1': case '2': case '3':
2784 case '4': case '5': case '6': case '7':
2786 if (s
< end
&& '0' <= *s
&& *s
<= '7') {
2787 x
= (x
<<3) + *s
++ - '0';
2788 if (s
< end
&& '0' <= *s
&& *s
<= '7')
2789 x
= (x
<<3) + *s
++ - '0';
2798 message
= "truncated \\xXX escape";
2804 message
= "truncated \\uXXXX escape";
2810 message
= "truncated \\UXXXXXXXX escape";
2813 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2816 if (unicode_decode_call_errorhandler(
2817 errors
, &errorHandler
,
2818 "unicodeescape", "end of string in escape sequence",
2819 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2824 for (i
= 0; i
< digits
; ++i
) {
2825 c
= (unsigned char) s
[i
];
2827 endinpos
= (s
+i
+1)-starts
;
2828 if (unicode_decode_call_errorhandler(
2829 errors
, &errorHandler
,
2830 "unicodeescape", message
,
2831 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2836 chr
= (chr
<<4) & ~0xF;
2837 if (c
>= '0' && c
<= '9')
2839 else if (c
>= 'a' && c
<= 'f')
2840 chr
+= 10 + c
- 'a';
2842 chr
+= 10 + c
- 'A';
2845 if (chr
== 0xffffffff && PyErr_Occurred())
2846 /* _decoding_error will have already written into the
2850 /* when we get here, chr is a 32-bit unicode character */
2852 /* UCS-2 character */
2853 *p
++ = (Py_UNICODE
) chr
;
2854 else if (chr
<= 0x10ffff) {
2855 /* UCS-4 character. Either store directly, or as
2857 #ifdef Py_UNICODE_WIDE
2861 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
2862 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
2865 endinpos
= s
-starts
;
2866 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2867 if (unicode_decode_call_errorhandler(
2868 errors
, &errorHandler
,
2869 "unicodeescape", "illegal Unicode character",
2870 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2878 message
= "malformed \\N character escape";
2879 if (ucnhash_CAPI
== NULL
) {
2880 /* load the unicode data module */
2882 m
= PyImport_ImportModuleNoBlock("unicodedata");
2885 api
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
2889 ucnhash_CAPI
= (_PyUnicode_Name_CAPI
*)PyCObject_AsVoidPtr(api
);
2891 if (ucnhash_CAPI
== NULL
)
2895 const char *start
= s
+1;
2896 /* look for the closing brace */
2897 while (*s
!= '}' && s
< end
)
2899 if (s
> start
&& s
< end
&& *s
== '}') {
2900 /* found a name. look it up in the unicode database */
2901 message
= "unknown Unicode character name";
2903 if (ucnhash_CAPI
->getcode(NULL
, start
, (int)(s
-start
-1), &chr
))
2907 endinpos
= s
-starts
;
2908 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2909 if (unicode_decode_call_errorhandler(
2910 errors
, &errorHandler
,
2911 "unicodeescape", message
,
2912 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2919 message
= "\\ at end of string";
2921 endinpos
= s
-starts
;
2922 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2923 if (unicode_decode_call_errorhandler(
2924 errors
, &errorHandler
,
2925 "unicodeescape", message
,
2926 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2932 *p
++ = (unsigned char)s
[-1];
2939 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2941 Py_XDECREF(errorHandler
);
2943 return (PyObject
*)v
;
2948 "\\N escapes not supported (can't load unicodedata module)"
2951 Py_XDECREF(errorHandler
);
2957 Py_XDECREF(errorHandler
);
2962 /* Return a Unicode-Escape string version of the Unicode object.
2964 If quotes is true, the string is enclosed in u"" or u'' quotes as
2969 Py_LOCAL_INLINE(const Py_UNICODE
*) findchar(const Py_UNICODE
*s
,
2973 /* like wcschr, but doesn't stop at NUL characters */
2975 while (size
-- > 0) {
2985 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
2992 static const char *hexdigit
= "0123456789abcdef";
2993 #ifdef Py_UNICODE_WIDE
2994 const Py_ssize_t expandsize
= 10;
2996 const Py_ssize_t expandsize
= 6;
2999 /* XXX(nnorwitz): rather than over-allocating, it would be
3000 better to choose a different scheme. Perhaps scan the
3001 first N-chars of the string and allocate based on that size.
3003 /* Initial allocation is based on the longest-possible unichr
3006 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007 unichr, so in this case it's the longest unichr escape. In
3008 narrow (UTF-16) builds this is five chars per source unichr
3009 since there are two unichrs in the surrogate pair, so in narrow
3010 (UTF-16) builds it's not the longest unichr escape.
3012 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013 so in the narrow (UTF-16) build case it's the longest unichr
3017 if (size
> (PY_SSIZE_T_MAX
- 2 - 1) / expandsize
)
3018 return PyErr_NoMemory();
3020 repr
= PyString_FromStringAndSize(NULL
,
3027 p
= PyString_AS_STRING(repr
);
3031 *p
++ = (findchar(s
, size
, '\'') &&
3032 !findchar(s
, size
, '"')) ? '"' : '\'';
3034 while (size
-- > 0) {
3035 Py_UNICODE ch
= *s
++;
3037 /* Escape quotes and backslashes */
3039 ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1]) || ch
== '\\') {
3045 #ifdef Py_UNICODE_WIDE
3046 /* Map 21-bit characters to '\U00xxxxxx' */
3047 else if (ch
>= 0x10000) {
3050 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
3051 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
3052 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
3053 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
3054 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
3055 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
3056 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
3057 *p
++ = hexdigit
[ch
& 0x0000000F];
3061 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062 else if (ch
>= 0xD800 && ch
< 0xDC00) {
3068 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
3069 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
3072 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
3073 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
3074 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
3075 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
3076 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
3077 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
3078 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
3079 *p
++ = hexdigit
[ucs
& 0x0000000F];
3082 /* Fall through: isolated surrogates are copied as-is */
3088 /* Map 16-bit characters to '\uxxxx' */
3092 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
3093 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
3094 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
3095 *p
++ = hexdigit
[ch
& 0x000F];
3098 /* Map special whitespace to '\t', '\n', '\r' */
3099 else if (ch
== '\t') {
3103 else if (ch
== '\n') {
3107 else if (ch
== '\r') {
3112 /* Map non-printable US ASCII to '\xhh' */
3113 else if (ch
< ' ' || ch
>= 0x7F) {
3116 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
3117 *p
++ = hexdigit
[ch
& 0x000F];
3120 /* Copy everything else as-is */
3125 *p
++ = PyString_AS_STRING(repr
)[1];
3128 _PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
));
3132 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
3135 return unicodeescape_string(s
, size
, 0);
3138 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
3140 if (!PyUnicode_Check(unicode
)) {
3141 PyErr_BadArgument();
3144 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3145 PyUnicode_GET_SIZE(unicode
));
3148 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3150 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
3154 const char *starts
= s
;
3155 Py_ssize_t startinpos
;
3156 Py_ssize_t endinpos
;
3162 PyObject
*errorHandler
= NULL
;
3163 PyObject
*exc
= NULL
;
3165 /* Escaped strings are never shorter than the resulting
3166 Unicode string, so we start with size here and then reduce the
3167 length after conversion to the true value. (But decoding error
3168 handler might have to resize the string) */
3169 v
= _PyUnicode_New(size
);
3173 return (PyObject
*)v
;
3174 p
= PyUnicode_AS_UNICODE(v
);
3182 /* Non-escape characters are interpreted as Unicode ordinals */
3184 *p
++ = (unsigned char)*s
++;
3187 startinpos
= s
-starts
;
3189 /* \u-escapes are only interpreted iff the number of leading
3190 backslashes is odd */
3195 *p
++ = (unsigned char)*s
++;
3197 if (((s
- bs
) & 1) == 0 ||
3199 (*s
!= 'u' && *s
!= 'U')) {
3203 count
= *s
=='u' ? 4 : 8;
3206 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3207 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3208 for (x
= 0, i
= 0; i
< count
; ++i
, ++s
) {
3209 c
= (unsigned char)*s
;
3211 endinpos
= s
-starts
;
3212 if (unicode_decode_call_errorhandler(
3213 errors
, &errorHandler
,
3214 "rawunicodeescape", "truncated \\uXXXX",
3215 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3221 if (c
>= '0' && c
<= '9')
3223 else if (c
>= 'a' && c
<= 'f')
3229 /* UCS-2 character */
3230 *p
++ = (Py_UNICODE
) x
;
3231 else if (x
<= 0x10ffff) {
3232 /* UCS-4 character. Either store directly, or as
3234 #ifdef Py_UNICODE_WIDE
3235 *p
++ = (Py_UNICODE
) x
;
3238 *p
++ = 0xD800 + (Py_UNICODE
) (x
>> 10);
3239 *p
++ = 0xDC00 + (Py_UNICODE
) (x
& 0x03FF);
3242 endinpos
= s
-starts
;
3243 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3244 if (unicode_decode_call_errorhandler(
3245 errors
, &errorHandler
,
3246 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3247 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3254 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3256 Py_XDECREF(errorHandler
);
3258 return (PyObject
*)v
;
3262 Py_XDECREF(errorHandler
);
3267 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
3274 static const char *hexdigit
= "0123456789abcdef";
3275 #ifdef Py_UNICODE_WIDE
3276 const Py_ssize_t expandsize
= 10;
3278 const Py_ssize_t expandsize
= 6;
3281 if (size
> PY_SSIZE_T_MAX
/ expandsize
)
3282 return PyErr_NoMemory();
3284 repr
= PyString_FromStringAndSize(NULL
, expandsize
* size
);
3290 p
= q
= PyString_AS_STRING(repr
);
3291 while (size
-- > 0) {
3292 Py_UNICODE ch
= *s
++;
3293 #ifdef Py_UNICODE_WIDE
3294 /* Map 32-bit characters to '\Uxxxxxxxx' */
3295 if (ch
>= 0x10000) {
3298 *p
++ = hexdigit
[(ch
>> 28) & 0xf];
3299 *p
++ = hexdigit
[(ch
>> 24) & 0xf];
3300 *p
++ = hexdigit
[(ch
>> 20) & 0xf];
3301 *p
++ = hexdigit
[(ch
>> 16) & 0xf];
3302 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3303 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3304 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3305 *p
++ = hexdigit
[ch
& 15];
3309 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3310 if (ch
>= 0xD800 && ch
< 0xDC00) {
3316 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
3317 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
3320 *p
++ = hexdigit
[(ucs
>> 28) & 0xf];
3321 *p
++ = hexdigit
[(ucs
>> 24) & 0xf];
3322 *p
++ = hexdigit
[(ucs
>> 20) & 0xf];
3323 *p
++ = hexdigit
[(ucs
>> 16) & 0xf];
3324 *p
++ = hexdigit
[(ucs
>> 12) & 0xf];
3325 *p
++ = hexdigit
[(ucs
>> 8) & 0xf];
3326 *p
++ = hexdigit
[(ucs
>> 4) & 0xf];
3327 *p
++ = hexdigit
[ucs
& 0xf];
3330 /* Fall through: isolated surrogates are copied as-is */
3335 /* Map 16-bit characters to '\uxxxx' */
3339 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3340 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3341 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3342 *p
++ = hexdigit
[ch
& 15];
3344 /* Copy everything else as-is */
3349 _PyString_Resize(&repr
, p
- q
);
3353 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
3355 if (!PyUnicode_Check(unicode
)) {
3356 PyErr_BadArgument();
3359 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3360 PyUnicode_GET_SIZE(unicode
));
3363 /* --- Unicode Internal Codec ------------------------------------------- */
3365 PyObject
*_PyUnicode_DecodeUnicodeInternal(const char *s
,
3369 const char *starts
= s
;
3370 Py_ssize_t startinpos
;
3371 Py_ssize_t endinpos
;
3377 PyObject
*errorHandler
= NULL
;
3378 PyObject
*exc
= NULL
;
3380 #ifdef Py_UNICODE_WIDE
3381 Py_UNICODE unimax
= PyUnicode_GetMax();
3384 /* XXX overflow detection missing */
3385 v
= _PyUnicode_New((size
+Py_UNICODE_SIZE
-1)/ Py_UNICODE_SIZE
);
3388 if (PyUnicode_GetSize((PyObject
*)v
) == 0)
3389 return (PyObject
*)v
;
3390 p
= PyUnicode_AS_UNICODE(v
);
3394 memcpy(p
, s
, sizeof(Py_UNICODE
));
3395 /* We have to sanity check the raw data, otherwise doom looms for
3396 some malformed UCS-4 data. */
3398 #ifdef Py_UNICODE_WIDE
3399 *p
> unimax
|| *p
< 0 ||
3401 end
-s
< Py_UNICODE_SIZE
3404 startinpos
= s
- starts
;
3405 if (end
-s
< Py_UNICODE_SIZE
) {
3406 endinpos
= end
-starts
;
3407 reason
= "truncated input";
3410 endinpos
= s
- starts
+ Py_UNICODE_SIZE
;
3411 reason
= "illegal code point (> 0x10FFFF)";
3413 outpos
= p
- PyUnicode_AS_UNICODE(v
);
3414 if (unicode_decode_call_errorhandler(
3415 errors
, &errorHandler
,
3416 "unicode_internal", reason
,
3417 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3424 s
+= Py_UNICODE_SIZE
;
3428 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3430 Py_XDECREF(errorHandler
);
3432 return (PyObject
*)v
;
3436 Py_XDECREF(errorHandler
);
3441 /* --- Latin-1 Codec ------------------------------------------------------ */
3443 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
3450 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3452 Py_UNICODE r
= *(unsigned char*)s
;
3453 return PyUnicode_FromUnicode(&r
, 1);
3456 v
= _PyUnicode_New(size
);
3460 return (PyObject
*)v
;
3461 p
= PyUnicode_AS_UNICODE(v
);
3463 *p
++ = (unsigned char)*s
++;
3464 return (PyObject
*)v
;
3471 /* create or adjust a UnicodeEncodeError */
3472 static void make_encode_exception(PyObject
**exceptionObject
,
3473 const char *encoding
,
3474 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3475 Py_ssize_t startpos
, Py_ssize_t endpos
,
3478 if (*exceptionObject
== NULL
) {
3479 *exceptionObject
= PyUnicodeEncodeError_Create(
3480 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3483 if (PyUnicodeEncodeError_SetStart(*exceptionObject
, startpos
))
3485 if (PyUnicodeEncodeError_SetEnd(*exceptionObject
, endpos
))
3487 if (PyUnicodeEncodeError_SetReason(*exceptionObject
, reason
))
3491 Py_DECREF(*exceptionObject
);
3492 *exceptionObject
= NULL
;
3496 /* raises a UnicodeEncodeError */
3497 static void raise_encode_exception(PyObject
**exceptionObject
,
3498 const char *encoding
,
3499 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3500 Py_ssize_t startpos
, Py_ssize_t endpos
,
3503 make_encode_exception(exceptionObject
,
3504 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3505 if (*exceptionObject
!= NULL
)
3506 PyCodec_StrictErrors(*exceptionObject
);
3509 /* error handling callback helper:
3510 build arguments, call the callback and check the arguments,
3511 put the result into newpos and return the replacement string, which
3512 has to be freed by the caller */
3513 static PyObject
*unicode_encode_call_errorhandler(const char *errors
,
3514 PyObject
**errorHandler
,
3515 const char *encoding
, const char *reason
,
3516 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
3517 Py_ssize_t startpos
, Py_ssize_t endpos
,
3520 static char *argparse
= "O!n;encoding error handler must return (unicode, int) tuple";
3523 PyObject
*resunicode
;
3525 if (*errorHandler
== NULL
) {
3526 *errorHandler
= PyCodec_LookupError(errors
);
3527 if (*errorHandler
== NULL
)
3531 make_encode_exception(exceptionObject
,
3532 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3533 if (*exceptionObject
== NULL
)
3536 restuple
= PyObject_CallFunctionObjArgs(
3537 *errorHandler
, *exceptionObject
, NULL
);
3538 if (restuple
== NULL
)
3540 if (!PyTuple_Check(restuple
)) {
3541 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
3542 Py_DECREF(restuple
);
3545 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
3546 &resunicode
, newpos
)) {
3547 Py_DECREF(restuple
);
3551 *newpos
= size
+*newpos
;
3552 if (*newpos
<0 || *newpos
>size
) {
3553 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
3554 Py_DECREF(restuple
);
3557 Py_INCREF(resunicode
);
3558 Py_DECREF(restuple
);
3562 static PyObject
*unicode_encode_ucs1(const Py_UNICODE
*p
,
3569 /* pointers to the beginning and end+1 of input */
3570 const Py_UNICODE
*startp
= p
;
3571 const Py_UNICODE
*endp
= p
+ size
;
3572 /* pointer to the beginning of the unencodable characters */
3573 /* const Py_UNICODE *badp = NULL; */
3574 /* pointer into the output */
3576 /* current output position */
3577 Py_ssize_t respos
= 0;
3579 const char *encoding
= (limit
== 256) ? "latin-1" : "ascii";
3580 const char *reason
= (limit
== 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3581 PyObject
*errorHandler
= NULL
;
3582 PyObject
*exc
= NULL
;
3583 /* the following variable is used for caching string comparisons
3584 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3585 int known_errorHandler
= -1;
3587 /* allocate enough for a simple encoding without
3588 replacements, if we need more, we'll resize */
3589 res
= PyString_FromStringAndSize(NULL
, size
);
3594 str
= PyString_AS_STRING(res
);
3600 /* can we encode this? */
3602 /* no overflow check, because we know that the space is enough */
3607 Py_ssize_t unicodepos
= p
-startp
;
3608 Py_ssize_t requiredsize
;
3609 PyObject
*repunicode
;
3614 /* startpos for collecting unencodable chars */
3615 const Py_UNICODE
*collstart
= p
;
3616 const Py_UNICODE
*collend
= p
;
3617 /* find all unencodable characters */
3618 while ((collend
< endp
) && ((*collend
)>=limit
))
3620 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3621 if (known_errorHandler
==-1) {
3622 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3623 known_errorHandler
= 1;
3624 else if (!strcmp(errors
, "replace"))
3625 known_errorHandler
= 2;
3626 else if (!strcmp(errors
, "ignore"))
3627 known_errorHandler
= 3;
3628 else if (!strcmp(errors
, "xmlcharrefreplace"))
3629 known_errorHandler
= 4;
3631 known_errorHandler
= 0;
3633 switch (known_errorHandler
) {
3634 case 1: /* strict */
3635 raise_encode_exception(&exc
, encoding
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
3637 case 2: /* replace */
3638 while (collstart
++<collend
)
3639 *str
++ = '?'; /* fall through */
3640 case 3: /* ignore */
3643 case 4: /* xmlcharrefreplace */
3644 respos
= str
-PyString_AS_STRING(res
);
3645 /* determine replacement size (temporarily (mis)uses p) */
3646 for (p
= collstart
, repsize
= 0; p
< collend
; ++p
) {
3655 #ifndef Py_UNICODE_WIDE
3661 else if (*p
<1000000)
3667 requiredsize
= respos
+repsize
+(endp
-collend
);
3668 if (requiredsize
> ressize
) {
3669 if (requiredsize
<2*ressize
)
3670 requiredsize
= 2*ressize
;
3671 if (_PyString_Resize(&res
, requiredsize
))
3673 str
= PyString_AS_STRING(res
) + respos
;
3674 ressize
= requiredsize
;
3676 /* generate replacement (temporarily (mis)uses p) */
3677 for (p
= collstart
; p
< collend
; ++p
) {
3678 str
+= sprintf(str
, "&#%d;", (int)*p
);
3683 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
3684 encoding
, reason
, startp
, size
, &exc
,
3685 collstart
-startp
, collend
-startp
, &newpos
);
3686 if (repunicode
== NULL
)
3688 /* need more space? (at least enough for what we have+the
3689 replacement+the rest of the string, so we won't have to
3690 check space for encodable characters) */
3691 respos
= str
-PyString_AS_STRING(res
);
3692 repsize
= PyUnicode_GET_SIZE(repunicode
);
3693 requiredsize
= respos
+repsize
+(endp
-collend
);
3694 if (requiredsize
> ressize
) {
3695 if (requiredsize
<2*ressize
)
3696 requiredsize
= 2*ressize
;
3697 if (_PyString_Resize(&res
, requiredsize
)) {
3698 Py_DECREF(repunicode
);
3701 str
= PyString_AS_STRING(res
) + respos
;
3702 ressize
= requiredsize
;
3704 /* check if there is anything unencodable in the replacement
3705 and copy it to the output */
3706 for (uni2
= PyUnicode_AS_UNICODE(repunicode
);repsize
-->0; ++uni2
, ++str
) {
3709 raise_encode_exception(&exc
, encoding
, startp
, size
,
3710 unicodepos
, unicodepos
+1, reason
);
3711 Py_DECREF(repunicode
);
3716 p
= startp
+ newpos
;
3717 Py_DECREF(repunicode
);
3721 /* Resize if we allocated too much */
3722 respos
= str
-PyString_AS_STRING(res
);
3724 /* If this fails, res will be NULL */
3725 _PyString_Resize(&res
, respos
);
3726 Py_XDECREF(errorHandler
);
3732 Py_XDECREF(errorHandler
);
3737 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
3741 return unicode_encode_ucs1(p
, size
, errors
, 256);
3744 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
3746 if (!PyUnicode_Check(unicode
)) {
3747 PyErr_BadArgument();
3750 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
3751 PyUnicode_GET_SIZE(unicode
),
3755 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3757 PyObject
*PyUnicode_DecodeASCII(const char *s
,
3761 const char *starts
= s
;
3764 Py_ssize_t startinpos
;
3765 Py_ssize_t endinpos
;
3768 PyObject
*errorHandler
= NULL
;
3769 PyObject
*exc
= NULL
;
3771 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3772 if (size
== 1 && *(unsigned char*)s
< 128) {
3773 Py_UNICODE r
= *(unsigned char*)s
;
3774 return PyUnicode_FromUnicode(&r
, 1);
3777 v
= _PyUnicode_New(size
);
3781 return (PyObject
*)v
;
3782 p
= PyUnicode_AS_UNICODE(v
);
3785 register unsigned char c
= (unsigned char)*s
;
3791 startinpos
= s
-starts
;
3792 endinpos
= startinpos
+ 1;
3793 outpos
= p
- (Py_UNICODE
*)PyUnicode_AS_UNICODE(v
);
3794 if (unicode_decode_call_errorhandler(
3795 errors
, &errorHandler
,
3796 "ascii", "ordinal not in range(128)",
3797 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3802 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
3803 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3805 Py_XDECREF(errorHandler
);
3807 return (PyObject
*)v
;
3811 Py_XDECREF(errorHandler
);
3816 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
3820 return unicode_encode_ucs1(p
, size
, errors
, 128);
3823 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
3825 if (!PyUnicode_Check(unicode
)) {
3826 PyErr_BadArgument();
3829 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
3830 PyUnicode_GET_SIZE(unicode
),
3834 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3836 /* --- MBCS codecs for Windows -------------------------------------------- */
3838 #if SIZEOF_INT < SIZEOF_SIZE_T
3842 /* XXX This code is limited to "true" double-byte encodings, as
3843 a) it assumes an incomplete character consists of a single byte, and
3844 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3845 encodings, see IsDBCSLeadByteEx documentation. */
3847 static int is_dbcs_lead_byte(const char *s
, int offset
)
3849 const char *curr
= s
+ offset
;
3851 if (IsDBCSLeadByte(*curr
)) {
3852 const char *prev
= CharPrev(s
, curr
);
3853 return (prev
== curr
) || !IsDBCSLeadByte(*prev
) || (curr
- prev
== 2);
3859 * Decode MBCS string into unicode object. If 'final' is set, converts
3860 * trailing lead-byte too. Returns the consumed size on success, -1 otherwise.
3862 static int decode_mbcs(PyUnicodeObject
**v
,
3863 const char *s
, /* MBCS string */
3864 int size
, /* sizeof MBCS string */
3873 /* Skip trailing lead-byte unless 'final' is set */
3874 if (!final
&& size
>= 1 && is_dbcs_lead_byte(s
, size
- 1))
3877 /* First get the size of the result */
3879 usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
3881 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3887 /* Create unicode object */
3888 *v
= _PyUnicode_New(usize
);
3893 /* Extend unicode object */
3894 n
= PyUnicode_GET_SIZE(*v
);
3895 if (_PyUnicode_Resize(v
, n
+ usize
) < 0)
3899 /* Do the conversion */
3901 p
= PyUnicode_AS_UNICODE(*v
) + n
;
3902 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
3903 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3911 PyObject
*PyUnicode_DecodeMBCSStateful(const char *s
,
3914 Py_ssize_t
*consumed
)
3916 PyUnicodeObject
*v
= NULL
;
3925 done
= decode_mbcs(&v
, s
, INT_MAX
, 0);
3928 done
= decode_mbcs(&v
, s
, (int)size
, !consumed
);
3939 if (size
> INT_MAX
) {
3946 return (PyObject
*)v
;
3949 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
3953 return PyUnicode_DecodeMBCSStateful(s
, size
, errors
, NULL
);
3957 * Convert unicode into string object (MBCS).
3958 * Returns 0 on success, -1 otherwise.
3960 static int encode_mbcs(PyObject
**repr
,
3961 const Py_UNICODE
*p
, /* unicode */
3962 int size
) /* size of unicode */
3969 /* First get the size of the result */
3971 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
3972 if (mbcssize
== 0) {
3973 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3978 if (*repr
== NULL
) {
3979 /* Create string object */
3980 *repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
3985 /* Extend string object */
3986 n
= PyString_Size(*repr
);
3987 if (_PyString_Resize(repr
, n
+ mbcssize
) < 0)
3991 /* Do the conversion */
3993 char *s
= PyString_AS_STRING(*repr
) + n
;
3994 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
3995 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
4003 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
4007 PyObject
*repr
= NULL
;
4013 ret
= encode_mbcs(&repr
, p
, INT_MAX
);
4016 ret
= encode_mbcs(&repr
, p
, (int)size
);
4024 if (size
> INT_MAX
) {
4034 PyObject
*PyUnicode_AsMBCSString(PyObject
*unicode
)
4036 if (!PyUnicode_Check(unicode
)) {
4037 PyErr_BadArgument();
4040 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode
),
4041 PyUnicode_GET_SIZE(unicode
),
4047 #endif /* MS_WINDOWS */
4049 /* --- Character Mapping Codec -------------------------------------------- */
4051 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
4056 const char *starts
= s
;
4057 Py_ssize_t startinpos
;
4058 Py_ssize_t endinpos
;
4063 Py_ssize_t extrachars
= 0;
4064 PyObject
*errorHandler
= NULL
;
4065 PyObject
*exc
= NULL
;
4066 Py_UNICODE
*mapstring
= NULL
;
4067 Py_ssize_t maplen
= 0;
4069 /* Default to Latin-1 */
4070 if (mapping
== NULL
)
4071 return PyUnicode_DecodeLatin1(s
, size
, errors
);
4073 v
= _PyUnicode_New(size
);
4077 return (PyObject
*)v
;
4078 p
= PyUnicode_AS_UNICODE(v
);
4080 if (PyUnicode_CheckExact(mapping
)) {
4081 mapstring
= PyUnicode_AS_UNICODE(mapping
);
4082 maplen
= PyUnicode_GET_SIZE(mapping
);
4084 unsigned char ch
= *s
;
4085 Py_UNICODE x
= 0xfffe; /* illegal value */
4091 /* undefined mapping */
4092 outpos
= p
-PyUnicode_AS_UNICODE(v
);
4093 startinpos
= s
-starts
;
4094 endinpos
= startinpos
+1;
4095 if (unicode_decode_call_errorhandler(
4096 errors
, &errorHandler
,
4097 "charmap", "character maps to <undefined>",
4098 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
4110 unsigned char ch
= *s
;
4113 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4114 w
= PyInt_FromLong((long)ch
);
4117 x
= PyObject_GetItem(mapping
, w
);
4120 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4121 /* No mapping found means: mapping is undefined. */
4130 if (PyInt_Check(x
)) {
4131 long value
= PyInt_AS_LONG(x
);
4132 if (value
< 0 || value
> 65535) {
4133 PyErr_SetString(PyExc_TypeError
,
4134 "character mapping must be in range(65536)");
4138 *p
++ = (Py_UNICODE
)value
;
4140 else if (x
== Py_None
) {
4141 /* undefined mapping */
4142 outpos
= p
-PyUnicode_AS_UNICODE(v
);
4143 startinpos
= s
-starts
;
4144 endinpos
= startinpos
+1;
4145 if (unicode_decode_call_errorhandler(
4146 errors
, &errorHandler
,
4147 "charmap", "character maps to <undefined>",
4148 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
4156 else if (PyUnicode_Check(x
)) {
4157 Py_ssize_t targetsize
= PyUnicode_GET_SIZE(x
);
4159 if (targetsize
== 1)
4161 *p
++ = *PyUnicode_AS_UNICODE(x
);
4163 else if (targetsize
> 1) {
4165 if (targetsize
> extrachars
) {
4167 Py_ssize_t oldpos
= p
- PyUnicode_AS_UNICODE(v
);
4168 Py_ssize_t needed
= (targetsize
- extrachars
) + \
4170 extrachars
+= needed
;
4171 /* XXX overflow detection missing */
4172 if (_PyUnicode_Resize(&v
,
4173 PyUnicode_GET_SIZE(v
) + needed
) < 0) {
4177 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
4180 PyUnicode_AS_UNICODE(x
),
4183 extrachars
-= targetsize
;
4185 /* 1-0 mapping: skip the character */
4188 /* wrong return value */
4189 PyErr_SetString(PyExc_TypeError
,
4190 "character mapping must return integer, None or unicode");
4198 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
4199 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
4201 Py_XDECREF(errorHandler
);
4203 return (PyObject
*)v
;
4206 Py_XDECREF(errorHandler
);
4212 /* Charmap encoding: the lookup table */
4214 struct encoding_map
{
4216 unsigned char level1
[32];
4218 unsigned char level23
[1];
4222 encoding_map_size(PyObject
*obj
, PyObject
* args
)
4224 struct encoding_map
*map
= (struct encoding_map
*)obj
;
4225 return PyInt_FromLong(sizeof(*map
) - 1 + 16*map
->count2
+
4229 static PyMethodDef encoding_map_methods
[] = {
4230 {"size", encoding_map_size
, METH_NOARGS
,
4231 PyDoc_STR("Return the size (in bytes) of this object") },
4236 encoding_map_dealloc(PyObject
* o
)
4241 static PyTypeObject EncodingMapType
= {
4242 PyVarObject_HEAD_INIT(NULL
, 0)
4243 "EncodingMap", /*tp_name*/
4244 sizeof(struct encoding_map
), /*tp_basicsize*/
4247 encoding_map_dealloc
, /*tp_dealloc*/
4254 0, /*tp_as_sequence*/
4255 0, /*tp_as_mapping*/
4262 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
4266 0, /*tp_richcompare*/
4267 0, /*tp_weaklistoffset*/
4270 encoding_map_methods
, /*tp_methods*/
4277 0, /*tp_dictoffset*/
4286 PyUnicode_BuildEncodingMap(PyObject
* string
)
4290 struct encoding_map
*mresult
;
4293 unsigned char level1
[32];
4294 unsigned char level2
[512];
4295 unsigned char *mlevel1
, *mlevel2
, *mlevel3
;
4296 int count2
= 0, count3
= 0;
4298 if (!PyUnicode_Check(string
) || PyUnicode_GetSize(string
) != 256) {
4299 PyErr_BadArgument();
4302 decode
= PyUnicode_AS_UNICODE(string
);
4303 memset(level1
, 0xFF, sizeof level1
);
4304 memset(level2
, 0xFF, sizeof level2
);
4306 /* If there isn't a one-to-one mapping of NULL to \0,
4307 or if there are non-BMP characters, we need to use
4308 a mapping dictionary. */
4311 for (i
= 1; i
< 256; i
++) {
4314 #ifdef Py_UNICODE_WIDE
4315 || decode
[i
] > 0xFFFF
4321 if (decode
[i
] == 0xFFFE)
4322 /* unmapped character */
4324 l1
= decode
[i
] >> 11;
4325 l2
= decode
[i
] >> 7;
4326 if (level1
[l1
] == 0xFF)
4327 level1
[l1
] = count2
++;
4328 if (level2
[l2
] == 0xFF)
4329 level2
[l2
] = count3
++;
4332 if (count2
>= 0xFF || count3
>= 0xFF)
4336 PyObject
*result
= PyDict_New();
4337 PyObject
*key
, *value
;
4340 for (i
= 0; i
< 256; i
++) {
4342 key
= PyInt_FromLong(decode
[i
]);
4343 value
= PyInt_FromLong(i
);
4346 if (PyDict_SetItem(result
, key
, value
) == -1)
4359 /* Create a three-level trie */
4360 result
= PyObject_MALLOC(sizeof(struct encoding_map
) +
4361 16*count2
+ 128*count3
- 1);
4363 return PyErr_NoMemory();
4364 PyObject_Init(result
, &EncodingMapType
);
4365 mresult
= (struct encoding_map
*)result
;
4366 mresult
->count2
= count2
;
4367 mresult
->count3
= count3
;
4368 mlevel1
= mresult
->level1
;
4369 mlevel2
= mresult
->level23
;
4370 mlevel3
= mresult
->level23
+ 16*count2
;
4371 memcpy(mlevel1
, level1
, 32);
4372 memset(mlevel2
, 0xFF, 16*count2
);
4373 memset(mlevel3
, 0, 128*count3
);
4375 for (i
= 1; i
< 256; i
++) {
4376 int o1
, o2
, o3
, i2
, i3
;
4377 if (decode
[i
] == 0xFFFE)
4378 /* unmapped character */
4381 o2
= (decode
[i
]>>7) & 0xF;
4382 i2
= 16*mlevel1
[o1
] + o2
;
4383 if (mlevel2
[i2
] == 0xFF)
4384 mlevel2
[i2
] = count3
++;
4385 o3
= decode
[i
] & 0x7F;
4386 i3
= 128*mlevel2
[i2
] + o3
;
4393 encoding_map_lookup(Py_UNICODE c
, PyObject
*mapping
)
4395 struct encoding_map
*map
= (struct encoding_map
*)mapping
;
4397 int l2
= (c
>>7) & 0xF;
4401 #ifdef Py_UNICODE_WIDE
4409 i
= map
->level1
[l1
];
4414 i
= map
->level23
[16*i
+l2
];
4419 i
= map
->level23
[16*map
->count2
+ 128*i
+ l3
];
4426 /* Lookup the character ch in the mapping. If the character
4427 can't be found, Py_None is returned (or NULL, if another
4429 static PyObject
*charmapencode_lookup(Py_UNICODE c
, PyObject
*mapping
)
4431 PyObject
*w
= PyInt_FromLong((long)c
);
4436 x
= PyObject_GetItem(mapping
, w
);
4439 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4440 /* No mapping found means: mapping is undefined. */
4448 else if (x
== Py_None
)
4450 else if (PyInt_Check(x
)) {
4451 long value
= PyInt_AS_LONG(x
);
4452 if (value
< 0 || value
> 255) {
4453 PyErr_SetString(PyExc_TypeError
,
4454 "character mapping must be in range(256)");
4460 else if (PyString_Check(x
))
4463 /* wrong return value */
4464 PyErr_SetString(PyExc_TypeError
,
4465 "character mapping must return integer, None or str");
4472 charmapencode_resize(PyObject
**outobj
, Py_ssize_t
*outpos
, Py_ssize_t requiredsize
)
4474 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4475 /* exponentially overallocate to minimize reallocations */
4476 if (requiredsize
< 2*outsize
)
4477 requiredsize
= 2*outsize
;
4478 if (_PyString_Resize(outobj
, requiredsize
)) {
4484 typedef enum charmapencode_result
{
4485 enc_SUCCESS
, enc_FAILED
, enc_EXCEPTION
4486 }charmapencode_result
;
4487 /* lookup the character, put the result in the output string and adjust
4488 various state variables. Reallocate the output string if not enough
4489 space is available. Return enc_SUCCESS if the character was written,
4490 enc_FAILED if the mapping was undefined (in which case no character
4491 was written) or enc_EXCEPTION if an error occurred (for example, a
4492 failed reallocation). */
4494 charmapencode_result
charmapencode_output(Py_UNICODE c
, PyObject
*mapping
,
4495 PyObject
**outobj
, Py_ssize_t
*outpos
)
4499 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4501 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4502 int res
= encoding_map_lookup(c
, mapping
);
4503 Py_ssize_t requiredsize
= *outpos
+1;
4506 if (outsize
<requiredsize
)
4507 if (!charmapencode_resize(outobj
, outpos
, requiredsize
))
4508 return enc_EXCEPTION
;
4509 outstart
= PyString_AS_STRING(*outobj
);
4510 outstart
[(*outpos
)++] = (char)res
;
4514 rep
= charmapencode_lookup(c
, mapping
);
4516 return enc_EXCEPTION
;
4517 else if (rep
==Py_None
) {
4521 if (PyInt_Check(rep
)) {
4522 Py_ssize_t requiredsize
= *outpos
+1;
4523 if (outsize
<requiredsize
)
4524 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4526 return enc_EXCEPTION
;
4528 outstart
= PyString_AS_STRING(*outobj
);
4529 outstart
[(*outpos
)++] = (char)PyInt_AS_LONG(rep
);
4532 const char *repchars
= PyString_AS_STRING(rep
);
4533 Py_ssize_t repsize
= PyString_GET_SIZE(rep
);
4534 Py_ssize_t requiredsize
= *outpos
+repsize
;
4535 if (outsize
<requiredsize
)
4536 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4538 return enc_EXCEPTION
;
4540 outstart
= PyString_AS_STRING(*outobj
);
4541 memcpy(outstart
+ *outpos
, repchars
, repsize
);
4549 /* handle an error in PyUnicode_EncodeCharmap
4550 Return 0 on success, -1 on error */
4552 int charmap_encoding_error(
4553 const Py_UNICODE
*p
, Py_ssize_t size
, Py_ssize_t
*inpos
, PyObject
*mapping
,
4554 PyObject
**exceptionObject
,
4555 int *known_errorHandler
, PyObject
**errorHandler
, const char *errors
,
4556 PyObject
**res
, Py_ssize_t
*respos
)
4558 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
4562 /* startpos for collecting unencodable chars */
4563 Py_ssize_t collstartpos
= *inpos
;
4564 Py_ssize_t collendpos
= *inpos
+1;
4566 char *encoding
= "charmap";
4567 char *reason
= "character maps to <undefined>";
4568 charmapencode_result x
;
4570 /* find all unencodable characters */
4571 while (collendpos
< size
) {
4573 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4574 int res
= encoding_map_lookup(p
[collendpos
], mapping
);
4581 rep
= charmapencode_lookup(p
[collendpos
], mapping
);
4584 else if (rep
!=Py_None
) {
4591 /* cache callback name lookup
4592 * (if not done yet, i.e. it's the first error) */
4593 if (*known_errorHandler
==-1) {
4594 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4595 *known_errorHandler
= 1;
4596 else if (!strcmp(errors
, "replace"))
4597 *known_errorHandler
= 2;
4598 else if (!strcmp(errors
, "ignore"))
4599 *known_errorHandler
= 3;
4600 else if (!strcmp(errors
, "xmlcharrefreplace"))
4601 *known_errorHandler
= 4;
4603 *known_errorHandler
= 0;
4605 switch (*known_errorHandler
) {
4606 case 1: /* strict */
4607 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4609 case 2: /* replace */
4610 for (collpos
= collstartpos
; collpos
<collendpos
; ++collpos
) {
4611 x
= charmapencode_output('?', mapping
, res
, respos
);
4612 if (x
==enc_EXCEPTION
) {
4615 else if (x
==enc_FAILED
) {
4616 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4621 case 3: /* ignore */
4622 *inpos
= collendpos
;
4624 case 4: /* xmlcharrefreplace */
4625 /* generate replacement for each collected character */
4626 for (collpos
= collstartpos
; collpos
< collendpos
; ++collpos
) {
4627 char buffer
[2+29+1+1];
4629 sprintf(buffer
, "&#%d;", (int)p
[collpos
]);
4630 for (cp
= buffer
; *cp
; ++cp
) {
4631 x
= charmapencode_output(*cp
, mapping
, res
, respos
);
4632 if (x
==enc_EXCEPTION
)
4634 else if (x
==enc_FAILED
) {
4635 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4640 *inpos
= collendpos
;
4643 repunicode
= unicode_encode_call_errorhandler(errors
, errorHandler
,
4644 encoding
, reason
, p
, size
, exceptionObject
,
4645 collstartpos
, collendpos
, &newpos
);
4646 if (repunicode
== NULL
)
4648 /* generate replacement */
4649 repsize
= PyUnicode_GET_SIZE(repunicode
);
4650 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
4651 x
= charmapencode_output(*uni2
, mapping
, res
, respos
);
4652 if (x
==enc_EXCEPTION
) {
4655 else if (x
==enc_FAILED
) {
4656 Py_DECREF(repunicode
);
4657 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4662 Py_DECREF(repunicode
);
4667 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
4673 PyObject
*res
= NULL
;
4674 /* current input position */
4675 Py_ssize_t inpos
= 0;
4676 /* current output position */
4677 Py_ssize_t respos
= 0;
4678 PyObject
*errorHandler
= NULL
;
4679 PyObject
*exc
= NULL
;
4680 /* the following variable is used for caching string comparisons
4681 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4682 * 3=ignore, 4=xmlcharrefreplace */
4683 int known_errorHandler
= -1;
4685 /* Default to Latin-1 */
4686 if (mapping
== NULL
)
4687 return PyUnicode_EncodeLatin1(p
, size
, errors
);
4689 /* allocate enough for a simple encoding without
4690 replacements, if we need more, we'll resize */
4691 res
= PyString_FromStringAndSize(NULL
, size
);
4697 while (inpos
<size
) {
4698 /* try to encode it */
4699 charmapencode_result x
= charmapencode_output(p
[inpos
], mapping
, &res
, &respos
);
4700 if (x
==enc_EXCEPTION
) /* error */
4702 if (x
==enc_FAILED
) { /* unencodable character */
4703 if (charmap_encoding_error(p
, size
, &inpos
, mapping
,
4705 &known_errorHandler
, &errorHandler
, errors
,
4711 /* done with this character => adjust input position */
4715 /* Resize if we allocated too much */
4716 if (respos
<PyString_GET_SIZE(res
)) {
4717 if (_PyString_Resize(&res
, respos
))
4721 Py_XDECREF(errorHandler
);
4727 Py_XDECREF(errorHandler
);
4731 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
4734 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
4735 PyErr_BadArgument();
4738 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
4739 PyUnicode_GET_SIZE(unicode
),
4744 /* create or adjust a UnicodeTranslateError */
4745 static void make_translate_exception(PyObject
**exceptionObject
,
4746 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4747 Py_ssize_t startpos
, Py_ssize_t endpos
,
4750 if (*exceptionObject
== NULL
) {
4751 *exceptionObject
= PyUnicodeTranslateError_Create(
4752 unicode
, size
, startpos
, endpos
, reason
);
4755 if (PyUnicodeTranslateError_SetStart(*exceptionObject
, startpos
))
4757 if (PyUnicodeTranslateError_SetEnd(*exceptionObject
, endpos
))
4759 if (PyUnicodeTranslateError_SetReason(*exceptionObject
, reason
))
4763 Py_DECREF(*exceptionObject
);
4764 *exceptionObject
= NULL
;
4768 /* raises a UnicodeTranslateError */
4769 static void raise_translate_exception(PyObject
**exceptionObject
,
4770 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4771 Py_ssize_t startpos
, Py_ssize_t endpos
,
4774 make_translate_exception(exceptionObject
,
4775 unicode
, size
, startpos
, endpos
, reason
);
4776 if (*exceptionObject
!= NULL
)
4777 PyCodec_StrictErrors(*exceptionObject
);
4780 /* error handling callback helper:
4781 build arguments, call the callback and check its return value,
4782 put the result into newpos and return the replacement string, which
4783 has to be freed by the caller */
4784 static PyObject
*unicode_translate_call_errorhandler(const char *errors
,
4785 PyObject
**errorHandler
,
4787 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
4788 Py_ssize_t startpos
, Py_ssize_t endpos
,
4791 static char *argparse
= "O!n;translating error handler must return (unicode, int) tuple";
4793 Py_ssize_t i_newpos
;
4795 PyObject
*resunicode
;
4797 if (*errorHandler
== NULL
) {
4798 *errorHandler
= PyCodec_LookupError(errors
);
4799 if (*errorHandler
== NULL
)
4803 make_translate_exception(exceptionObject
,
4804 unicode
, size
, startpos
, endpos
, reason
);
4805 if (*exceptionObject
== NULL
)
4808 restuple
= PyObject_CallFunctionObjArgs(
4809 *errorHandler
, *exceptionObject
, NULL
);
4810 if (restuple
== NULL
)
4812 if (!PyTuple_Check(restuple
)) {
4813 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
4814 Py_DECREF(restuple
);
4817 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
4818 &resunicode
, &i_newpos
)) {
4819 Py_DECREF(restuple
);
4823 *newpos
= size
+i_newpos
;
4826 if (*newpos
<0 || *newpos
>size
) {
4827 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
4828 Py_DECREF(restuple
);
4831 Py_INCREF(resunicode
);
4832 Py_DECREF(restuple
);
4836 /* Lookup the character ch in the mapping and put the result in result,
4837 which must be decrefed by the caller.
4838 Return 0 on success, -1 on error */
4840 int charmaptranslate_lookup(Py_UNICODE c
, PyObject
*mapping
, PyObject
**result
)
4842 PyObject
*w
= PyInt_FromLong((long)c
);
4847 x
= PyObject_GetItem(mapping
, w
);
4850 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4851 /* No mapping found means: use 1:1 mapping. */
4858 else if (x
== Py_None
) {
4862 else if (PyInt_Check(x
)) {
4863 long value
= PyInt_AS_LONG(x
);
4864 long max
= PyUnicode_GetMax();
4865 if (value
< 0 || value
> max
) {
4866 PyErr_Format(PyExc_TypeError
,
4867 "character mapping must be in range(0x%lx)", max
+1);
4874 else if (PyUnicode_Check(x
)) {
4879 /* wrong return value */
4880 PyErr_SetString(PyExc_TypeError
,
4881 "character mapping must return integer, None or unicode");
4886 /* ensure that *outobj is at least requiredsize characters long,
4887 if not reallocate and adjust various state variables.
4888 Return 0 on success, -1 on error */
4890 int charmaptranslate_makespace(PyObject
**outobj
, Py_UNICODE
**outp
,
4891 Py_ssize_t requiredsize
)
4893 Py_ssize_t oldsize
= PyUnicode_GET_SIZE(*outobj
);
4894 if (requiredsize
> oldsize
) {
4895 /* remember old output position */
4896 Py_ssize_t outpos
= *outp
-PyUnicode_AS_UNICODE(*outobj
);
4897 /* exponentially overallocate to minimize reallocations */
4898 if (requiredsize
< 2 * oldsize
)
4899 requiredsize
= 2 * oldsize
;
4900 if (PyUnicode_Resize(outobj
, requiredsize
) < 0)
4902 *outp
= PyUnicode_AS_UNICODE(*outobj
) + outpos
;
4906 /* lookup the character, put the result in the output string and adjust
4907 various state variables. Return a new reference to the object that
4908 was put in the output buffer in *result, or Py_None, if the mapping was
4909 undefined (in which case no character was written).
4910 The caller must decref result.
4911 Return 0 on success, -1 on error. */
4913 int charmaptranslate_output(const Py_UNICODE
*startinp
, const Py_UNICODE
*curinp
,
4914 Py_ssize_t insize
, PyObject
*mapping
, PyObject
**outobj
, Py_UNICODE
**outp
,
4917 if (charmaptranslate_lookup(*curinp
, mapping
, res
))
4920 /* not found => default to 1:1 mapping */
4921 *(*outp
)++ = *curinp
;
4923 else if (*res
==Py_None
)
4925 else if (PyInt_Check(*res
)) {
4926 /* no overflow check, because we know that the space is enough */
4927 *(*outp
)++ = (Py_UNICODE
)PyInt_AS_LONG(*res
);
4929 else if (PyUnicode_Check(*res
)) {
4930 Py_ssize_t repsize
= PyUnicode_GET_SIZE(*res
);
4932 /* no overflow check, because we know that the space is enough */
4933 *(*outp
)++ = *PyUnicode_AS_UNICODE(*res
);
4935 else if (repsize
!=0) {
4936 /* more than one character */
4937 Py_ssize_t requiredsize
= (*outp
-PyUnicode_AS_UNICODE(*outobj
)) +
4938 (insize
- (curinp
-startinp
)) +
4940 if (charmaptranslate_makespace(outobj
, outp
, requiredsize
))
4942 memcpy(*outp
, PyUnicode_AS_UNICODE(*res
), sizeof(Py_UNICODE
)*repsize
);
4951 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*p
,
4957 PyObject
*res
= NULL
;
4958 /* pointers to the beginning and end+1 of input */
4959 const Py_UNICODE
*startp
= p
;
4960 const Py_UNICODE
*endp
= p
+ size
;
4961 /* pointer into the output */
4963 /* current output position */
4964 Py_ssize_t respos
= 0;
4965 char *reason
= "character maps to <undefined>";
4966 PyObject
*errorHandler
= NULL
;
4967 PyObject
*exc
= NULL
;
4968 /* the following variable is used for caching string comparisons
4969 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4970 * 3=ignore, 4=xmlcharrefreplace */
4971 int known_errorHandler
= -1;
4973 if (mapping
== NULL
) {
4974 PyErr_BadArgument();
4978 /* allocate enough for a simple 1:1 translation without
4979 replacements, if we need more, we'll resize */
4980 res
= PyUnicode_FromUnicode(NULL
, size
);
4985 str
= PyUnicode_AS_UNICODE(res
);
4988 /* try to translate it */
4990 if (charmaptranslate_output(startp
, p
, size
, mapping
, &res
, &str
, &x
)) {
4995 if (x
!=Py_None
) /* it worked => adjust input pointer */
4997 else { /* untranslatable character */
4998 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
5002 /* startpos for collecting untranslatable chars */
5003 const Py_UNICODE
*collstart
= p
;
5004 const Py_UNICODE
*collend
= p
+1;
5005 const Py_UNICODE
*coll
;
5007 /* find all untranslatable characters */
5008 while (collend
< endp
) {
5009 if (charmaptranslate_lookup(*collend
, mapping
, &x
))
5016 /* cache callback name lookup
5017 * (if not done yet, i.e. it's the first error) */
5018 if (known_errorHandler
==-1) {
5019 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
5020 known_errorHandler
= 1;
5021 else if (!strcmp(errors
, "replace"))
5022 known_errorHandler
= 2;
5023 else if (!strcmp(errors
, "ignore"))
5024 known_errorHandler
= 3;
5025 else if (!strcmp(errors
, "xmlcharrefreplace"))
5026 known_errorHandler
= 4;
5028 known_errorHandler
= 0;
5030 switch (known_errorHandler
) {
5031 case 1: /* strict */
5032 raise_translate_exception(&exc
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
5034 case 2: /* replace */
5035 /* No need to check for space, this is a 1:1 replacement */
5036 for (coll
= collstart
; coll
<collend
; ++coll
)
5039 case 3: /* ignore */
5042 case 4: /* xmlcharrefreplace */
5043 /* generate replacement (temporarily (mis)uses p) */
5044 for (p
= collstart
; p
< collend
; ++p
) {
5045 char buffer
[2+29+1+1];
5047 sprintf(buffer
, "&#%d;", (int)*p
);
5048 if (charmaptranslate_makespace(&res
, &str
,
5049 (str
-PyUnicode_AS_UNICODE(res
))+strlen(buffer
)+(endp
-collend
)))
5051 for (cp
= buffer
; *cp
; ++cp
)
5057 repunicode
= unicode_translate_call_errorhandler(errors
, &errorHandler
,
5058 reason
, startp
, size
, &exc
,
5059 collstart
-startp
, collend
-startp
, &newpos
);
5060 if (repunicode
== NULL
)
5062 /* generate replacement */
5063 repsize
= PyUnicode_GET_SIZE(repunicode
);
5064 if (charmaptranslate_makespace(&res
, &str
,
5065 (str
-PyUnicode_AS_UNICODE(res
))+repsize
+(endp
-collend
))) {
5066 Py_DECREF(repunicode
);
5069 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
)
5071 p
= startp
+ newpos
;
5072 Py_DECREF(repunicode
);
5076 /* Resize if we allocated too much */
5077 respos
= str
-PyUnicode_AS_UNICODE(res
);
5078 if (respos
<PyUnicode_GET_SIZE(res
)) {
5079 if (PyUnicode_Resize(&res
, respos
) < 0)
5083 Py_XDECREF(errorHandler
);
5089 Py_XDECREF(errorHandler
);
5093 PyObject
*PyUnicode_Translate(PyObject
*str
,
5099 str
= PyUnicode_FromObject(str
);
5102 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
5103 PyUnicode_GET_SIZE(str
),
5114 /* --- Decimal Encoder ---------------------------------------------------- */
5116 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
5121 Py_UNICODE
*p
, *end
;
5122 PyObject
*errorHandler
= NULL
;
5123 PyObject
*exc
= NULL
;
5124 const char *encoding
= "decimal";
5125 const char *reason
= "invalid decimal Unicode string";
5126 /* the following variable is used for caching string comparisons
5127 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5128 int known_errorHandler
= -1;
5130 if (output
== NULL
) {
5131 PyErr_BadArgument();
5138 register Py_UNICODE ch
= *p
;
5140 PyObject
*repunicode
;
5144 Py_UNICODE
*collstart
;
5145 Py_UNICODE
*collend
;
5147 if (Py_UNICODE_ISSPACE(ch
)) {
5152 decimal
= Py_UNICODE_TODECIMAL(ch
);
5154 *output
++ = '0' + decimal
;
5158 if (0 < ch
&& ch
< 256) {
5159 *output
++ = (char)ch
;
5163 /* All other characters are considered unencodable */
5166 while (collend
< end
) {
5167 if ((0 < *collend
&& *collend
< 256) ||
5168 !Py_UNICODE_ISSPACE(*collend
) ||
5169 Py_UNICODE_TODECIMAL(*collend
))
5172 /* cache callback name lookup
5173 * (if not done yet, i.e. it's the first error) */
5174 if (known_errorHandler
==-1) {
5175 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
5176 known_errorHandler
= 1;
5177 else if (!strcmp(errors
, "replace"))
5178 known_errorHandler
= 2;
5179 else if (!strcmp(errors
, "ignore"))
5180 known_errorHandler
= 3;
5181 else if (!strcmp(errors
, "xmlcharrefreplace"))
5182 known_errorHandler
= 4;
5184 known_errorHandler
= 0;
5186 switch (known_errorHandler
) {
5187 case 1: /* strict */
5188 raise_encode_exception(&exc
, encoding
, s
, length
, collstart
-s
, collend
-s
, reason
);
5190 case 2: /* replace */
5191 for (p
= collstart
; p
< collend
; ++p
)
5194 case 3: /* ignore */
5197 case 4: /* xmlcharrefreplace */
5198 /* generate replacement (temporarily (mis)uses p) */
5199 for (p
= collstart
; p
< collend
; ++p
)
5200 output
+= sprintf(output
, "&#%d;", (int)*p
);
5204 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
5205 encoding
, reason
, s
, length
, &exc
,
5206 collstart
-s
, collend
-s
, &newpos
);
5207 if (repunicode
== NULL
)
5209 /* generate replacement */
5210 repsize
= PyUnicode_GET_SIZE(repunicode
);
5211 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
5212 Py_UNICODE ch
= *uni2
;
5213 if (Py_UNICODE_ISSPACE(ch
))
5216 decimal
= Py_UNICODE_TODECIMAL(ch
);
5218 *output
++ = '0' + decimal
;
5219 else if (0 < ch
&& ch
< 256)
5220 *output
++ = (char)ch
;
5222 Py_DECREF(repunicode
);
5223 raise_encode_exception(&exc
, encoding
,
5224 s
, length
, collstart
-s
, collend
-s
, reason
);
5230 Py_DECREF(repunicode
);
5233 /* 0-terminate the output string */
5236 Py_XDECREF(errorHandler
);
5241 Py_XDECREF(errorHandler
);
5245 /* --- Helpers ------------------------------------------------------------ */
5247 #include "stringlib/unicodedefs.h"
5249 #define FROM_UNICODE
5251 #include "stringlib/fastsearch.h"
5253 #include "stringlib/count.h"
5254 #include "stringlib/find.h"
5255 #include "stringlib/partition.h"
5257 /* helper macro to fixup start/end slice values */
5258 #define FIX_START_END(obj) \
5260 start += (obj)->length; \
5263 if (end > (obj)->length) \
5264 end = (obj)->length; \
5266 end += (obj)->length; \
5270 Py_ssize_t
PyUnicode_Count(PyObject
*str
,
5276 PyUnicodeObject
* str_obj
;
5277 PyUnicodeObject
* sub_obj
;
5279 str_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(str
);
5282 sub_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(substr
);
5288 FIX_START_END(str_obj
);
5290 result
= stringlib_count(
5291 str_obj
->str
+ start
, end
- start
, sub_obj
->str
, sub_obj
->length
5300 Py_ssize_t
PyUnicode_Find(PyObject
*str
,
5308 str
= PyUnicode_FromObject(str
);
5311 sub
= PyUnicode_FromObject(sub
);
5318 result
= stringlib_find_slice(
5319 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5320 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5324 result
= stringlib_rfind_slice(
5325 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5326 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5337 int tailmatch(PyUnicodeObject
*self
,
5338 PyUnicodeObject
*substring
,
5343 if (substring
->length
== 0)
5346 FIX_START_END(self
);
5348 end
-= substring
->length
;
5352 if (direction
> 0) {
5353 if (Py_UNICODE_MATCH(self
, end
, substring
))
5356 if (Py_UNICODE_MATCH(self
, start
, substring
))
5363 Py_ssize_t
PyUnicode_Tailmatch(PyObject
*str
,
5371 str
= PyUnicode_FromObject(str
);
5374 substr
= PyUnicode_FromObject(substr
);
5375 if (substr
== NULL
) {
5380 result
= tailmatch((PyUnicodeObject
*)str
,
5381 (PyUnicodeObject
*)substr
,
5382 start
, end
, direction
);
5388 /* Apply fixfct filter to the Unicode object self and return a
5389 reference to the modified object */
5392 PyObject
*fixup(PyUnicodeObject
*self
,
5393 int (*fixfct
)(PyUnicodeObject
*s
))
5398 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5402 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5404 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
5405 /* fixfct should return TRUE if it modified the buffer. If
5406 FALSE, return a reference to the original buffer instead
5407 (to save space, not time) */
5410 return (PyObject
*) self
;
5412 return (PyObject
*) u
;
5416 int fixupper(PyUnicodeObject
*self
)
5418 Py_ssize_t len
= self
->length
;
5419 Py_UNICODE
*s
= self
->str
;
5423 register Py_UNICODE ch
;
5425 ch
= Py_UNICODE_TOUPPER(*s
);
5437 int fixlower(PyUnicodeObject
*self
)
5439 Py_ssize_t len
= self
->length
;
5440 Py_UNICODE
*s
= self
->str
;
5444 register Py_UNICODE ch
;
5446 ch
= Py_UNICODE_TOLOWER(*s
);
5458 int fixswapcase(PyUnicodeObject
*self
)
5460 Py_ssize_t len
= self
->length
;
5461 Py_UNICODE
*s
= self
->str
;
5465 if (Py_UNICODE_ISUPPER(*s
)) {
5466 *s
= Py_UNICODE_TOLOWER(*s
);
5468 } else if (Py_UNICODE_ISLOWER(*s
)) {
5469 *s
= Py_UNICODE_TOUPPER(*s
);
5479 int fixcapitalize(PyUnicodeObject
*self
)
5481 Py_ssize_t len
= self
->length
;
5482 Py_UNICODE
*s
= self
->str
;
5487 if (Py_UNICODE_ISLOWER(*s
)) {
5488 *s
= Py_UNICODE_TOUPPER(*s
);
5493 if (Py_UNICODE_ISUPPER(*s
)) {
5494 *s
= Py_UNICODE_TOLOWER(*s
);
5503 int fixtitle(PyUnicodeObject
*self
)
5505 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5506 register Py_UNICODE
*e
;
5507 int previous_is_cased
;
5509 /* Shortcut for single character strings */
5510 if (PyUnicode_GET_SIZE(self
) == 1) {
5511 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
5520 e
= p
+ PyUnicode_GET_SIZE(self
);
5521 previous_is_cased
= 0;
5522 for (; p
< e
; p
++) {
5523 register const Py_UNICODE ch
= *p
;
5525 if (previous_is_cased
)
5526 *p
= Py_UNICODE_TOLOWER(ch
);
5528 *p
= Py_UNICODE_TOTITLE(ch
);
5530 if (Py_UNICODE_ISLOWER(ch
) ||
5531 Py_UNICODE_ISUPPER(ch
) ||
5532 Py_UNICODE_ISTITLE(ch
))
5533 previous_is_cased
= 1;
5535 previous_is_cased
= 0;
5541 PyUnicode_Join(PyObject
*separator
, PyObject
*seq
)
5543 PyObject
*internal_separator
= NULL
;
5544 const Py_UNICODE blank
= ' ';
5545 const Py_UNICODE
*sep
= &blank
;
5546 Py_ssize_t seplen
= 1;
5547 PyUnicodeObject
*res
= NULL
; /* the result */
5548 Py_ssize_t res_alloc
= 100; /* # allocated bytes for string in res */
5549 Py_ssize_t res_used
; /* # used bytes */
5550 Py_UNICODE
*res_p
; /* pointer to free byte in res's string area */
5551 PyObject
*fseq
; /* PySequence_Fast(seq) */
5552 Py_ssize_t seqlen
; /* len(fseq) -- number of items in sequence */
5556 fseq
= PySequence_Fast(seq
, "");
5561 /* Grrrr. A codec may be invoked to convert str objects to
5562 * Unicode, and so it's possible to call back into Python code
5563 * during PyUnicode_FromObject(), and so it's possible for a sick
5564 * codec to change the size of fseq (if seq is a list). Therefore
5565 * we have to keep refetching the size -- can't assume seqlen
5568 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5569 /* If empty sequence, return u"". */
5571 res
= _PyUnicode_New(0); /* empty sequence; return u"" */
5574 /* If singleton sequence with an exact Unicode, return that. */
5576 item
= PySequence_Fast_GET_ITEM(fseq
, 0);
5577 if (PyUnicode_CheckExact(item
)) {
5579 res
= (PyUnicodeObject
*)item
;
5584 /* At least two items to join, or one that isn't exact Unicode. */
5586 /* Set up sep and seplen -- they're needed. */
5587 if (separator
== NULL
) {
5592 internal_separator
= PyUnicode_FromObject(separator
);
5593 if (internal_separator
== NULL
)
5595 sep
= PyUnicode_AS_UNICODE(internal_separator
);
5596 seplen
= PyUnicode_GET_SIZE(internal_separator
);
5597 /* In case PyUnicode_FromObject() mutated seq. */
5598 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5603 res
= _PyUnicode_New(res_alloc
);
5606 res_p
= PyUnicode_AS_UNICODE(res
);
5609 for (i
= 0; i
< seqlen
; ++i
) {
5611 Py_ssize_t new_res_used
;
5613 item
= PySequence_Fast_GET_ITEM(fseq
, i
);
5614 /* Convert item to Unicode. */
5615 if (! PyUnicode_Check(item
) && ! PyString_Check(item
)) {
5616 PyErr_Format(PyExc_TypeError
,
5617 "sequence item %zd: expected string or Unicode,"
5619 i
, Py_TYPE(item
)->tp_name
);
5622 item
= PyUnicode_FromObject(item
);
5625 /* We own a reference to item from here on. */
5627 /* In case PyUnicode_FromObject() mutated seq. */
5628 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5630 /* Make sure we have enough space for the separator and the item. */
5631 itemlen
= PyUnicode_GET_SIZE(item
);
5632 new_res_used
= res_used
+ itemlen
;
5633 if (new_res_used
< 0)
5635 if (i
< seqlen
- 1) {
5636 new_res_used
+= seplen
;
5637 if (new_res_used
< 0)
5640 if (new_res_used
> res_alloc
) {
5641 /* double allocated size until it's big enough */
5643 res_alloc
+= res_alloc
;
5646 } while (new_res_used
> res_alloc
);
5647 if (_PyUnicode_Resize(&res
, res_alloc
) < 0) {
5651 res_p
= PyUnicode_AS_UNICODE(res
) + res_used
;
5654 /* Copy item, and maybe the separator. */
5655 Py_UNICODE_COPY(res_p
, PyUnicode_AS_UNICODE(item
), itemlen
);
5657 if (i
< seqlen
- 1) {
5658 Py_UNICODE_COPY(res_p
, sep
, seplen
);
5662 res_used
= new_res_used
;
5665 /* Shrink res to match the used area; this probably can't fail,
5666 * but it's cheap to check.
5668 if (_PyUnicode_Resize(&res
, res_used
) < 0)
5672 Py_XDECREF(internal_separator
);
5674 return (PyObject
*)res
;
5677 PyErr_SetString(PyExc_OverflowError
,
5678 "join() result is too long for a Python string");
5683 Py_XDECREF(internal_separator
);
5690 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
5702 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
5707 if (left
> PY_SSIZE_T_MAX
- self
->length
||
5708 right
> PY_SSIZE_T_MAX
- (left
+ self
->length
)) {
5709 PyErr_SetString(PyExc_OverflowError
, "padded string is too long");
5712 u
= _PyUnicode_New(left
+ self
->length
+ right
);
5715 Py_UNICODE_FILL(u
->str
, fill
, left
);
5716 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
5718 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
5724 #define SPLIT_APPEND(data, left, right) \
5725 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
5728 if (PyList_Append(list, str)) { \
5736 PyObject
*split_whitespace(PyUnicodeObject
*self
,
5738 Py_ssize_t maxcount
)
5740 register Py_ssize_t i
;
5741 register Py_ssize_t j
;
5742 Py_ssize_t len
= self
->length
;
5744 register const Py_UNICODE
*buf
= self
->str
;
5746 for (i
= j
= 0; i
< len
; ) {
5748 while (i
< len
&& Py_UNICODE_ISSPACE(buf
[i
]))
5751 while (i
< len
&& !Py_UNICODE_ISSPACE(buf
[i
]))
5754 if (maxcount
-- <= 0)
5756 SPLIT_APPEND(buf
, j
, i
);
5757 while (i
< len
&& Py_UNICODE_ISSPACE(buf
[i
]))
5763 SPLIT_APPEND(buf
, j
, len
);
5772 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
5775 register Py_ssize_t i
;
5776 register Py_ssize_t j
;
5782 string
= PyUnicode_FromObject(string
);
5785 data
= PyUnicode_AS_UNICODE(string
);
5786 len
= PyUnicode_GET_SIZE(string
);
5788 list
= PyList_New(0);
5792 for (i
= j
= 0; i
< len
; ) {
5795 /* Find a line and append it */
5796 while (i
< len
&& !BLOOM_LINEBREAK(data
[i
]))
5799 /* Skip the line break reading CRLF as one line break */
5802 if (data
[i
] == '\r' && i
+ 1 < len
&&
5810 SPLIT_APPEND(data
, j
, eol
);
5814 SPLIT_APPEND(data
, j
, len
);
5827 PyObject
*split_char(PyUnicodeObject
*self
,
5830 Py_ssize_t maxcount
)
5832 register Py_ssize_t i
;
5833 register Py_ssize_t j
;
5834 Py_ssize_t len
= self
->length
;
5836 register const Py_UNICODE
*buf
= self
->str
;
5838 for (i
= j
= 0; i
< len
; ) {
5840 if (maxcount
-- <= 0)
5842 SPLIT_APPEND(buf
, j
, i
);
5848 SPLIT_APPEND(buf
, j
, len
);
5858 PyObject
*split_substring(PyUnicodeObject
*self
,
5860 PyUnicodeObject
*substring
,
5861 Py_ssize_t maxcount
)
5863 register Py_ssize_t i
;
5864 register Py_ssize_t j
;
5865 Py_ssize_t len
= self
->length
;
5866 Py_ssize_t sublen
= substring
->length
;
5869 for (i
= j
= 0; i
<= len
- sublen
; ) {
5870 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
5871 if (maxcount
-- <= 0)
5873 SPLIT_APPEND(self
->str
, j
, i
);
5879 SPLIT_APPEND(self
->str
, j
, len
);
5889 PyObject
*rsplit_whitespace(PyUnicodeObject
*self
,
5891 Py_ssize_t maxcount
)
5893 register Py_ssize_t i
;
5894 register Py_ssize_t j
;
5895 Py_ssize_t len
= self
->length
;
5897 register const Py_UNICODE
*buf
= self
->str
;
5899 for (i
= j
= len
- 1; i
>= 0; ) {
5901 while (i
>= 0 && Py_UNICODE_ISSPACE(buf
[i
]))
5904 while (i
>= 0 && !Py_UNICODE_ISSPACE(buf
[i
]))
5907 if (maxcount
-- <= 0)
5909 SPLIT_APPEND(buf
, i
+ 1, j
+ 1);
5910 while (i
>= 0 && Py_UNICODE_ISSPACE(buf
[i
]))
5916 SPLIT_APPEND(buf
, 0, j
+ 1);
5918 if (PyList_Reverse(list
) < 0)
5928 PyObject
*rsplit_char(PyUnicodeObject
*self
,
5931 Py_ssize_t maxcount
)
5933 register Py_ssize_t i
;
5934 register Py_ssize_t j
;
5935 Py_ssize_t len
= self
->length
;
5937 register const Py_UNICODE
*buf
= self
->str
;
5939 for (i
= j
= len
- 1; i
>= 0; ) {
5941 if (maxcount
-- <= 0)
5943 SPLIT_APPEND(buf
, i
+ 1, j
+ 1);
5949 SPLIT_APPEND(buf
, 0, j
+ 1);
5951 if (PyList_Reverse(list
) < 0)
5961 PyObject
*rsplit_substring(PyUnicodeObject
*self
,
5963 PyUnicodeObject
*substring
,
5964 Py_ssize_t maxcount
)
5966 register Py_ssize_t i
;
5967 register Py_ssize_t j
;
5968 Py_ssize_t len
= self
->length
;
5969 Py_ssize_t sublen
= substring
->length
;
5972 for (i
= len
- sublen
, j
= len
; i
>= 0; ) {
5973 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
5974 if (maxcount
-- <= 0)
5976 SPLIT_APPEND(self
->str
, i
+ sublen
, j
);
5983 SPLIT_APPEND(self
->str
, 0, j
);
5985 if (PyList_Reverse(list
) < 0)
5997 PyObject
*split(PyUnicodeObject
*self
,
5998 PyUnicodeObject
*substring
,
5999 Py_ssize_t maxcount
)
6004 maxcount
= PY_SSIZE_T_MAX
;
6006 list
= PyList_New(0);
6010 if (substring
== NULL
)
6011 return split_whitespace(self
,list
,maxcount
);
6013 else if (substring
->length
== 1)
6014 return split_char(self
,list
,substring
->str
[0],maxcount
);
6016 else if (substring
->length
== 0) {
6018 PyErr_SetString(PyExc_ValueError
, "empty separator");
6022 return split_substring(self
,list
,substring
,maxcount
);
6026 PyObject
*rsplit(PyUnicodeObject
*self
,
6027 PyUnicodeObject
*substring
,
6028 Py_ssize_t maxcount
)
6033 maxcount
= PY_SSIZE_T_MAX
;
6035 list
= PyList_New(0);
6039 if (substring
== NULL
)
6040 return rsplit_whitespace(self
,list
,maxcount
);
6042 else if (substring
->length
== 1)
6043 return rsplit_char(self
,list
,substring
->str
[0],maxcount
);
6045 else if (substring
->length
== 0) {
6047 PyErr_SetString(PyExc_ValueError
, "empty separator");
6051 return rsplit_substring(self
,list
,substring
,maxcount
);
6055 PyObject
*replace(PyUnicodeObject
*self
,
6056 PyUnicodeObject
*str1
,
6057 PyUnicodeObject
*str2
,
6058 Py_ssize_t maxcount
)
6063 maxcount
= PY_SSIZE_T_MAX
;
6065 if (str1
->length
== str2
->length
) {
6068 if (str1
->length
== 1) {
6069 /* replace characters */
6071 if (!findchar(self
->str
, self
->length
, str1
->str
[0]))
6073 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
6076 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
6079 for (i
= 0; i
< u
->length
; i
++)
6080 if (u
->str
[i
] == u1
) {
6087 self
->str
, self
->length
, str1
->str
, str1
->length
, FAST_SEARCH
6091 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
6094 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
6095 while (i
<= self
->length
- str1
->length
)
6096 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
6099 Py_UNICODE_COPY(u
->str
+i
, str2
->str
, str2
->length
);
6106 Py_ssize_t n
, i
, j
, e
;
6107 Py_ssize_t product
, new_size
, delta
;
6110 /* replace strings */
6111 n
= stringlib_count(self
->str
, self
->length
, str1
->str
, str1
->length
);
6116 /* new_size = self->length + n * (str2->length - str1->length)); */
6117 delta
= (str2
->length
- str1
->length
);
6119 new_size
= self
->length
;
6121 product
= n
* (str2
->length
- str1
->length
);
6122 if ((product
/ (str2
->length
- str1
->length
)) != n
) {
6123 PyErr_SetString(PyExc_OverflowError
,
6124 "replace string is too long");
6127 new_size
= self
->length
+ product
;
6129 PyErr_SetString(PyExc_OverflowError
,
6130 "replace string is too long");
6134 u
= _PyUnicode_New(new_size
);
6139 e
= self
->length
- str1
->length
;
6140 if (str1
->length
> 0) {
6142 /* look for next match */
6145 if (Py_UNICODE_MATCH(self
, j
, str1
))
6152 /* copy unchanged part [i:j] */
6153 Py_UNICODE_COPY(p
, self
->str
+i
, j
-i
);
6156 /* copy substitution string */
6157 if (str2
->length
> 0) {
6158 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
6161 i
= j
+ str1
->length
;
6163 if (i
< self
->length
)
6164 /* copy tail [i:] */
6165 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
6169 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
6173 *p
++ = self
->str
[i
++];
6175 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
6178 return (PyObject
*) u
;
6181 /* nothing to replace; return original string (when possible) */
6182 if (PyUnicode_CheckExact(self
)) {
6184 return (PyObject
*) self
;
6186 return PyUnicode_FromUnicode(self
->str
, self
->length
);
6189 /* --- Unicode Object Methods --------------------------------------------- */
6191 PyDoc_STRVAR(title__doc__
,
6192 "S.title() -> unicode\n\
6194 Return a titlecased version of S, i.e. words start with title case\n\
6195 characters, all remaining cased characters have lower case.");
6198 unicode_title(PyUnicodeObject
*self
)
6200 return fixup(self
, fixtitle
);
6203 PyDoc_STRVAR(capitalize__doc__
,
6204 "S.capitalize() -> unicode\n\
6206 Return a capitalized version of S, i.e. make the first character\n\
6210 unicode_capitalize(PyUnicodeObject
*self
)
6212 return fixup(self
, fixcapitalize
);
6216 PyDoc_STRVAR(capwords__doc__
,
6217 "S.capwords() -> unicode\n\
6219 Apply .capitalize() to all words in S and return the result with\n\
6220 normalized whitespace (all whitespace strings are replaced by ' ').");
6223 unicode_capwords(PyUnicodeObject
*self
)
6229 /* Split into words */
6230 list
= split(self
, NULL
, -1);
6234 /* Capitalize each word */
6235 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
6236 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
6240 Py_DECREF(PyList_GET_ITEM(list
, i
));
6241 PyList_SET_ITEM(list
, i
, item
);
6244 /* Join the words to form a new string */
6245 item
= PyUnicode_Join(NULL
, list
);
6249 return (PyObject
*)item
;
6253 /* Argument converter. Coerces to a single unicode character */
6256 convert_uc(PyObject
*obj
, void *addr
)
6258 Py_UNICODE
*fillcharloc
= (Py_UNICODE
*)addr
;
6262 uniobj
= PyUnicode_FromObject(obj
);
6263 if (uniobj
== NULL
) {
6264 PyErr_SetString(PyExc_TypeError
,
6265 "The fill character cannot be converted to Unicode");
6268 if (PyUnicode_GET_SIZE(uniobj
) != 1) {
6269 PyErr_SetString(PyExc_TypeError
,
6270 "The fill character must be exactly one character long");
6274 unistr
= PyUnicode_AS_UNICODE(uniobj
);
6275 *fillcharloc
= unistr
[0];
6280 PyDoc_STRVAR(center__doc__
,
6281 "S.center(width[, fillchar]) -> unicode\n\
6283 Return S centered in a Unicode string of length width. Padding is\n\
6284 done using the specified fill character (default is a space)");
6287 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
6289 Py_ssize_t marg
, left
;
6291 Py_UNICODE fillchar
= ' ';
6293 if (!PyArg_ParseTuple(args
, "n|O&:center", &width
, convert_uc
, &fillchar
))
6296 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6298 return (PyObject
*) self
;
6301 marg
= width
- self
->length
;
6302 left
= marg
/ 2 + (marg
& width
& 1);
6304 return (PyObject
*) pad(self
, left
, marg
- left
, fillchar
);
6309 /* This code should go into some future Unicode collation support
6310 module. The basic comparison should compare ordinals on a naive
6311 basis (this is what Java does and thus Jython too). */
6313 /* speedy UTF-16 code point order comparison */
6315 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6317 static short utf16Fixup
[32] =
6319 0, 0, 0, 0, 0, 0, 0, 0,
6320 0, 0, 0, 0, 0, 0, 0, 0,
6321 0, 0, 0, 0, 0, 0, 0, 0,
6322 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6326 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6328 Py_ssize_t len1
, len2
;
6330 Py_UNICODE
*s1
= str1
->str
;
6331 Py_UNICODE
*s2
= str2
->str
;
6333 len1
= str1
->length
;
6334 len2
= str2
->length
;
6336 while (len1
> 0 && len2
> 0) {
6342 if (c1
> (1<<11) * 26)
6343 c1
+= utf16Fixup
[c1
>>11];
6344 if (c2
> (1<<11) * 26)
6345 c2
+= utf16Fixup
[c2
>>11];
6346 /* now c1 and c2 are in UTF-32-compatible order */
6349 return (c1
< c2
) ? -1 : 1;
6354 return (len1
< len2
) ? -1 : (len1
!= len2
);
6360 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6362 register Py_ssize_t len1
, len2
;
6364 Py_UNICODE
*s1
= str1
->str
;
6365 Py_UNICODE
*s2
= str2
->str
;
6367 len1
= str1
->length
;
6368 len2
= str2
->length
;
6370 while (len1
> 0 && len2
> 0) {
6377 return (c1
< c2
) ? -1 : 1;
6382 return (len1
< len2
) ? -1 : (len1
!= len2
);
6387 int PyUnicode_Compare(PyObject
*left
,
6390 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
6393 /* Coerce the two arguments */
6394 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6397 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6401 /* Shortcut for empty or interned objects */
6408 result
= unicode_compare(u
, v
);
6420 PyObject
*PyUnicode_RichCompare(PyObject
*left
,
6426 result
= PyUnicode_Compare(left
, right
);
6427 if (result
== -1 && PyErr_Occurred())
6430 /* Convert the return value to a Boolean */
6433 result
= (result
== 0);
6436 result
= (result
!= 0);
6439 result
= (result
<= 0);
6442 result
= (result
>= 0);
6445 result
= (result
== -1);
6448 result
= (result
== 1);
6451 return PyBool_FromLong(result
);
6457 Type errors mean that PyUnicode_FromObject() could not convert
6458 one of the arguments (usually the right hand side) to Unicode,
6459 i.e. we can't handle the comparison request. However, it is
6460 possible that the other object knows a comparison method, which
6461 is why we return Py_NotImplemented to give the other object a
6465 if (PyErr_ExceptionMatches(PyExc_TypeError
)) {
6467 Py_INCREF(Py_NotImplemented
);
6468 return Py_NotImplemented
;
6470 if (op
!= Py_EQ
&& op
!= Py_NE
)
6473 /* Equality comparison.
6475 This is a special case: we silence any PyExc_UnicodeDecodeError
6476 and instead turn it into a PyExc_UnicodeWarning.
6479 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError
))
6482 if (PyErr_Warn(PyExc_UnicodeWarning
,
6484 "Unicode equal comparison "
6485 "failed to convert both arguments to Unicode - "
6486 "interpreting them as being unequal" :
6487 "Unicode unequal comparison "
6488 "failed to convert both arguments to Unicode - "
6489 "interpreting them as being unequal"
6492 result
= (op
== Py_NE
);
6493 return PyBool_FromLong(result
);
6496 int PyUnicode_Contains(PyObject
*container
,
6499 PyObject
*str
, *sub
;
6502 /* Coerce the two arguments */
6503 sub
= PyUnicode_FromObject(element
);
6505 PyErr_SetString(PyExc_TypeError
,
6506 "'in <string>' requires string as left operand");
6510 str
= PyUnicode_FromObject(container
);
6516 result
= stringlib_contains_obj(str
, sub
);
6524 /* Concat to string or Unicode object giving a new Unicode object. */
6526 PyObject
*PyUnicode_Concat(PyObject
*left
,
6529 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
6531 /* Coerce the two arguments */
6532 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6535 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6540 if (v
== unicode_empty
) {
6542 return (PyObject
*)u
;
6544 if (u
== unicode_empty
) {
6546 return (PyObject
*)v
;
6549 /* Concat the two Unicode strings */
6550 w
= _PyUnicode_New(u
->length
+ v
->length
);
6553 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
6554 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
6558 return (PyObject
*)w
;
6566 PyDoc_STRVAR(count__doc__
,
6567 "S.count(sub[, start[, end]]) -> int\n\
6569 Return the number of non-overlapping occurrences of substring sub in\n\
6570 Unicode string S[start:end]. Optional arguments start and end are\n\
6571 interpreted as in slice notation.");
6574 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
6576 PyUnicodeObject
*substring
;
6577 Py_ssize_t start
= 0;
6578 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6581 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
6582 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6585 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
6586 (PyObject
*)substring
);
6587 if (substring
== NULL
)
6590 FIX_START_END(self
);
6592 result
= PyInt_FromSsize_t(
6593 stringlib_count(self
->str
+ start
, end
- start
,
6594 substring
->str
, substring
->length
)
6597 Py_DECREF(substring
);
6602 PyDoc_STRVAR(encode__doc__
,
6603 "S.encode([encoding[,errors]]) -> string or unicode\n\
6605 Encodes S using the codec registered for encoding. encoding defaults\n\
6606 to the default encoding. errors may be given to set a different error\n\
6607 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6608 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6609 'xmlcharrefreplace' as well as any other name registered with\n\
6610 codecs.register_error that can handle UnicodeEncodeErrors.");
6613 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
, PyObject
*kwargs
)
6615 static char *kwlist
[] = {"encoding", "errors", 0};
6616 char *encoding
= NULL
;
6617 char *errors
= NULL
;
6620 if (!PyArg_ParseTupleAndKeywords(args
, kwargs
, "|ss:encode",
6621 kwlist
, &encoding
, &errors
))
6623 v
= PyUnicode_AsEncodedObject((PyObject
*)self
, encoding
, errors
);
6626 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6627 PyErr_Format(PyExc_TypeError
,
6628 "encoder did not return a string/unicode object "
6630 Py_TYPE(v
)->tp_name
);
6640 PyDoc_STRVAR(decode__doc__
,
6641 "S.decode([encoding[,errors]]) -> string or unicode\n\
6643 Decodes S using the codec registered for encoding. encoding defaults\n\
6644 to the default encoding. errors may be given to set a different error\n\
6645 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6646 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6647 as well as any other name registered with codecs.register_error that is\n\
6648 able to handle UnicodeDecodeErrors.");
6651 unicode_decode(PyUnicodeObject
*self
, PyObject
*args
, PyObject
*kwargs
)
6653 static char *kwlist
[] = {"encoding", "errors", 0};
6654 char *encoding
= NULL
;
6655 char *errors
= NULL
;
6658 if (!PyArg_ParseTupleAndKeywords(args
, kwargs
, "|ss:decode",
6659 kwlist
, &encoding
, &errors
))
6661 v
= PyUnicode_AsDecodedObject((PyObject
*)self
, encoding
, errors
);
6664 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6665 PyErr_Format(PyExc_TypeError
,
6666 "decoder did not return a string/unicode object "
6668 Py_TYPE(v
)->tp_name
);
6678 PyDoc_STRVAR(expandtabs__doc__
,
6679 "S.expandtabs([tabsize]) -> unicode\n\
6681 Return a copy of S where all tab characters are expanded using spaces.\n\
6682 If tabsize is not given, a tab size of 8 characters is assumed.");
6685 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
6691 Py_ssize_t i
, j
, incr
;
6695 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
6698 /* First pass: determine size of output string */
6699 i
= 0; /* chars up to and including most recent \n or \r */
6700 j
= 0; /* chars since most recent \n or \r (use in tab calculations) */
6701 e
= self
->str
+ self
->length
; /* end of input */
6702 for (p
= self
->str
; p
< e
; p
++)
6705 incr
= tabsize
- (j
% tabsize
); /* cannot overflow */
6706 if (j
> PY_SSIZE_T_MAX
- incr
)
6712 if (j
> PY_SSIZE_T_MAX
- 1)
6715 if (*p
== '\n' || *p
== '\r') {
6716 if (i
> PY_SSIZE_T_MAX
- j
)
6723 if (i
> PY_SSIZE_T_MAX
- j
)
6726 /* Second pass: create output string and fill it */
6727 u
= _PyUnicode_New(i
+ j
);
6731 j
= 0; /* same as in first pass */
6732 q
= u
->str
; /* next output char */
6733 qe
= u
->str
+ u
->length
; /* end of output */
6735 for (p
= self
->str
; p
< e
; p
++)
6738 i
= tabsize
- (j
% tabsize
);
6752 if (*p
== '\n' || *p
== '\r')
6756 return (PyObject
*) u
;
6761 PyErr_SetString(PyExc_OverflowError
, "new string is too long");
6765 PyDoc_STRVAR(find__doc__
,
6766 "S.find(sub [,start [,end]]) -> int\n\
6768 Return the lowest index in S where substring sub is found,\n\
6769 such that sub is contained within s[start:end]. Optional\n\
6770 arguments start and end are interpreted as in slice notation.\n\
6772 Return -1 on failure.");
6775 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
6777 PyObject
*substring
;
6782 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
6785 result
= stringlib_find_slice(
6786 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6787 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6791 Py_DECREF(substring
);
6793 return PyInt_FromSsize_t(result
);
6797 unicode_getitem(PyUnicodeObject
*self
, Py_ssize_t index
)
6799 if (index
< 0 || index
>= self
->length
) {
6800 PyErr_SetString(PyExc_IndexError
, "string index out of range");
6804 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
6808 unicode_hash(PyUnicodeObject
*self
)
6810 /* Since Unicode objects compare equal to their ASCII string
6811 counterparts, they should use the individual character values
6812 as basis for their hash value. This is needed to assure that
6813 strings and Unicode objects behave in the same way as
6816 register Py_ssize_t len
;
6817 register Py_UNICODE
*p
;
6820 if (self
->hash
!= -1)
6822 len
= PyUnicode_GET_SIZE(self
);
6823 p
= PyUnicode_AS_UNICODE(self
);
6826 x
= (1000003*x
) ^ *p
++;
6827 x
^= PyUnicode_GET_SIZE(self
);
6834 PyDoc_STRVAR(index__doc__
,
6835 "S.index(sub [,start [,end]]) -> int\n\
6837 Like S.find() but raise ValueError when the substring is not found.");
6840 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
6843 PyObject
*substring
;
6847 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
6850 result
= stringlib_find_slice(
6851 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6852 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6856 Py_DECREF(substring
);
6859 PyErr_SetString(PyExc_ValueError
, "substring not found");
6863 return PyInt_FromSsize_t(result
);
6866 PyDoc_STRVAR(islower__doc__
,
6867 "S.islower() -> bool\n\
6869 Return True if all cased characters in S are lowercase and there is\n\
6870 at least one cased character in S, False otherwise.");
6873 unicode_islower(PyUnicodeObject
*self
)
6875 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6876 register const Py_UNICODE
*e
;
6879 /* Shortcut for single character strings */
6880 if (PyUnicode_GET_SIZE(self
) == 1)
6881 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p
));
6883 /* Special case for empty strings */
6884 if (PyUnicode_GET_SIZE(self
) == 0)
6885 return PyBool_FromLong(0);
6887 e
= p
+ PyUnicode_GET_SIZE(self
);
6889 for (; p
< e
; p
++) {
6890 register const Py_UNICODE ch
= *p
;
6892 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
6893 return PyBool_FromLong(0);
6894 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
6897 return PyBool_FromLong(cased
);
6900 PyDoc_STRVAR(isupper__doc__
,
6901 "S.isupper() -> bool\n\
6903 Return True if all cased characters in S are uppercase and there is\n\
6904 at least one cased character in S, False otherwise.");
6907 unicode_isupper(PyUnicodeObject
*self
)
6909 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6910 register const Py_UNICODE
*e
;
6913 /* Shortcut for single character strings */
6914 if (PyUnicode_GET_SIZE(self
) == 1)
6915 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
6917 /* Special case for empty strings */
6918 if (PyUnicode_GET_SIZE(self
) == 0)
6919 return PyBool_FromLong(0);
6921 e
= p
+ PyUnicode_GET_SIZE(self
);
6923 for (; p
< e
; p
++) {
6924 register const Py_UNICODE ch
= *p
;
6926 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
6927 return PyBool_FromLong(0);
6928 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
6931 return PyBool_FromLong(cased
);
6934 PyDoc_STRVAR(istitle__doc__
,
6935 "S.istitle() -> bool\n\
6937 Return True if S is a titlecased string and there is at least one\n\
6938 character in S, i.e. upper- and titlecase characters may only\n\
6939 follow uncased characters and lowercase characters only cased ones.\n\
6940 Return False otherwise.");
6943 unicode_istitle(PyUnicodeObject
*self
)
6945 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6946 register const Py_UNICODE
*e
;
6947 int cased
, previous_is_cased
;
6949 /* Shortcut for single character strings */
6950 if (PyUnicode_GET_SIZE(self
) == 1)
6951 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
6952 (Py_UNICODE_ISUPPER(*p
) != 0));
6954 /* Special case for empty strings */
6955 if (PyUnicode_GET_SIZE(self
) == 0)
6956 return PyBool_FromLong(0);
6958 e
= p
+ PyUnicode_GET_SIZE(self
);
6960 previous_is_cased
= 0;
6961 for (; p
< e
; p
++) {
6962 register const Py_UNICODE ch
= *p
;
6964 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
6965 if (previous_is_cased
)
6966 return PyBool_FromLong(0);
6967 previous_is_cased
= 1;
6970 else if (Py_UNICODE_ISLOWER(ch
)) {
6971 if (!previous_is_cased
)
6972 return PyBool_FromLong(0);
6973 previous_is_cased
= 1;
6977 previous_is_cased
= 0;
6979 return PyBool_FromLong(cased
);
6982 PyDoc_STRVAR(isspace__doc__
,
6983 "S.isspace() -> bool\n\
6985 Return True if all characters in S are whitespace\n\
6986 and there is at least one character in S, False otherwise.");
6989 unicode_isspace(PyUnicodeObject
*self
)
6991 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6992 register const Py_UNICODE
*e
;
6994 /* Shortcut for single character strings */
6995 if (PyUnicode_GET_SIZE(self
) == 1 &&
6996 Py_UNICODE_ISSPACE(*p
))
6997 return PyBool_FromLong(1);
6999 /* Special case for empty strings */
7000 if (PyUnicode_GET_SIZE(self
) == 0)
7001 return PyBool_FromLong(0);
7003 e
= p
+ PyUnicode_GET_SIZE(self
);
7004 for (; p
< e
; p
++) {
7005 if (!Py_UNICODE_ISSPACE(*p
))
7006 return PyBool_FromLong(0);
7008 return PyBool_FromLong(1);
7011 PyDoc_STRVAR(isalpha__doc__
,
7012 "S.isalpha() -> bool\n\
7014 Return True if all characters in S are alphabetic\n\
7015 and there is at least one character in S, False otherwise.");
7018 unicode_isalpha(PyUnicodeObject
*self
)
7020 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
7021 register const Py_UNICODE
*e
;
7023 /* Shortcut for single character strings */
7024 if (PyUnicode_GET_SIZE(self
) == 1 &&
7025 Py_UNICODE_ISALPHA(*p
))
7026 return PyBool_FromLong(1);
7028 /* Special case for empty strings */
7029 if (PyUnicode_GET_SIZE(self
) == 0)
7030 return PyBool_FromLong(0);
7032 e
= p
+ PyUnicode_GET_SIZE(self
);
7033 for (; p
< e
; p
++) {
7034 if (!Py_UNICODE_ISALPHA(*p
))
7035 return PyBool_FromLong(0);
7037 return PyBool_FromLong(1);
7040 PyDoc_STRVAR(isalnum__doc__
,
7041 "S.isalnum() -> bool\n\
7043 Return True if all characters in S are alphanumeric\n\
7044 and there is at least one character in S, False otherwise.");
7047 unicode_isalnum(PyUnicodeObject
*self
)
7049 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
7050 register const Py_UNICODE
*e
;
7052 /* Shortcut for single character strings */
7053 if (PyUnicode_GET_SIZE(self
) == 1 &&
7054 Py_UNICODE_ISALNUM(*p
))
7055 return PyBool_FromLong(1);
7057 /* Special case for empty strings */
7058 if (PyUnicode_GET_SIZE(self
) == 0)
7059 return PyBool_FromLong(0);
7061 e
= p
+ PyUnicode_GET_SIZE(self
);
7062 for (; p
< e
; p
++) {
7063 if (!Py_UNICODE_ISALNUM(*p
))
7064 return PyBool_FromLong(0);
7066 return PyBool_FromLong(1);
7069 PyDoc_STRVAR(isdecimal__doc__
,
7070 "S.isdecimal() -> bool\n\
7072 Return True if there are only decimal characters in S,\n\
7076 unicode_isdecimal(PyUnicodeObject
*self
)
7078 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
7079 register const Py_UNICODE
*e
;
7081 /* Shortcut for single character strings */
7082 if (PyUnicode_GET_SIZE(self
) == 1 &&
7083 Py_UNICODE_ISDECIMAL(*p
))
7084 return PyBool_FromLong(1);
7086 /* Special case for empty strings */
7087 if (PyUnicode_GET_SIZE(self
) == 0)
7088 return PyBool_FromLong(0);
7090 e
= p
+ PyUnicode_GET_SIZE(self
);
7091 for (; p
< e
; p
++) {
7092 if (!Py_UNICODE_ISDECIMAL(*p
))
7093 return PyBool_FromLong(0);
7095 return PyBool_FromLong(1);
7098 PyDoc_STRVAR(isdigit__doc__
,
7099 "S.isdigit() -> bool\n\
7101 Return True if all characters in S are digits\n\
7102 and there is at least one character in S, False otherwise.");
7105 unicode_isdigit(PyUnicodeObject
*self
)
7107 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
7108 register const Py_UNICODE
*e
;
7110 /* Shortcut for single character strings */
7111 if (PyUnicode_GET_SIZE(self
) == 1 &&
7112 Py_UNICODE_ISDIGIT(*p
))
7113 return PyBool_FromLong(1);
7115 /* Special case for empty strings */
7116 if (PyUnicode_GET_SIZE(self
) == 0)
7117 return PyBool_FromLong(0);
7119 e
= p
+ PyUnicode_GET_SIZE(self
);
7120 for (; p
< e
; p
++) {
7121 if (!Py_UNICODE_ISDIGIT(*p
))
7122 return PyBool_FromLong(0);
7124 return PyBool_FromLong(1);
7127 PyDoc_STRVAR(isnumeric__doc__
,
7128 "S.isnumeric() -> bool\n\
7130 Return True if there are only numeric characters in S,\n\
7134 unicode_isnumeric(PyUnicodeObject
*self
)
7136 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
7137 register const Py_UNICODE
*e
;
7139 /* Shortcut for single character strings */
7140 if (PyUnicode_GET_SIZE(self
) == 1 &&
7141 Py_UNICODE_ISNUMERIC(*p
))
7142 return PyBool_FromLong(1);
7144 /* Special case for empty strings */
7145 if (PyUnicode_GET_SIZE(self
) == 0)
7146 return PyBool_FromLong(0);
7148 e
= p
+ PyUnicode_GET_SIZE(self
);
7149 for (; p
< e
; p
++) {
7150 if (!Py_UNICODE_ISNUMERIC(*p
))
7151 return PyBool_FromLong(0);
7153 return PyBool_FromLong(1);
7156 PyDoc_STRVAR(join__doc__
,
7157 "S.join(iterable) -> unicode\n\
7159 Return a string which is the concatenation of the strings in the\n\
7160 iterable. The separator between elements is S.");
7163 unicode_join(PyObject
*self
, PyObject
*data
)
7165 return PyUnicode_Join(self
, data
);
7169 unicode_length(PyUnicodeObject
*self
)
7171 return self
->length
;
7174 PyDoc_STRVAR(ljust__doc__
,
7175 "S.ljust(width[, fillchar]) -> unicode\n\
7177 Return S left-justified in a Unicode string of length width. Padding is\n\
7178 done using the specified fill character (default is a space).");
7181 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
7184 Py_UNICODE fillchar
= ' ';
7186 if (!PyArg_ParseTuple(args
, "n|O&:ljust", &width
, convert_uc
, &fillchar
))
7189 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
7191 return (PyObject
*) self
;
7194 return (PyObject
*) pad(self
, 0, width
- self
->length
, fillchar
);
7197 PyDoc_STRVAR(lower__doc__
,
7198 "S.lower() -> unicode\n\
7200 Return a copy of the string S converted to lowercase.");
7203 unicode_lower(PyUnicodeObject
*self
)
7205 return fixup(self
, fixlower
);
7209 #define RIGHTSTRIP 1
7212 /* Arrays indexed by above */
7213 static const char *stripformat
[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
7215 #define STRIPNAME(i) (stripformat[i]+3)
7217 /* externally visible for str.strip(unicode) */
7219 _PyUnicode_XStrip(PyUnicodeObject
*self
, int striptype
, PyObject
*sepobj
)
7221 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
7222 Py_ssize_t len
= PyUnicode_GET_SIZE(self
);
7223 Py_UNICODE
*sep
= PyUnicode_AS_UNICODE(sepobj
);
7224 Py_ssize_t seplen
= PyUnicode_GET_SIZE(sepobj
);
7227 BLOOM_MASK sepmask
= make_bloom_mask(sep
, seplen
);
7230 if (striptype
!= RIGHTSTRIP
) {
7231 while (i
< len
&& BLOOM_MEMBER(sepmask
, s
[i
], sep
, seplen
)) {
7237 if (striptype
!= LEFTSTRIP
) {
7240 } while (j
>= i
&& BLOOM_MEMBER(sepmask
, s
[j
], sep
, seplen
));
7244 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
7246 return (PyObject
*)self
;
7249 return PyUnicode_FromUnicode(s
+i
, j
-i
);
7254 do_strip(PyUnicodeObject
*self
, int striptype
)
7256 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
7257 Py_ssize_t len
= PyUnicode_GET_SIZE(self
), i
, j
;
7260 if (striptype
!= RIGHTSTRIP
) {
7261 while (i
< len
&& Py_UNICODE_ISSPACE(s
[i
])) {
7267 if (striptype
!= LEFTSTRIP
) {
7270 } while (j
>= i
&& Py_UNICODE_ISSPACE(s
[j
]));
7274 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
7276 return (PyObject
*)self
;
7279 return PyUnicode_FromUnicode(s
+i
, j
-i
);
7284 do_argstrip(PyUnicodeObject
*self
, int striptype
, PyObject
*args
)
7286 PyObject
*sep
= NULL
;
7288 if (!PyArg_ParseTuple(args
, (char *)stripformat
[striptype
], &sep
))
7291 if (sep
!= NULL
&& sep
!= Py_None
) {
7292 if (PyUnicode_Check(sep
))
7293 return _PyUnicode_XStrip(self
, striptype
, sep
);
7294 else if (PyString_Check(sep
)) {
7296 sep
= PyUnicode_FromObject(sep
);
7299 res
= _PyUnicode_XStrip(self
, striptype
, sep
);
7304 PyErr_Format(PyExc_TypeError
,
7305 "%s arg must be None, unicode or str",
7306 STRIPNAME(striptype
));
7311 return do_strip(self
, striptype
);
7315 PyDoc_STRVAR(strip__doc__
,
7316 "S.strip([chars]) -> unicode\n\
7318 Return a copy of the string S with leading and trailing\n\
7319 whitespace removed.\n\
7320 If chars is given and not None, remove characters in chars instead.\n\
7321 If chars is a str, it will be converted to unicode before stripping");
7324 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
7326 if (PyTuple_GET_SIZE(args
) == 0)
7327 return do_strip(self
, BOTHSTRIP
); /* Common case */
7329 return do_argstrip(self
, BOTHSTRIP
, args
);
7333 PyDoc_STRVAR(lstrip__doc__
,
7334 "S.lstrip([chars]) -> unicode\n\
7336 Return a copy of the string S with leading whitespace removed.\n\
7337 If chars is given and not None, remove characters in chars instead.\n\
7338 If chars is a str, it will be converted to unicode before stripping");
7341 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
7343 if (PyTuple_GET_SIZE(args
) == 0)
7344 return do_strip(self
, LEFTSTRIP
); /* Common case */
7346 return do_argstrip(self
, LEFTSTRIP
, args
);
7350 PyDoc_STRVAR(rstrip__doc__
,
7351 "S.rstrip([chars]) -> unicode\n\
7353 Return a copy of the string S with trailing whitespace removed.\n\
7354 If chars is given and not None, remove characters in chars instead.\n\
7355 If chars is a str, it will be converted to unicode before stripping");
7358 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
7360 if (PyTuple_GET_SIZE(args
) == 0)
7361 return do_strip(self
, RIGHTSTRIP
); /* Common case */
7363 return do_argstrip(self
, RIGHTSTRIP
, args
);
7368 unicode_repeat(PyUnicodeObject
*str
, Py_ssize_t len
)
7378 if (len
== 1 && PyUnicode_CheckExact(str
)) {
7379 /* no repeat, return original string */
7381 return (PyObject
*) str
;
7384 /* ensure # of chars needed doesn't overflow int and # of bytes
7385 * needed doesn't overflow size_t
7387 nchars
= len
* str
->length
;
7388 if (len
&& nchars
/ len
!= str
->length
) {
7389 PyErr_SetString(PyExc_OverflowError
,
7390 "repeated string is too long");
7393 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
7394 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
7395 PyErr_SetString(PyExc_OverflowError
,
7396 "repeated string is too long");
7399 u
= _PyUnicode_New(nchars
);
7405 if (str
->length
== 1 && len
> 0) {
7406 Py_UNICODE_FILL(p
, str
->str
[0], len
);
7408 Py_ssize_t done
= 0; /* number of characters copied this far */
7409 if (done
< nchars
) {
7410 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
7413 while (done
< nchars
) {
7414 Py_ssize_t n
= (done
<= nchars
-done
) ? done
: nchars
-done
;
7415 Py_UNICODE_COPY(p
+done
, p
, n
);
7420 return (PyObject
*) u
;
7423 PyObject
*PyUnicode_Replace(PyObject
*obj
,
7426 Py_ssize_t maxcount
)
7433 self
= PyUnicode_FromObject(obj
);
7436 str1
= PyUnicode_FromObject(subobj
);
7441 str2
= PyUnicode_FromObject(replobj
);
7447 result
= replace((PyUnicodeObject
*)self
,
7448 (PyUnicodeObject
*)str1
,
7449 (PyUnicodeObject
*)str2
,
7457 PyDoc_STRVAR(replace__doc__
,
7458 "S.replace(old, new[, count]) -> unicode\n\
7460 Return a copy of S with all occurrences of substring\n\
7461 old replaced by new. If the optional argument count is\n\
7462 given, only the first count occurrences are replaced.");
7465 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
7467 PyUnicodeObject
*str1
;
7468 PyUnicodeObject
*str2
;
7469 Py_ssize_t maxcount
= -1;
7472 if (!PyArg_ParseTuple(args
, "OO|n:replace", &str1
, &str2
, &maxcount
))
7474 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
7477 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
7483 result
= replace(self
, str1
, str2
, maxcount
);
7491 PyObject
*unicode_repr(PyObject
*unicode
)
7493 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
7494 PyUnicode_GET_SIZE(unicode
),
7498 PyDoc_STRVAR(rfind__doc__
,
7499 "S.rfind(sub [,start [,end]]) -> int\n\
7501 Return the highest index in S where substring sub is found,\n\
7502 such that sub is contained within s[start:end]. Optional\n\
7503 arguments start and end are interpreted as in slice notation.\n\
7505 Return -1 on failure.");
7508 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
7510 PyObject
*substring
;
7515 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
7518 result
= stringlib_rfind_slice(
7519 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7520 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7524 Py_DECREF(substring
);
7526 return PyInt_FromSsize_t(result
);
7529 PyDoc_STRVAR(rindex__doc__
,
7530 "S.rindex(sub [,start [,end]]) -> int\n\
7532 Like S.rfind() but raise ValueError when the substring is not found.");
7535 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
7537 PyObject
*substring
;
7542 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
7545 result
= stringlib_rfind_slice(
7546 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7547 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7551 Py_DECREF(substring
);
7554 PyErr_SetString(PyExc_ValueError
, "substring not found");
7557 return PyInt_FromSsize_t(result
);
7560 PyDoc_STRVAR(rjust__doc__
,
7561 "S.rjust(width[, fillchar]) -> unicode\n\
7563 Return S right-justified in a Unicode string of length width. Padding is\n\
7564 done using the specified fill character (default is a space).");
7567 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
7570 Py_UNICODE fillchar
= ' ';
7572 if (!PyArg_ParseTuple(args
, "n|O&:rjust", &width
, convert_uc
, &fillchar
))
7575 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
7577 return (PyObject
*) self
;
7580 return (PyObject
*) pad(self
, width
- self
->length
, 0, fillchar
);
7584 unicode_slice(PyUnicodeObject
*self
, Py_ssize_t start
, Py_ssize_t end
)
7586 /* standard clamping */
7591 if (end
> self
->length
)
7593 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
7594 /* full slice, return original string */
7596 return (PyObject
*) self
;
7601 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
7605 PyObject
*PyUnicode_Split(PyObject
*s
,
7607 Py_ssize_t maxsplit
)
7611 s
= PyUnicode_FromObject(s
);
7615 sep
= PyUnicode_FromObject(sep
);
7622 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
7629 PyDoc_STRVAR(split__doc__
,
7630 "S.split([sep [,maxsplit]]) -> list of strings\n\
7632 Return a list of the words in S, using sep as the\n\
7633 delimiter string. If maxsplit is given, at most maxsplit\n\
7634 splits are done. If sep is not specified or is None, any\n\
7635 whitespace string is a separator and empty strings are\n\
7636 removed from the result.");
7639 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
7641 PyObject
*substring
= Py_None
;
7642 Py_ssize_t maxcount
= -1;
7644 if (!PyArg_ParseTuple(args
, "|On:split", &substring
, &maxcount
))
7647 if (substring
== Py_None
)
7648 return split(self
, NULL
, maxcount
);
7649 else if (PyUnicode_Check(substring
))
7650 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
7652 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
7656 PyUnicode_Partition(PyObject
*str_in
, PyObject
*sep_in
)
7662 str_obj
= PyUnicode_FromObject(str_in
);
7665 sep_obj
= PyUnicode_FromObject(sep_in
);
7671 out
= stringlib_partition(
7672 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7673 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
7684 PyUnicode_RPartition(PyObject
*str_in
, PyObject
*sep_in
)
7690 str_obj
= PyUnicode_FromObject(str_in
);
7693 sep_obj
= PyUnicode_FromObject(sep_in
);
7699 out
= stringlib_rpartition(
7700 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7701 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
7710 PyDoc_STRVAR(partition__doc__
,
7711 "S.partition(sep) -> (head, sep, tail)\n\
7713 Search for the separator sep in S, and return the part before it,\n\
7714 the separator itself, and the part after it. If the separator is not\n\
7715 found, return S and two empty strings.");
7718 unicode_partition(PyUnicodeObject
*self
, PyObject
*separator
)
7720 return PyUnicode_Partition((PyObject
*)self
, separator
);
7723 PyDoc_STRVAR(rpartition__doc__
,
7724 "S.rpartition(sep) -> (head, sep, tail)\n\
7726 Search for the separator sep in S, starting at the end of S, and return\n\
7727 the part before it, the separator itself, and the part after it. If the\n\
7728 separator is not found, return two empty strings and S.");
7731 unicode_rpartition(PyUnicodeObject
*self
, PyObject
*separator
)
7733 return PyUnicode_RPartition((PyObject
*)self
, separator
);
7736 PyObject
*PyUnicode_RSplit(PyObject
*s
,
7738 Py_ssize_t maxsplit
)
7742 s
= PyUnicode_FromObject(s
);
7746 sep
= PyUnicode_FromObject(sep
);
7753 result
= rsplit((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
7760 PyDoc_STRVAR(rsplit__doc__
,
7761 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7763 Return a list of the words in S, using sep as the\n\
7764 delimiter string, starting at the end of the string and\n\
7765 working to the front. If maxsplit is given, at most maxsplit\n\
7766 splits are done. If sep is not specified or is None, any whitespace string\n\
7770 unicode_rsplit(PyUnicodeObject
*self
, PyObject
*args
)
7772 PyObject
*substring
= Py_None
;
7773 Py_ssize_t maxcount
= -1;
7775 if (!PyArg_ParseTuple(args
, "|On:rsplit", &substring
, &maxcount
))
7778 if (substring
== Py_None
)
7779 return rsplit(self
, NULL
, maxcount
);
7780 else if (PyUnicode_Check(substring
))
7781 return rsplit(self
, (PyUnicodeObject
*)substring
, maxcount
);
7783 return PyUnicode_RSplit((PyObject
*)self
, substring
, maxcount
);
7786 PyDoc_STRVAR(splitlines__doc__
,
7787 "S.splitlines([keepends]) -> list of strings\n\
7789 Return a list of the lines in S, breaking at line boundaries.\n\
7790 Line breaks are not included in the resulting list unless keepends\n\
7791 is given and true.");
7794 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
7798 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
7801 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
7805 PyObject
*unicode_str(PyUnicodeObject
*self
)
7807 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
7810 PyDoc_STRVAR(swapcase__doc__
,
7811 "S.swapcase() -> unicode\n\
7813 Return a copy of S with uppercase characters converted to lowercase\n\
7817 unicode_swapcase(PyUnicodeObject
*self
)
7819 return fixup(self
, fixswapcase
);
7822 PyDoc_STRVAR(translate__doc__
,
7823 "S.translate(table) -> unicode\n\
7825 Return a copy of the string S, where all characters have been mapped\n\
7826 through the given translation table, which must be a mapping of\n\
7827 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7828 Unmapped characters are left untouched. Characters mapped to None\n\
7832 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
7834 return PyUnicode_TranslateCharmap(self
->str
,
7840 PyDoc_STRVAR(upper__doc__
,
7841 "S.upper() -> unicode\n\
7843 Return a copy of S converted to uppercase.");
7846 unicode_upper(PyUnicodeObject
*self
)
7848 return fixup(self
, fixupper
);
7851 PyDoc_STRVAR(zfill__doc__
,
7852 "S.zfill(width) -> unicode\n\
7854 Pad a numeric string S with zeros on the left, to fill a field\n\
7855 of the specified width. The string S is never truncated.");
7858 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
7864 if (!PyArg_ParseTuple(args
, "n:zfill", &width
))
7867 if (self
->length
>= width
) {
7868 if (PyUnicode_CheckExact(self
)) {
7870 return (PyObject
*) self
;
7873 return PyUnicode_FromUnicode(
7874 PyUnicode_AS_UNICODE(self
),
7875 PyUnicode_GET_SIZE(self
)
7879 fill
= width
- self
->length
;
7881 u
= pad(self
, fill
, 0, '0');
7886 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
7887 /* move sign to beginning of string */
7888 u
->str
[0] = u
->str
[fill
];
7892 return (PyObject
*) u
;
7897 free_listsize(PyUnicodeObject
*self
)
7899 return PyInt_FromLong(numfree
);
7903 PyDoc_STRVAR(startswith__doc__
,
7904 "S.startswith(prefix[, start[, end]]) -> bool\n\
7906 Return True if S starts with the specified prefix, False otherwise.\n\
7907 With optional start, test S beginning at that position.\n\
7908 With optional end, stop comparing S at that position.\n\
7909 prefix can also be a tuple of strings to try.");
7912 unicode_startswith(PyUnicodeObject
*self
,
7916 PyUnicodeObject
*substring
;
7917 Py_ssize_t start
= 0;
7918 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7921 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &subobj
,
7922 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
7924 if (PyTuple_Check(subobj
)) {
7926 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7927 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7928 PyTuple_GET_ITEM(subobj
, i
));
7929 if (substring
== NULL
)
7931 result
= tailmatch(self
, substring
, start
, end
, -1);
7932 Py_DECREF(substring
);
7937 /* nothing matched */
7940 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7941 if (substring
== NULL
)
7943 result
= tailmatch(self
, substring
, start
, end
, -1);
7944 Py_DECREF(substring
);
7945 return PyBool_FromLong(result
);
7949 PyDoc_STRVAR(endswith__doc__
,
7950 "S.endswith(suffix[, start[, end]]) -> bool\n\
7952 Return True if S ends with the specified suffix, False otherwise.\n\
7953 With optional start, test S beginning at that position.\n\
7954 With optional end, stop comparing S at that position.\n\
7955 suffix can also be a tuple of strings to try.");
7958 unicode_endswith(PyUnicodeObject
*self
,
7962 PyUnicodeObject
*substring
;
7963 Py_ssize_t start
= 0;
7964 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7967 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &subobj
,
7968 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
7970 if (PyTuple_Check(subobj
)) {
7972 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7973 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7974 PyTuple_GET_ITEM(subobj
, i
));
7975 if (substring
== NULL
)
7977 result
= tailmatch(self
, substring
, start
, end
, +1);
7978 Py_DECREF(substring
);
7985 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7986 if (substring
== NULL
)
7989 result
= tailmatch(self
, substring
, start
, end
, +1);
7990 Py_DECREF(substring
);
7991 return PyBool_FromLong(result
);
7995 /* Implements do_string_format, which is unicode because of stringlib */
7996 #include "stringlib/string_format.h"
7998 PyDoc_STRVAR(format__doc__
,
7999 "S.format(*args, **kwargs) -> unicode\n\
8004 unicode__format__(PyObject
*self
, PyObject
*args
)
8006 PyObject
*format_spec
;
8007 PyObject
*result
= NULL
;
8008 PyObject
*tmp
= NULL
;
8010 /* If 2.x, convert format_spec to the same type as value */
8011 /* This is to allow things like u''.format('') */
8012 if (!PyArg_ParseTuple(args
, "O:__format__", &format_spec
))
8014 if (!(PyBytes_Check(format_spec
) || PyUnicode_Check(format_spec
))) {
8015 PyErr_Format(PyExc_TypeError
, "__format__ arg must be str "
8016 "or unicode, not %s", Py_TYPE(format_spec
)->tp_name
);
8019 tmp
= PyObject_Unicode(format_spec
);
8024 result
= _PyUnicode_FormatAdvanced(self
,
8025 PyUnicode_AS_UNICODE(format_spec
),
8026 PyUnicode_GET_SIZE(format_spec
));
8032 PyDoc_STRVAR(p_format__doc__
,
8033 "S.__format__(format_spec) -> unicode\n\
8038 unicode__sizeof__(PyUnicodeObject
*v
)
8040 return PyInt_FromSsize_t(sizeof(PyUnicodeObject
) +
8041 sizeof(Py_UNICODE
) * (v
->length
+ 1));
8044 PyDoc_STRVAR(sizeof__doc__
,
8045 "S.__sizeof__() -> size of S in memory, in bytes\n\
8050 unicode_getnewargs(PyUnicodeObject
*v
)
8052 return Py_BuildValue("(u#)", v
->str
, v
->length
);
/* Method table of the unicode type.  Each entry is
   {python name, C implementation, calling-convention flags, docstring};
   the flags used here are METH_NOARGS, METH_O, METH_VARARGS and
   METH_VARARGS | METH_KEYWORDS.  The "_formatter_*", "freelistsize" and
   "__getnewargs__" entries are listed without a docstring. */
8056 static PyMethodDef unicode_methods
[] = {
8058 /* Order is according to common usage: often used methods should
8059 appear first, since lookup is done sequentially. */
8061 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
| METH_KEYWORDS
, encode__doc__
},
8062 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
8063 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
8064 {"rsplit", (PyCFunction
) unicode_rsplit
, METH_VARARGS
, rsplit__doc__
},
8065 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
8066 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
8067 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
8068 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
8069 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
8070 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
8071 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
8072 {"partition", (PyCFunction
) unicode_partition
, METH_O
, partition__doc__
},
8073 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
8074 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
8075 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
8076 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_VARARGS
, lstrip__doc__
},
8077 {"decode", (PyCFunction
) unicode_decode
, METH_VARARGS
| METH_KEYWORDS
, decode__doc__
},
8078 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
8079 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
8080 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
8081 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
8082 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_VARARGS
, rstrip__doc__
},
8083 {"rpartition", (PyCFunction
) unicode_rpartition
, METH_O
, rpartition__doc__
},
8084 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
8085 {"strip", (PyCFunction
) unicode_strip
, METH_VARARGS
, strip__doc__
},
8086 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
8087 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
8088 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
8089 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
8090 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
8091 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
8092 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
8093 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
8094 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
8095 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
8096 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
8097 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
8098 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
8099 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
8100 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
8101 {"format", (PyCFunction
) do_string_format
, METH_VARARGS
| METH_KEYWORDS
, format__doc__
},
8102 {"__format__", (PyCFunction
) unicode__format__
, METH_VARARGS
, p_format__doc__
},
8103 {"_formatter_field_name_split", (PyCFunction
) formatter_field_name_split
, METH_NOARGS
},
8104 {"_formatter_parser", (PyCFunction
) formatter_parser
, METH_NOARGS
},
8105 {"__sizeof__", (PyCFunction
) unicode__sizeof__
, METH_NOARGS
, sizeof__doc__
},
8107 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
8111 /* This one is just used for debugging the implementation. */
8112 {"freelistsize", (PyCFunction
) free_listsize
, METH_NOARGS
},
8115 {"__getnewargs__", (PyCFunction
)unicode_getnewargs
, METH_NOARGS
},
8120 unicode_mod(PyObject
*v
, PyObject
*w
)
8122 if (!PyUnicode_Check(v
)) {
8123 Py_INCREF(Py_NotImplemented
);
8124 return Py_NotImplemented
;
8126 return PyUnicode_Format(v
, w
);
/* Number protocol table for unicode.  The only slot visible here is
   nb_remainder, which routes the `%` operator to unicode_mod(). */
8129 static PyNumberMethods unicode_as_number
= {
8134 unicode_mod
, /*nb_remainder*/
/* Sequence protocol table for unicode: length, concatenation,
   repetition, item and slice access, and containment.  The two
   assignment slots are 0, so no item or slice assignment is provided. */
8137 static PySequenceMethods unicode_as_sequence
= {
8138 (lenfunc
) unicode_length
, /* sq_length */
8139 PyUnicode_Concat
, /* sq_concat */
8140 (ssizeargfunc
) unicode_repeat
, /* sq_repeat */
8141 (ssizeargfunc
) unicode_getitem
, /* sq_item */
8142 (ssizessizeargfunc
) unicode_slice
, /* sq_slice */
8143 0, /* sq_ass_item */
8144 0, /* sq_ass_slice */
8145 PyUnicode_Contains
, /* sq_contains */
8149 unicode_subscript(PyUnicodeObject
* self
, PyObject
* item
)
8151 if (PyIndex_Check(item
)) {
8152 Py_ssize_t i
= PyNumber_AsSsize_t(item
, PyExc_IndexError
);
8153 if (i
== -1 && PyErr_Occurred())
8156 i
+= PyUnicode_GET_SIZE(self
);
8157 return unicode_getitem(self
, i
);
8158 } else if (PySlice_Check(item
)) {
8159 Py_ssize_t start
, stop
, step
, slicelength
, cur
, i
;
8160 Py_UNICODE
* source_buf
;
8161 Py_UNICODE
* result_buf
;
8164 if (PySlice_GetIndicesEx((PySliceObject
*)item
, PyUnicode_GET_SIZE(self
),
8165 &start
, &stop
, &step
, &slicelength
) < 0) {
8169 if (slicelength
<= 0) {
8170 return PyUnicode_FromUnicode(NULL
, 0);
8171 } else if (start
== 0 && step
== 1 && slicelength
== self
->length
&&
8172 PyUnicode_CheckExact(self
)) {
8174 return (PyObject
*)self
;
8175 } else if (step
== 1) {
8176 return PyUnicode_FromUnicode(self
->str
+ start
, slicelength
);
8178 source_buf
= PyUnicode_AS_UNICODE((PyObject
*)self
);
8179 result_buf
= (Py_UNICODE
*)PyObject_MALLOC(slicelength
*
8180 sizeof(Py_UNICODE
));
8182 if (result_buf
== NULL
)
8183 return PyErr_NoMemory();
8185 for (cur
= start
, i
= 0; i
< slicelength
; cur
+= step
, i
++) {
8186 result_buf
[i
] = source_buf
[cur
];
8189 result
= PyUnicode_FromUnicode(result_buf
, slicelength
);
8190 PyObject_FREE(result_buf
);
8194 PyErr_SetString(PyExc_TypeError
, "string indices must be integers");
/* Mapping protocol table for unicode: length and subscription
   (unicode_subscript handles both index and slice objects).  The
   assignment slot is 0, so S[key] = value is not provided. */
8199 static PyMappingMethods unicode_as_mapping
= {
8200 (lenfunc
)unicode_length
, /* mp_length */
8201 (binaryfunc
)unicode_subscript
, /* mp_subscript */
8202 (objobjargproc
)0, /* mp_ass_subscript */
8206 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
8211 PyErr_SetString(PyExc_SystemError
,
8212 "accessing non-existent unicode segment");
8215 *ptr
= (void *) self
->str
;
8216 return PyUnicode_GET_DATA_SIZE(self
);
8220 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, Py_ssize_t index
,
8223 PyErr_SetString(PyExc_TypeError
,
8224 "cannot use unicode as modifiable buffer");
8229 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
8233 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
8238 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
8245 PyErr_SetString(PyExc_SystemError
,
8246 "accessing non-existent unicode segment");
8249 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
8252 *ptr
= (void *) PyString_AS_STRING(str
);
8253 return PyString_GET_SIZE(str
);
8256 /* Helpers for PyUnicode_Format() */
8259 getnextarg(PyObject
*args
, Py_ssize_t arglen
, Py_ssize_t
*p_argidx
)
8261 Py_ssize_t argidx
= *p_argidx
;
8262 if (argidx
< arglen
) {
8267 return PyTuple_GetItem(args
, argidx
);
8269 PyErr_SetString(PyExc_TypeError
,
8270 "not enough arguments for format string");
/* Conversion-flag bits collected while parsing a %-format specifier;
   one bit per flag character. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
8281 strtounicode(Py_UNICODE
*buffer
, const char *charbuffer
)
8283 register Py_ssize_t i
;
8284 Py_ssize_t len
= strlen(charbuffer
);
8285 for (i
= len
- 1; i
>= 0; i
--)
8286 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
8292 longtounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, long x
)
8296 PyOS_snprintf((char *)buffer
, len
, format
, x
);
8297 result
= strtounicode(buffer
, (char *)buffer
);
8298 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
8301 /* XXX To save some code duplication, formatfloat/long/int could have been
8302 shared with stringobject.c, converting from 8-bit to Unicode after the
8303 formatting is done. */
8305 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8308 formatfloat(PyObject
*v
, int flags
, int prec
, int type
)
8314 x
= PyFloat_AsDouble(v
);
8315 if (x
== -1.0 && PyErr_Occurred())
8321 p
= PyOS_double_to_string(x
, type
, prec
,
8322 (flags
& F_ALT
) ? Py_DTSF_ALT
: 0, NULL
);
8325 result
= PyUnicode_FromStringAndSize(p
, strlen(p
));
8331 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
8335 PyObject
*str
; /* temporary string object. */
8336 PyUnicodeObject
*result
;
8338 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
8341 result
= _PyUnicode_New(len
);
8346 for (i
= 0; i
< len
; i
++)
8347 result
->str
[i
] = buf
[i
];
8348 result
->str
[len
] = 0;
8350 return (PyObject
*)result
;
8354 formatint(Py_UNICODE
*buf
,
8361 /* fmt = '%#.' + `prec` + 'l' + `type`
8362 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8366 char fmt
[64]; /* plenty big enough! */
8370 x
= PyInt_AsLong(v
);
8371 if (x
== -1 && PyErr_Occurred())
8373 if (x
< 0 && type
== 'u') {
8376 if (x
< 0 && (type
== 'x' || type
== 'X' || type
== 'o'))
8383 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8384 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8386 if (buflen
<= 14 || buflen
<= (size_t)3 + (size_t)prec
) {
8387 PyErr_SetString(PyExc_OverflowError
,
8388 "formatted integer is too long (precision too large?)");
8392 if ((flags
& F_ALT
) &&
8393 (type
== 'x' || type
== 'X')) {
8394 /* When converting under %#x or %#X, there are a number
8395 * of issues that cause pain:
8396 * - when 0 is being converted, the C standard leaves off
8397 * the '0x' or '0X', which is inconsistent with other
8398 * %#x/%#X conversions and inconsistent with Python's
8400 * - there are platforms that violate the standard and
8401 * convert 0 with the '0x' or '0X'
8402 * (Metrowerks, Compaq Tru64)
8403 * - there are platforms that give '0x' when converting
8404 * under %#X, but convert 0 in accordance with the
8405 * standard (OS/2 EMX)
8407 * We can achieve the desired consistency by inserting our
8408 * own '0x' or '0X' prefix, and substituting %x/%X in place
8411 * Note that this is the same approach as used in
8412 * formatint() in stringobject.c
8414 PyOS_snprintf(fmt
, sizeof(fmt
), "%s0%c%%.%dl%c",
8415 sign
, type
, prec
, type
);
8418 PyOS_snprintf(fmt
, sizeof(fmt
), "%s%%%s.%dl%c",
8419 sign
, (flags
&F_ALT
) ? "#" : "",
8423 return longtounicode(buf
, buflen
, fmt
, -x
);
8425 return longtounicode(buf
, buflen
, fmt
, x
);
8429 formatchar(Py_UNICODE
*buf
,
8433 /* presume that the buffer is at least 2 characters long */
8434 if (PyUnicode_Check(v
)) {
8435 if (PyUnicode_GET_SIZE(v
) != 1)
8437 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
8440 else if (PyString_Check(v
)) {
8441 if (PyString_GET_SIZE(v
) != 1)
8443 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
8447 /* Integer input truncated to a character */
8449 x
= PyInt_AsLong(v
);
8450 if (x
== -1 && PyErr_Occurred())
8452 #ifdef Py_UNICODE_WIDE
8453 if (x
< 0 || x
> 0x10ffff) {
8454 PyErr_SetString(PyExc_OverflowError
,
8455 "%c arg not in range(0x110000) "
8456 "(wide Python build)");
8460 if (x
< 0 || x
> 0xffff) {
8461 PyErr_SetString(PyExc_OverflowError
,
8462 "%c arg not in range(0x10000) "
8463 "(narrow Python build)");
8467 buf
[0] = (Py_UNICODE
) x
;
8473 PyErr_SetString(PyExc_TypeError
,
8474 "%c requires int or char");
8478 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8480 FORMATBUFLEN is the length of the buffer in which the ints &
8481 chars are formatted. XXX This is a magic number. Each formatting
8482 routine does bounds checking to ensure no overflow, but a better
8483 solution may be to malloc a buffer of appropriate size for each
8484 format. For now, the current solution is sufficient.
8486 #define FORMATBUFLEN (size_t)120
/* Implements `format % args` for unicode format strings.
   `format` is coerced with PyUnicode_FromObject(); the output is built
   in a unicode object that is resized as needed and trimmed to its
   final length before being returned.  Specifiers may take their
   values positionally through getnextarg() or, with the "%(key)"
   syntax, from a mapping.  A NULL `format` or `args` is reported via
   PyErr_BadInternalCall(). */
8488 PyObject
*PyUnicode_Format(PyObject
*format
,
8491 Py_UNICODE
*fmt
, *res
;
8492 Py_ssize_t fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
8494 PyUnicodeObject
*result
= NULL
;
8495 PyObject
*dict
= NULL
;
8498 if (format
== NULL
|| args
== NULL
) {
8499 PyErr_BadInternalCall();
8502 uformat
= PyUnicode_FromObject(format
);
8503 if (uformat
== NULL
)
8505 fmt
= PyUnicode_AS_UNICODE(uformat
);
8506 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
/* Initial output buffer: the format length plus 100 characters of
   headroom. */
8508 reslen
= rescnt
= fmtcnt
+ 100;
8509 result
= _PyUnicode_New(reslen
);
8512 res
= PyUnicode_AS_UNICODE(result
);
8514 if (PyTuple_Check(args
)) {
8515 arglen
= PyTuple_Size(args
);
/* Mapping detection: the argument's type provides tp_as_mapping and the
   argument is neither a tuple nor a basestring instance. */
8522 if (Py_TYPE(args
)->tp_as_mapping
&& !PyTuple_Check(args
) &&
8523 !PyObject_TypeCheck(args
, &PyBaseString_Type
))
/* Main loop over the remaining characters of the format string. */
8526 while (--fmtcnt
>= 0) {
8529 rescnt
= fmtcnt
+ 100;
8531 if (_PyUnicode_Resize(&result
, reslen
) < 0)
8533 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
8539 /* Got a format specifier */
8541 Py_ssize_t width
= -1;
8543 Py_UNICODE c
= '\0';
8547 PyObject
*temp
= NULL
;
8551 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{int,char}() */
8555 Py_UNICODE
*keystart
;
8561 PyErr_SetString(PyExc_TypeError
,
8562 "format requires a mapping");
8568 /* Skip over balanced parentheses */
8569 while (pcount
> 0 && --fmtcnt
>= 0) {
8572 else if (*fmt
== '(')
8576 keylen
= fmt
- keystart
- 1;
8577 if (fmtcnt
< 0 || pcount
> 0) {
8578 PyErr_SetString(PyExc_ValueError
,
8579 "incomplete format key");
8583 /* keys are converted to strings using UTF-8 and
8584 then looked up since Python uses strings to hold
8585 variables names etc. in its namespaces and we
8586 wouldn't want to break common idioms. */
8587 key
= PyUnicode_EncodeUTF8(keystart
,
8591 key
= PyUnicode_FromUnicode(keystart
, keylen
);
8599 args
= PyObject_GetItem(dict
, key
);
/* Collect the conversion flags that follow '%' (and the optional key). */
8608 while (--fmtcnt
>= 0) {
8609 switch (c
= *fmt
++) {
8610 case '-': flags
|= F_LJUST
; continue;
8611 case '+': flags
|= F_SIGN
; continue;
8612 case ' ': flags
|= F_BLANK
; continue;
8613 case '#': flags
|= F_ALT
; continue;
8614 case '0': flags
|= F_ZERO
; continue;
8619 v
= getnextarg(args
, arglen
, &argidx
);
8622 if (!PyInt_Check(v
)) {
8623 PyErr_SetString(PyExc_TypeError
,
8627 width
= PyInt_AsLong(v
);
8635 else if (c
>= '0' && c
<= '9') {
8637 while (--fmtcnt
>= 0) {
8639 if (c
< '0' || c
> '9')
/* NOTE(review): this overflow test multiplies first and checks
   afterwards, so it relies on the multiplication wrapping; the same
   pattern is used for the precision below. */
8641 if ((width
*10) / 10 != width
) {
8642 PyErr_SetString(PyExc_ValueError
,
8646 width
= width
*10 + (c
- '0');
8654 v
= getnextarg(args
, arglen
, &argidx
);
8657 if (!PyInt_Check(v
)) {
8658 PyErr_SetString(PyExc_TypeError
,
8662 prec
= PyInt_AsLong(v
);
8668 else if (c
>= '0' && c
<= '9') {
8670 while (--fmtcnt
>= 0) {
8671 c
= Py_CHARMASK(*fmt
++);
8672 if (c
< '0' || c
> '9')
8674 if ((prec
*10) / 10 != prec
) {
8675 PyErr_SetString(PyExc_ValueError
,
8679 prec
= prec
*10 + (c
- '0');
8684 if (c
== 'h' || c
== 'l' || c
== 'L') {
8690 PyErr_SetString(PyExc_ValueError
,
8691 "incomplete format");
8695 v
= getnextarg(args
, arglen
, &argidx
);
8705 /* presume that buffer length is at least 1 */
8712 if (PyUnicode_Check(v
) && c
== 's') {
8719 temp
= PyObject_Unicode(v
);
8721 temp
= PyObject_Repr(v
);
8724 if (PyUnicode_Check(temp
))
8725 /* nothing to do */;
8726 else if (PyString_Check(temp
)) {
8727 /* convert string to Unicode */
8728 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
8729 PyString_GET_SIZE(temp
),
8739 PyErr_SetString(PyExc_TypeError
,
8740 "%s argument has non-string str()");
8744 pbuf
= PyUnicode_AS_UNICODE(temp
);
8745 len
= PyUnicode_GET_SIZE(temp
);
8746 if (prec
>= 0 && len
> prec
)
8759 if (PyNumber_Check(v
)) {
8760 PyObject
*iobj
=NULL
;
8762 if (PyInt_Check(v
) || (PyLong_Check(v
))) {
8767 iobj
= PyNumber_Int(v
);
8768 if (iobj
==NULL
) iobj
= PyNumber_Long(v
);
/* Ints are formatted into the local formatbuf; longs go through
   formatlong(), which returns a temporary unicode object. */
8771 if (PyInt_Check(iobj
)) {
8774 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
8775 flags
, prec
, c
, iobj
);
8781 else if (PyLong_Check(iobj
)) {
8783 temp
= formatlong(iobj
, flags
, prec
, c
);
8787 pbuf
= PyUnicode_AS_UNICODE(temp
);
8788 len
= PyUnicode_GET_SIZE(temp
);
8797 PyErr_Format(PyExc_TypeError
,
8798 "%%%c format: a number is required, "
8799 "not %.200s", (char)c
, Py_TYPE(v
)->tp_name
);
8812 temp
= formatfloat(v
, flags
, prec
, c
);
8815 pbuf
= PyUnicode_AS_UNICODE(temp
);
8816 len
= PyUnicode_GET_SIZE(temp
);
8824 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
8830 PyErr_Format(PyExc_ValueError
,
8831 "unsupported format character '%c' (0x%x) "
/* NOTE(review): the lower bound 31 lets the control character 0x1F be
   echoed verbatim; 32 may have been intended - confirm. */
8833 (31<=c
&& c
<=126) ? (char)c
: '?',
8835 (Py_ssize_t
)(fmt
- 1 -
8836 PyUnicode_AS_UNICODE(uformat
)));
8840 if (*pbuf
== '-' || *pbuf
== '+') {
8844 else if (flags
& F_SIGN
)
8846 else if (flags
& F_BLANK
)
/* Grow the result when the remaining room cannot hold the field. */
8853 if (rescnt
- (sign
!= 0) < width
) {
8855 rescnt
= width
+ fmtcnt
+ 100;
8862 if (_PyUnicode_Resize(&result
, reslen
) < 0) {
8866 res
= PyUnicode_AS_UNICODE(result
)
8876 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8877 assert(pbuf
[0] == '0');
8878 assert(pbuf
[1] == c
);
8889 if (width
> len
&& !(flags
& F_LJUST
)) {
8893 } while (--width
> len
);
8898 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8899 assert(pbuf
[0] == '0');
8900 assert(pbuf
[1] == c
);
8905 Py_UNICODE_COPY(res
, pbuf
, len
);
8908 while (--width
>= len
) {
8912 if (dict
&& (argidx
< arglen
) && c
!= '%') {
8913 PyErr_SetString(PyExc_TypeError
,
8914 "not all arguments converted during string formatting");
8921 if (argidx
< arglen
&& !dict
) {
8922 PyErr_SetString(PyExc_TypeError
,
8923 "not all arguments converted during string formatting");
/* Trim the result to the number of characters actually written. */
8927 if (_PyUnicode_Resize(&result
, reslen
- rescnt
) < 0)
8933 return (PyObject
*)result
;
/* Buffer protocol table for unicode, in slot order: read buffer,
   write buffer, segment count, char buffer.  The write-buffer entry
   always fails (see unicode_buffer_getwritebuf). */
8944 static PyBufferProcs unicode_as_buffer
= {
8945 (readbufferproc
) unicode_buffer_getreadbuf
,
8946 (writebufferproc
) unicode_buffer_getwritebuf
,
8947 (segcountproc
) unicode_buffer_getsegcount
,
8948 (charbufferproc
) unicode_buffer_getcharbuf
,
8952 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
8955 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8958 static char *kwlist
[] = {"string", "encoding", "errors", 0};
8959 char *encoding
= NULL
;
8960 char *errors
= NULL
;
8962 if (type
!= &PyUnicode_Type
)
8963 return unicode_subtype_new(type
, args
, kwds
);
8964 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
8965 kwlist
, &x
, &encoding
, &errors
))
8968 return (PyObject
*)_PyUnicode_New(0);
8969 if (encoding
== NULL
&& errors
== NULL
)
8970 return PyObject_Unicode(x
);
8972 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
8976 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8978 PyUnicodeObject
*tmp
, *pnew
;
8981 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
8982 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
8985 assert(PyUnicode_Check(tmp
));
8986 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
8991 pnew
->str
= (Py_UNICODE
*) PyObject_MALLOC(sizeof(Py_UNICODE
) * (n
+1));
8992 if (pnew
->str
== NULL
) {
8993 _Py_ForgetReference((PyObject
*)pnew
);
8996 return PyErr_NoMemory();
8998 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
9000 pnew
->hash
= tmp
->hash
;
9002 return (PyObject
*)pnew
;
9005 PyDoc_STRVAR(unicode_doc
,
9006 "unicode(string [, encoding[, errors]]) -> object\n\
9008 Create a new Unicode object from the given encoded string.\n\
9009 encoding defaults to the current default string encoding.\n\
9010 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
/* Type object of the built-in "unicode" type.  It wires up the number,
   sequence, mapping and buffer protocol tables, the method table and
   the constructor (unicode_new) defined in this file; its base type is
   PyBaseString_Type and Py_TPFLAGS_BASETYPE is set in tp_flags. */
9012 PyTypeObject PyUnicode_Type
= {
9013 PyVarObject_HEAD_INIT(&PyType_Type
, 0)
9014 "unicode", /* tp_name */
9015 sizeof(PyUnicodeObject
), /* tp_size */
9016 0, /* tp_itemsize */
9018 (destructor
)unicode_dealloc
, /* tp_dealloc */
9023 unicode_repr
, /* tp_repr */
9024 &unicode_as_number
, /* tp_as_number */
9025 &unicode_as_sequence
, /* tp_as_sequence */
9026 &unicode_as_mapping
, /* tp_as_mapping */
9027 (hashfunc
) unicode_hash
, /* tp_hash*/
9029 (reprfunc
) unicode_str
, /* tp_str */
9030 PyObject_GenericGetAttr
, /* tp_getattro */
9031 0, /* tp_setattro */
9032 &unicode_as_buffer
, /* tp_as_buffer */
9033 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_CHECKTYPES
|
9034 Py_TPFLAGS_BASETYPE
| Py_TPFLAGS_UNICODE_SUBCLASS
, /* tp_flags */
9035 unicode_doc
, /* tp_doc */
9036 0, /* tp_traverse */
9038 PyUnicode_RichCompare
, /* tp_richcompare */
9039 0, /* tp_weaklistoffset */
9041 0, /* tp_iternext */
9042 unicode_methods
, /* tp_methods */
9045 &PyBaseString_Type
, /* tp_base */
9047 0, /* tp_descr_get */
9048 0, /* tp_descr_set */
9049 0, /* tp_dictoffset */
9052 unicode_new
, /* tp_new */
9053 PyObject_Del
, /* tp_free */
9056 /* Initialize the Unicode implementation */
9058 void _PyUnicode_Init(void)
9062 /* XXX - move this array to unicodectype.c ? */
9063 Py_UNICODE linebreak
[] = {
9064 0x000A, /* LINE FEED */
9065 0x000D, /* CARRIAGE RETURN */
9066 0x001C, /* FILE SEPARATOR */
9067 0x001D, /* GROUP SEPARATOR */
9068 0x001E, /* RECORD SEPARATOR */
9069 0x0085, /* NEXT LINE */
9070 0x2028, /* LINE SEPARATOR */
9071 0x2029, /* PARAGRAPH SEPARATOR */
9074 /* Init the implementation */
9077 unicode_empty
= _PyUnicode_New(0);
9081 strcpy(unicode_default_encoding
, "ascii");
9082 for (i
= 0; i
< 256; i
++)
9083 unicode_latin1
[i
] = NULL
;
9084 if (PyType_Ready(&PyUnicode_Type
) < 0)
9085 Py_FatalError("Can't initialize 'unicode'");
9087 /* initialize the linebreak bloom filter */
9088 bloom_linebreak
= make_bloom_mask(
9089 linebreak
, sizeof(linebreak
) / sizeof(linebreak
[0])
9092 PyType_Ready(&EncodingMapType
);
9095 /* Finalize the Unicode implementation */
9098 PyUnicode_ClearFreeList(void)
9100 int freelist_size
= numfree
;
9103 for (u
= free_list
; u
!= NULL
;) {
9104 PyUnicodeObject
*v
= u
;
9105 u
= *(PyUnicodeObject
**)u
;
9107 PyObject_DEL(v
->str
);
9108 Py_XDECREF(v
->defenc
);
9113 assert(numfree
== 0);
9114 return freelist_size
;
9118 _PyUnicode_Fini(void)
9122 Py_XDECREF(unicode_empty
);
9123 unicode_empty
= NULL
;
9125 for (i
= 0; i
< 256; i
++) {
9126 if (unicode_latin1
[i
]) {
9127 Py_DECREF(unicode_latin1
[i
]);
9128 unicode_latin1
[i
] = NULL
;
9131 (void)PyUnicode_ClearFreeList();
9142 indent-tabs-mode: nil