3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
45 #include "unicodeobject.h"
52 /* Limit for the Unicode object free list */
54 #define PyUnicode_MAXFREELIST 1024
56 /* Limit for the Unicode object free list stay alive optimization.
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
62 At worst this will result in PyUnicode_MAXFREELIST *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
66 Setting the limit to 0 effectively turns the feature off.
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
80 # define BYTEORDER_IS_LITTLE_ENDIAN
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
95 /* Free list for Unicode objects */
96 static PyUnicodeObject
*free_list
;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject
*unicode_empty
;
102 /* Single character Unicode strings in the Latin-1 range are being
104 static PyUnicodeObject
*unicode_latin1
[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding
[100];
115 /* Fast detection of the most frequent whitespace characters */
116 const unsigned char _Py_ascii_whitespace
[] = {
117 0, 0, 0, 0, 0, 0, 0, 0,
118 /* case 0x0009: * HORIZONTAL TABULATION */
119 /* case 0x000A: * LINE FEED */
120 /* case 0x000B: * VERTICAL TABULATION */
121 /* case 0x000C: * FORM FEED */
122 /* case 0x000D: * CARRIAGE RETURN */
123 0, 1, 1, 1, 1, 1, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0,
125 /* case 0x001C: * FILE SEPARATOR */
126 /* case 0x001D: * GROUP SEPARATOR */
127 /* case 0x001E: * RECORD SEPARATOR */
128 /* case 0x001F: * UNIT SEPARATOR */
129 0, 0, 0, 0, 1, 1, 1, 1,
130 /* case 0x0020: * SPACE */
131 1, 0, 0, 0, 0, 0, 0, 0,
132 0, 0, 0, 0, 0, 0, 0, 0,
133 0, 0, 0, 0, 0, 0, 0, 0,
134 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, 0, 0, 0, 0,
137 0, 0, 0, 0, 0, 0, 0, 0,
138 0, 0, 0, 0, 0, 0, 0, 0,
139 0, 0, 0, 0, 0, 0, 0, 0,
140 0, 0, 0, 0, 0, 0, 0, 0,
141 0, 0, 0, 0, 0, 0, 0, 0,
142 0, 0, 0, 0, 0, 0, 0, 0,
143 0, 0, 0, 0, 0, 0, 0, 0
146 /* Same for linebreaks */
147 static unsigned char ascii_linebreak
[] = {
148 0, 0, 0, 0, 0, 0, 0, 0,
149 /* 0x000A, * LINE FEED */
150 /* 0x000D, * CARRIAGE RETURN */
151 0, 0, 1, 0, 0, 1, 0, 0,
152 0, 0, 0, 0, 0, 0, 0, 0,
153 /* 0x001C, * FILE SEPARATOR */
154 /* 0x001D, * GROUP SEPARATOR */
155 /* 0x001E, * RECORD SEPARATOR */
156 0, 0, 0, 0, 1, 1, 1, 0,
157 0, 0, 0, 0, 0, 0, 0, 0,
158 0, 0, 0, 0, 0, 0, 0, 0,
159 0, 0, 0, 0, 0, 0, 0, 0,
160 0, 0, 0, 0, 0, 0, 0, 0,
162 0, 0, 0, 0, 0, 0, 0, 0,
163 0, 0, 0, 0, 0, 0, 0, 0,
164 0, 0, 0, 0, 0, 0, 0, 0,
165 0, 0, 0, 0, 0, 0, 0, 0,
166 0, 0, 0, 0, 0, 0, 0, 0,
167 0, 0, 0, 0, 0, 0, 0, 0,
168 0, 0, 0, 0, 0, 0, 0, 0,
169 0, 0, 0, 0, 0, 0, 0, 0
174 PyUnicode_GetMax(void)
176 #ifdef Py_UNICODE_WIDE
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
185 /* --- Bloom Filters ----------------------------------------------------- */
187 /* stuff to implement simple "bloom filters" for Unicode characters.
188 to keep things simple, we use a single bitmask, using the least 5
189 bits from each unicode characters as the bit index. */
191 /* the linebreak mask is set up by Unicode_Init below */
194 #define BLOOM_WIDTH 128
196 #define BLOOM_WIDTH 64
198 #define BLOOM_WIDTH 32
200 #error "LONG_BIT is smaller than 32"
203 #define BLOOM_MASK unsigned long
205 static BLOOM_MASK bloom_linebreak
;
207 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
208 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
210 #define BLOOM_LINEBREAK(ch) \
211 ((ch) < 128U ? ascii_linebreak[(ch)] : \
212 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
214 Py_LOCAL_INLINE(BLOOM_MASK
) make_bloom_mask(Py_UNICODE
* ptr
, Py_ssize_t len
)
216 /* calculate simple bloom-style bitmask for a given unicode string */
222 for (i
= 0; i
< len
; i
++)
223 BLOOM_ADD(mask
, ptr
[i
]);
228 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr
, Py_UNICODE
* set
, Py_ssize_t setlen
)
232 for (i
= 0; i
< setlen
; i
++)
239 #define BLOOM_MEMBER(mask, chr, set, setlen) \
240 BLOOM(mask, chr) && unicode_member(chr, set, setlen)
242 /* --- Unicode Object ----------------------------------------------------- */
245 int unicode_resize(register PyUnicodeObject
*unicode
,
250 /* Shortcut if there's nothing much to do. */
251 if (unicode
->length
== length
)
254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
258 if (unicode
== unicode_empty
||
259 (unicode
->length
== 1 &&
260 unicode
->str
[0] < 256U &&
261 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
262 PyErr_SetString(PyExc_SystemError
,
263 "can't resize shared unicode objects");
267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
269 safe to look at str[length] (without making any assumptions about what
272 oldstr
= unicode
->str
;
273 unicode
->str
= PyObject_REALLOC(unicode
->str
,
274 sizeof(Py_UNICODE
) * (length
+ 1));
276 unicode
->str
= (Py_UNICODE
*)oldstr
;
280 unicode
->str
[length
] = 0;
281 unicode
->length
= length
;
284 /* Reset the object caches */
285 if (unicode
->defenc
) {
286 Py_DECREF(unicode
->defenc
);
287 unicode
->defenc
= NULL
;
294 /* We allocate one more byte to make sure the string is
295 Ux0000 terminated -- XXX is this needed ?
297 XXX This allocator could further be enhanced by assuring that the
298 free list never reduces its size below 1.
303 PyUnicodeObject
*_PyUnicode_New(Py_ssize_t length
)
305 register PyUnicodeObject
*unicode
;
307 /* Optimization for empty strings */
308 if (length
== 0 && unicode_empty
!= NULL
) {
309 Py_INCREF(unicode_empty
);
310 return unicode_empty
;
313 /* Ensure we won't overflow the size. */
314 if (length
> ((PY_SSIZE_T_MAX
/ sizeof(Py_UNICODE
)) - 1)) {
315 return (PyUnicodeObject
*)PyErr_NoMemory();
318 /* Unicode freelist & memory allocation */
321 free_list
= *(PyUnicodeObject
**)unicode
;
324 /* Keep-Alive optimization: we only upsize the buffer,
325 never downsize it. */
326 if ((unicode
->length
< length
) &&
327 unicode_resize(unicode
, length
) < 0) {
328 PyObject_DEL(unicode
->str
);
333 size_t new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
334 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
336 PyObject_INIT(unicode
, &PyUnicode_Type
);
340 unicode
= PyObject_New(PyUnicodeObject
, &PyUnicode_Type
);
343 new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
344 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
351 /* Initialize the first element to guard against cases where
352 * the caller fails before initializing str -- unicode_resize()
353 * reads str[0], and the Keep-Alive optimization can keep memory
354 * allocated for str alive across a call to unicode_dealloc(unicode).
355 * We don't want unicode_resize to read uninitialized memory in
359 unicode
->str
[length
] = 0;
360 unicode
->length
= length
;
362 unicode
->defenc
= NULL
;
366 /* XXX UNREF/NEWREF interface should be more symmetrical */
368 _Py_ForgetReference((PyObject
*)unicode
);
369 PyObject_Del(unicode
);
374 void unicode_dealloc(register PyUnicodeObject
*unicode
)
376 if (PyUnicode_CheckExact(unicode
) &&
377 numfree
< PyUnicode_MAXFREELIST
) {
378 /* Keep-Alive optimization */
379 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
380 PyObject_DEL(unicode
->str
);
384 if (unicode
->defenc
) {
385 Py_DECREF(unicode
->defenc
);
386 unicode
->defenc
= NULL
;
388 /* Add to free list */
389 *(PyUnicodeObject
**)unicode
= free_list
;
394 PyObject_DEL(unicode
->str
);
395 Py_XDECREF(unicode
->defenc
);
396 Py_TYPE(unicode
)->tp_free((PyObject
*)unicode
);
401 int _PyUnicode_Resize(PyUnicodeObject
**unicode
, Py_ssize_t length
)
403 register PyUnicodeObject
*v
;
405 /* Argument checks */
406 if (unicode
== NULL
) {
407 PyErr_BadInternalCall();
411 if (v
== NULL
|| !PyUnicode_Check(v
) || Py_REFCNT(v
) != 1 || length
< 0) {
412 PyErr_BadInternalCall();
416 /* Resizing unicode_empty and single character objects is not
417 possible since these are being shared. We simply return a fresh
418 copy with the same Unicode content. */
419 if (v
->length
!= length
&&
420 (v
== unicode_empty
|| v
->length
== 1)) {
421 PyUnicodeObject
*w
= _PyUnicode_New(length
);
424 Py_UNICODE_COPY(w
->str
, v
->str
,
425 length
< v
->length
? length
: v
->length
);
431 /* Note that we don't have to modify *unicode for unshared Unicode
432 objects, since we can modify them in-place. */
433 return unicode_resize(v
, length
);
436 int PyUnicode_Resize(PyObject
**unicode
, Py_ssize_t length
)
438 return _PyUnicode_Resize((PyUnicodeObject
**)unicode
, length
);
441 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
444 PyUnicodeObject
*unicode
;
446 /* If the Unicode data is known at construction time, we can apply
447 some optimizations which share commonly used objects. */
450 /* Optimization for empty strings */
451 if (size
== 0 && unicode_empty
!= NULL
) {
452 Py_INCREF(unicode_empty
);
453 return (PyObject
*)unicode_empty
;
456 /* Single character Unicode objects in the Latin-1 range are
457 shared when using this constructor */
458 if (size
== 1 && *u
< 256) {
459 unicode
= unicode_latin1
[*u
];
461 unicode
= _PyUnicode_New(1);
464 unicode
->str
[0] = *u
;
465 unicode_latin1
[*u
] = unicode
;
468 return (PyObject
*)unicode
;
472 unicode
= _PyUnicode_New(size
);
476 /* Copy the Unicode data into the new object */
478 Py_UNICODE_COPY(unicode
->str
, u
, size
);
480 return (PyObject
*)unicode
;
483 PyObject
*PyUnicode_FromStringAndSize(const char *u
, Py_ssize_t size
)
485 PyUnicodeObject
*unicode
;
488 PyErr_SetString(PyExc_SystemError
,
489 "Negative size passed to PyUnicode_FromStringAndSize");
493 /* If the Unicode data is known at construction time, we can apply
494 some optimizations which share commonly used objects.
495 Also, this means the input must be UTF-8, so fall back to the
496 UTF-8 decoder at the end. */
499 /* Optimization for empty strings */
500 if (size
== 0 && unicode_empty
!= NULL
) {
501 Py_INCREF(unicode_empty
);
502 return (PyObject
*)unicode_empty
;
505 /* Single characters are shared when using this constructor.
506 Restrict to ASCII, since the input must be UTF-8. */
507 if (size
== 1 && Py_CHARMASK(*u
) < 128) {
508 unicode
= unicode_latin1
[Py_CHARMASK(*u
)];
510 unicode
= _PyUnicode_New(1);
513 unicode
->str
[0] = Py_CHARMASK(*u
);
514 unicode_latin1
[Py_CHARMASK(*u
)] = unicode
;
517 return (PyObject
*)unicode
;
520 return PyUnicode_DecodeUTF8(u
, size
, NULL
);
523 unicode
= _PyUnicode_New(size
);
527 return (PyObject
*)unicode
;
530 PyObject
*PyUnicode_FromString(const char *u
)
532 size_t size
= strlen(u
);
533 if (size
> PY_SSIZE_T_MAX
) {
534 PyErr_SetString(PyExc_OverflowError
, "input too long");
538 return PyUnicode_FromStringAndSize(u
, size
);
543 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
544 # define CONVERT_WCHAR_TO_SURROGATES
547 #ifdef CONVERT_WCHAR_TO_SURROGATES
549 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
550 to convert from UTF32 to UTF16. */
552 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
555 PyUnicodeObject
*unicode
;
556 register Py_ssize_t i
;
558 const wchar_t *orig_w
;
561 PyErr_BadInternalCall();
567 for (i
= size
; i
> 0; i
--) {
573 unicode
= _PyUnicode_New(alloc
);
577 /* Copy the wchar_t data into the new object */
579 register Py_UNICODE
*u
;
580 u
= PyUnicode_AS_UNICODE(unicode
);
581 for (i
= size
; i
> 0; i
--) {
583 wchar_t ordinal
= *w
++;
585 *u
++ = 0xD800 | (ordinal
>> 10);
586 *u
++ = 0xDC00 | (ordinal
& 0x3FF);
592 return (PyObject
*)unicode
;
597 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
600 PyUnicodeObject
*unicode
;
603 PyErr_BadInternalCall();
607 unicode
= _PyUnicode_New(size
);
611 /* Copy the wchar_t data into the new object */
612 #ifdef HAVE_USABLE_WCHAR_T
613 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
616 register Py_UNICODE
*u
;
617 register Py_ssize_t i
;
618 u
= PyUnicode_AS_UNICODE(unicode
);
619 for (i
= size
; i
> 0; i
--)
624 return (PyObject
*)unicode
;
627 #endif /* CONVERT_WCHAR_TO_SURROGATES */
629 #undef CONVERT_WCHAR_TO_SURROGATES
632 makefmt(char *fmt
, int longflag
, int size_tflag
, int zeropad
, int width
, int precision
, char c
)
638 fmt
+= sprintf(fmt
, "%d", width
);
641 fmt
+= sprintf(fmt
, ".%d", precision
);
644 else if (size_tflag
) {
645 char *f
= PY_FORMAT_SIZE_T
;
653 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
656 PyUnicode_FromFormatV(const char *format
, va_list vargs
)
659 Py_ssize_t callcount
= 0;
660 PyObject
**callresults
= NULL
;
661 PyObject
**callresult
= NULL
;
669 /* used by sprintf */
671 /* use abuffer instead of buffer, if we need more space
672 * (which can happen if there's a format specifier with width). */
673 char *abuffer
= NULL
;
675 Py_ssize_t abuffersize
= 0;
676 char fmt
[60]; /* should be enough for %0width.precisionld */
679 #ifdef VA_LIST_IS_ARRAY
680 Py_MEMCPY(count
, vargs
, sizeof(va_list));
683 __va_copy(count
, vargs
);
688 /* step 1: count the number of %S/%R/%s format specifications
689 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
690 * objects once during step 3 and put the result in an array) */
691 for (f
= format
; *f
; f
++) {
695 if (*(f
+1)=='S' || *(f
+1)=='R')
697 while (isdigit((unsigned)*f
))
698 width
= (width
*10) + *f
++ - '0';
699 while (*++f
&& *f
!= '%' && !isalpha((unsigned)*f
))
705 /* step 2: allocate memory for the results of
706 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
708 callresults
= PyObject_Malloc(sizeof(PyObject
*)*callcount
);
713 callresult
= callresults
;
715 /* step 3: figure out how large a buffer we need */
716 for (f
= format
; *f
; f
++) {
720 while (isdigit((unsigned)*f
))
721 width
= (width
*10) + *f
++ - '0';
722 while (*++f
&& *f
!= '%' && !isalpha((unsigned)*f
))
725 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
726 * they don't affect the amount of space we reserve.
728 if ((*f
== 'l' || *f
== 'z') &&
729 (f
[1] == 'd' || f
[1] == 'u'))
734 (void)va_arg(count
, int);
735 /* fall through... */
739 case 'd': case 'u': case 'i': case 'x':
740 (void) va_arg(count
, int);
741 /* 20 bytes is enough to hold a 64-bit
742 integer. Decimal takes the most space.
743 This isn't enough for octal.
744 If a width is specified we need more
745 (which we allocate later). */
749 if (abuffersize
< width
)
755 const char *s
= va_arg(count
, const char*);
756 PyObject
*str
= PyUnicode_DecodeUTF8(s
, strlen(s
), "replace");
759 n
+= PyUnicode_GET_SIZE(str
);
760 /* Remember the str and switch to the next slot */
766 PyObject
*obj
= va_arg(count
, PyObject
*);
767 assert(obj
&& PyUnicode_Check(obj
));
768 n
+= PyUnicode_GET_SIZE(obj
);
773 PyObject
*obj
= va_arg(count
, PyObject
*);
774 const char *str
= va_arg(count
, const char *);
776 assert(!obj
|| PyUnicode_Check(obj
));
778 n
+= PyUnicode_GET_SIZE(obj
);
785 PyObject
*obj
= va_arg(count
, PyObject
*);
788 str
= PyObject_Str(obj
);
791 n
+= PyUnicode_GET_SIZE(str
);
792 /* Remember the str and switch to the next slot */
798 PyObject
*obj
= va_arg(count
, PyObject
*);
801 repr
= PyObject_Repr(obj
);
804 n
+= PyUnicode_GET_SIZE(repr
);
805 /* Remember the repr and switch to the next slot */
806 *callresult
++ = repr
;
810 (void) va_arg(count
, int);
811 /* maximum 64-bit pointer representation:
813 * so 19 characters is enough.
814 * XXX I count 18 -- what's the extra for?
819 /* if we stumble upon an unknown
820 formatting code, copy the rest of
821 the format string to the output
822 string. (we cannot just skip the
823 code, since there's no way to know
824 what's in the argument list) */
832 if (abuffersize
> 20) {
833 abuffer
= PyObject_Malloc(abuffersize
);
838 realbuffer
= abuffer
;
842 /* step 4: fill the buffer */
843 /* Since we've analyzed how much space we need for the worst case,
844 we don't have to resize the string.
845 There can be no errors beyond this point. */
846 string
= PyUnicode_FromUnicode(NULL
, n
);
850 s
= PyUnicode_AS_UNICODE(string
);
851 callresult
= callresults
;
853 for (f
= format
; *f
; f
++) {
858 zeropad
= (*f
== '0');
859 /* parse the width.precision part */
861 while (isdigit((unsigned)*f
))
862 width
= (width
*10) + *f
++ - '0';
866 while (isdigit((unsigned)*f
))
867 precision
= (precision
*10) + *f
++ - '0';
869 /* handle the long flag, but only for %ld and %lu.
870 others can be added when necessary. */
871 if (*f
== 'l' && (f
[1] == 'd' || f
[1] == 'u')) {
875 /* handle the size_t flag. */
876 if (*f
== 'z' && (f
[1] == 'd' || f
[1] == 'u')) {
883 *s
++ = va_arg(vargs
, int);
886 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'd');
888 sprintf(realbuffer
, fmt
, va_arg(vargs
, long));
890 sprintf(realbuffer
, fmt
, va_arg(vargs
, Py_ssize_t
));
892 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
893 appendstring(realbuffer
);
896 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'u');
898 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned long));
900 sprintf(realbuffer
, fmt
, va_arg(vargs
, size_t));
902 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned int));
903 appendstring(realbuffer
);
906 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'i');
907 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
908 appendstring(realbuffer
);
911 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'x');
912 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
913 appendstring(realbuffer
);
917 /* unused, since we already have the result */
918 (void) va_arg(vargs
, char *);
919 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(*callresult
),
920 PyUnicode_GET_SIZE(*callresult
));
921 s
+= PyUnicode_GET_SIZE(*callresult
);
922 /* We're done with the unicode()/repr() => forget it */
923 Py_DECREF(*callresult
);
924 /* switch to next unicode()/repr() result */
930 PyObject
*obj
= va_arg(vargs
, PyObject
*);
931 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
932 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
938 PyObject
*obj
= va_arg(vargs
, PyObject
*);
939 const char *str
= va_arg(vargs
, const char *);
941 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
942 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
955 /* unused, since we already have the result */
956 (void) va_arg(vargs
, PyObject
*);
957 ucopy
= PyUnicode_AS_UNICODE(*callresult
);
958 usize
= PyUnicode_GET_SIZE(*callresult
);
959 for (upos
= 0; upos
<usize
;)
960 *s
++ = ucopy
[upos
++];
961 /* We're done with the unicode()/repr() => forget it */
962 Py_DECREF(*callresult
);
963 /* switch to next unicode()/repr() result */
968 sprintf(buffer
, "%p", va_arg(vargs
, void*));
969 /* %p is ill-defined: ensure leading 0x. */
970 if (buffer
[1] == 'X')
972 else if (buffer
[1] != 'x') {
973 memmove(buffer
+2, buffer
, strlen(buffer
)+1);
977 appendstring(buffer
);
992 PyObject_Free(callresults
);
994 PyObject_Free(abuffer
);
995 PyUnicode_Resize(&string
, s
- PyUnicode_AS_UNICODE(string
));
999 PyObject
**callresult2
= callresults
;
1000 while (callresult2
< callresult
) {
1001 Py_DECREF(*callresult2
);
1004 PyObject_Free(callresults
);
1007 PyObject_Free(abuffer
);
1014 PyUnicode_FromFormat(const char *format
, ...)
1019 #ifdef HAVE_STDARG_PROTOTYPES
1020 va_start(vargs
, format
);
1024 ret
= PyUnicode_FromFormatV(format
, vargs
);
1029 Py_ssize_t
PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
1033 if (unicode
== NULL
) {
1034 PyErr_BadInternalCall();
1038 /* If possible, try to copy the 0-termination as well */
1039 if (size
> PyUnicode_GET_SIZE(unicode
))
1040 size
= PyUnicode_GET_SIZE(unicode
) + 1;
1042 #ifdef HAVE_USABLE_WCHAR_T
1043 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
1046 register Py_UNICODE
*u
;
1047 register Py_ssize_t i
;
1048 u
= PyUnicode_AS_UNICODE(unicode
);
1049 for (i
= size
; i
> 0; i
--)
1054 if (size
> PyUnicode_GET_SIZE(unicode
))
1055 return PyUnicode_GET_SIZE(unicode
);
1062 PyObject
*PyUnicode_FromOrdinal(int ordinal
)
1066 #ifdef Py_UNICODE_WIDE
1067 if (ordinal
< 0 || ordinal
> 0x10ffff) {
1068 PyErr_SetString(PyExc_ValueError
,
1069 "unichr() arg not in range(0x110000) "
1070 "(wide Python build)");
1074 if (ordinal
< 0 || ordinal
> 0xffff) {
1075 PyErr_SetString(PyExc_ValueError
,
1076 "unichr() arg not in range(0x10000) "
1077 "(narrow Python build)");
1082 s
[0] = (Py_UNICODE
)ordinal
;
1083 return PyUnicode_FromUnicode(s
, 1);
1086 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
1088 /* XXX Perhaps we should make this API an alias of
1089 PyObject_Unicode() instead ?! */
1090 if (PyUnicode_CheckExact(obj
)) {
1094 if (PyUnicode_Check(obj
)) {
1095 /* For a Unicode subtype that's not a Unicode object,
1096 return a true Unicode object with the same data. */
1097 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
1098 PyUnicode_GET_SIZE(obj
));
1100 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
1103 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
1104 const char *encoding
,
1107 const char *s
= NULL
;
1112 PyErr_BadInternalCall();
1117 /* For b/w compatibility we also accept Unicode objects provided
1118 that no encodings is given and then redirect to
1119 PyObject_Unicode() which then applies the additional logic for
1122 NOTE: This API should really only be used for object which
1123 represent *encoded* Unicode !
1126 if (PyUnicode_Check(obj
)) {
1128 PyErr_SetString(PyExc_TypeError
,
1129 "decoding Unicode is not supported");
1132 return PyObject_Unicode(obj
);
1135 if (PyUnicode_Check(obj
)) {
1136 PyErr_SetString(PyExc_TypeError
,
1137 "decoding Unicode is not supported");
1143 if (PyString_Check(obj
)) {
1144 s
= PyString_AS_STRING(obj
);
1145 len
= PyString_GET_SIZE(obj
);
1147 else if (PyByteArray_Check(obj
)) {
1148 /* Python 2.x specific */
1149 PyErr_Format(PyExc_TypeError
,
1150 "decoding bytearray is not supported");
1153 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
1154 /* Overwrite the error message with something more useful in
1155 case of a TypeError. */
1156 if (PyErr_ExceptionMatches(PyExc_TypeError
))
1157 PyErr_Format(PyExc_TypeError
,
1158 "coercing to Unicode: need string or buffer, "
1160 Py_TYPE(obj
)->tp_name
);
1164 /* Convert to Unicode */
1166 Py_INCREF(unicode_empty
);
1167 v
= (PyObject
*)unicode_empty
;
1170 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
1178 PyObject
*PyUnicode_Decode(const char *s
,
1180 const char *encoding
,
1183 PyObject
*buffer
= NULL
, *unicode
;
1185 if (encoding
== NULL
)
1186 encoding
= PyUnicode_GetDefaultEncoding();
1188 /* Shortcuts for common default encodings */
1189 if (strcmp(encoding
, "utf-8") == 0)
1190 return PyUnicode_DecodeUTF8(s
, size
, errors
);
1191 else if (strcmp(encoding
, "latin-1") == 0)
1192 return PyUnicode_DecodeLatin1(s
, size
, errors
);
1193 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194 else if (strcmp(encoding
, "mbcs") == 0)
1195 return PyUnicode_DecodeMBCS(s
, size
, errors
);
1197 else if (strcmp(encoding
, "ascii") == 0)
1198 return PyUnicode_DecodeASCII(s
, size
, errors
);
1200 /* Decode via the codec registry */
1201 buffer
= PyBuffer_FromMemory((void *)s
, size
);
1204 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
1205 if (unicode
== NULL
)
1207 if (!PyUnicode_Check(unicode
)) {
1208 PyErr_Format(PyExc_TypeError
,
1209 "decoder did not return an unicode object (type=%.400s)",
1210 Py_TYPE(unicode
)->tp_name
);
1222 PyObject
*PyUnicode_AsDecodedObject(PyObject
*unicode
,
1223 const char *encoding
,
1228 if (!PyUnicode_Check(unicode
)) {
1229 PyErr_BadArgument();
1233 if (encoding
== NULL
)
1234 encoding
= PyUnicode_GetDefaultEncoding();
1236 /* Decode via the codec registry */
1237 v
= PyCodec_Decode(unicode
, encoding
, errors
);
1246 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
1248 const char *encoding
,
1251 PyObject
*v
, *unicode
;
1253 unicode
= PyUnicode_FromUnicode(s
, size
);
1254 if (unicode
== NULL
)
1256 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
1261 PyObject
*PyUnicode_AsEncodedObject(PyObject
*unicode
,
1262 const char *encoding
,
1267 if (!PyUnicode_Check(unicode
)) {
1268 PyErr_BadArgument();
1272 if (encoding
== NULL
)
1273 encoding
= PyUnicode_GetDefaultEncoding();
1275 /* Encode via the codec registry */
1276 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1285 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
1286 const char *encoding
,
1291 if (!PyUnicode_Check(unicode
)) {
1292 PyErr_BadArgument();
1296 if (encoding
== NULL
)
1297 encoding
= PyUnicode_GetDefaultEncoding();
1299 /* Shortcuts for common default encodings */
1300 if (errors
== NULL
) {
1301 if (strcmp(encoding
, "utf-8") == 0)
1302 return PyUnicode_AsUTF8String(unicode
);
1303 else if (strcmp(encoding
, "latin-1") == 0)
1304 return PyUnicode_AsLatin1String(unicode
);
1305 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1306 else if (strcmp(encoding
, "mbcs") == 0)
1307 return PyUnicode_AsMBCSString(unicode
);
1309 else if (strcmp(encoding
, "ascii") == 0)
1310 return PyUnicode_AsASCIIString(unicode
);
1313 /* Encode via the codec registry */
1314 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1317 if (!PyString_Check(v
)) {
1318 PyErr_Format(PyExc_TypeError
,
1319 "encoder did not return a string object (type=%.400s)",
1320 Py_TYPE(v
)->tp_name
);
1330 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
1333 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
1337 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
1338 if (v
&& errors
== NULL
)
1339 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
1343 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
1345 if (!PyUnicode_Check(unicode
)) {
1346 PyErr_BadArgument();
1349 return PyUnicode_AS_UNICODE(unicode
);
1355 Py_ssize_t
PyUnicode_GetSize(PyObject
*unicode
)
1357 if (!PyUnicode_Check(unicode
)) {
1358 PyErr_BadArgument();
1361 return PyUnicode_GET_SIZE(unicode
);
1367 const char *PyUnicode_GetDefaultEncoding(void)
1369 return unicode_default_encoding
;
1372 int PyUnicode_SetDefaultEncoding(const char *encoding
)
1376 /* Make sure the encoding is valid. As side effect, this also
1377 loads the encoding into the codec registry cache. */
1378 v
= _PyCodec_Lookup(encoding
);
1382 strncpy(unicode_default_encoding
,
1384 sizeof(unicode_default_encoding
));
1391 /* error handling callback helper:
1392 build arguments, call the callback and check the arguments,
1393 if no exception occurred, copy the replacement to the output
1394 and adjust various state variables.
1395 return 0 on success, -1 on error
1399 int unicode_decode_call_errorhandler(const char *errors
, PyObject
**errorHandler
,
1400 const char *encoding
, const char *reason
,
1401 const char *input
, Py_ssize_t insize
, Py_ssize_t
*startinpos
,
1402 Py_ssize_t
*endinpos
, PyObject
**exceptionObject
, const char **inptr
,
1403 PyUnicodeObject
**output
, Py_ssize_t
*outpos
, Py_UNICODE
**outptr
)
1405 static char *argparse
= "O!n;decoding error handler must return (unicode, int) tuple";
1407 PyObject
*restuple
= NULL
;
1408 PyObject
*repunicode
= NULL
;
1409 Py_ssize_t outsize
= PyUnicode_GET_SIZE(*output
);
1410 Py_ssize_t requiredsize
;
1416 if (*errorHandler
== NULL
) {
1417 *errorHandler
= PyCodec_LookupError(errors
);
1418 if (*errorHandler
== NULL
)
1422 if (*exceptionObject
== NULL
) {
1423 *exceptionObject
= PyUnicodeDecodeError_Create(
1424 encoding
, input
, insize
, *startinpos
, *endinpos
, reason
);
1425 if (*exceptionObject
== NULL
)
1429 if (PyUnicodeDecodeError_SetStart(*exceptionObject
, *startinpos
))
1431 if (PyUnicodeDecodeError_SetEnd(*exceptionObject
, *endinpos
))
1433 if (PyUnicodeDecodeError_SetReason(*exceptionObject
, reason
))
1437 restuple
= PyObject_CallFunctionObjArgs(*errorHandler
, *exceptionObject
, NULL
);
1438 if (restuple
== NULL
)
1440 if (!PyTuple_Check(restuple
)) {
1441 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
1444 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
, &repunicode
, &newpos
))
1447 newpos
= insize
+newpos
;
1448 if (newpos
<0 || newpos
>insize
) {
1449 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", newpos
);
1453 /* need more space? (at least enough for what we
1454 have+the replacement+the rest of the string (starting
1455 at the new input position), so we won't have to check space
1456 when there are no errors in the rest of the string) */
1457 repptr
= PyUnicode_AS_UNICODE(repunicode
);
1458 repsize
= PyUnicode_GET_SIZE(repunicode
);
1459 requiredsize
= *outpos
+ repsize
+ insize
-newpos
;
1460 if (requiredsize
> outsize
) {
1461 if (requiredsize
<2*outsize
)
1462 requiredsize
= 2*outsize
;
1463 if (_PyUnicode_Resize(output
, requiredsize
) < 0)
1465 *outptr
= PyUnicode_AS_UNICODE(*output
) + *outpos
;
1468 *inptr
= input
+ newpos
;
1469 Py_UNICODE_COPY(*outptr
, repptr
, repsize
);
1476 Py_XDECREF(restuple
);
1480 /* --- UTF-7 Codec -------------------------------------------------------- */
1482 /* See RFC2152 for details. We encode conservatively and decode liberally. */
1484 /* Three simple macros defining base-64. */
/* Is c a base-64 character?
 *
 * The test is spelled out as explicit ASCII range checks instead of
 * calling isalnum(): this macro is applied to Py_UNICODE values, and
 * isalnum() is only defined for arguments representable as unsigned
 * char (or EOF); its answer for bytes >= 0x80 also depends on the
 * current locale.  The range checks make the same ASCII-layout
 * assumption that FROM_BASE64 below already relies on.
 * Note: c is evaluated more than once. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Given that c is a base-64 character, what is its base-64 value?
 * 'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62;
 * anything else (i.e. '/') yields 63.  c is evaluated more than once. */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
 * Higher bits of n are masked off before indexing the alphabet. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
1504 /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
1505 * decoded as itself. We are permissive on decoding; the only ASCII
1506 * byte not decoding to itself is the + which begins a base64
1509 #define DECODE_DIRECT(c) \
1510 ((c) <= 127 && (c) != '+')
1512 /* The UTF-7 encoder treats ASCII characters differently according to
1513 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1514 * the above). See RFC2152. This array identifies these different
1517 * alphanumeric and '(),-./:?
1519 * !"#$%&*;<=>@[]^_`{|}
1522 * 3 : special (must be base64 encoded)
1523 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1527 char utf7_category
[128] = {
1528 /* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
1529 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
1530 /* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
1531 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1532 /* sp ! " # $ % & ' ( ) * + , - . / */
1533 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
1534 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
1535 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
1536 /* @ A B C D E F G H I J K L M N O */
1537 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1538 /* P Q R S T U V W X Y Z [ \ ] ^ _ */
1539 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
1540 /* ` a b c d e f g h i j k l m n o */
1541 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1542 /* p q r s t u v w x y z { | } ~ del */
1543 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - character code to classify; only 1..127 can ever be
 *              direct, everything else yields false
 *   directO  - non-zero if utf7_category 1 characters are written as-is
 *   directWS - non-zero if utf7_category 2 characters are written as-is
 *
 * utf7_category 0 characters are always direct.  Every parameter is
 * parenthesized in the expansion so that compound arguments such as
 * `a || b` keep their meaning; c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
1558 PyObject
*PyUnicode_DecodeUTF7(const char *s
,
1562 return PyUnicode_DecodeUTF7Stateful(s
, size
, errors
, NULL
);
1565 /* The decoder. The only state we preserve is our read position,
1566 * i.e. how many characters we have consumed. So if we end in the
1567 * middle of a shift sequence we have to back off the read position
1568 * and the output to the beginning of the sequence, otherwise we lose
1569 * all the shift state (seen bits, number of bits seen, high
1572 PyObject
*PyUnicode_DecodeUTF7Stateful(const char *s
,
1575 Py_ssize_t
*consumed
)
1577 const char *starts
= s
;
1578 Py_ssize_t startinpos
;
1579 Py_ssize_t endinpos
;
1582 PyUnicodeObject
*unicode
;
1584 const char *errmsg
= "";
1586 Py_UNICODE
*shiftOutStart
;
1587 unsigned int base64bits
= 0;
1588 unsigned long base64buffer
= 0;
1589 Py_UNICODE surrogate
= 0;
1590 PyObject
*errorHandler
= NULL
;
1591 PyObject
*exc
= NULL
;
1593 unicode
= _PyUnicode_New(size
);
1599 return (PyObject
*)unicode
;
1607 Py_UNICODE ch
= (unsigned char) *s
;
1609 if (inShift
) { /* in a base-64 section */
1610 if (IS_BASE64(ch
)) { /* consume a base-64 character */
1611 base64buffer
= (base64buffer
<< 6) | FROM_BASE64(ch
);
1614 if (base64bits
>= 16) {
1615 /* we have enough bits for a UTF-16 value */
1616 Py_UNICODE outCh
= (Py_UNICODE
)
1617 (base64buffer
>> (base64bits
-16));
1619 base64buffer
&= (1 << base64bits
) - 1; /* clear high bits */
1621 /* expecting a second surrogate */
1622 if (outCh
>= 0xDC00 && outCh
<= 0xDFFF) {
1623 #ifdef Py_UNICODE_WIDE
1624 *p
++ = (((surrogate
& 0x3FF)<<10)
1625 | (outCh
& 0x3FF)) + 0x10000;
1634 errmsg
= "second surrogate missing";
1638 else if (outCh
>= 0xD800 && outCh
<= 0xDBFF) {
1639 /* first surrogate */
1642 else if (outCh
>= 0xDC00 && outCh
<= 0xDFFF) {
1643 errmsg
= "unexpected second surrogate";
1651 else { /* now leaving a base-64 section */
1655 errmsg
= "second surrogate missing at end of shift sequence";
1658 if (base64bits
> 0) { /* left-over bits */
1659 if (base64bits
>= 6) {
1660 /* We've seen at least one base-64 character */
1661 errmsg
= "partial character in shift sequence";
1665 /* Some bits remain; they should be zero */
1666 if (base64buffer
!= 0) {
1667 errmsg
= "non-zero padding bits in shift sequence";
1673 /* '-' is absorbed; other terminating
1674 characters are preserved */
1679 else if ( ch
== '+' ) {
1680 startinpos
= s
-starts
;
1681 s
++; /* consume '+' */
1682 if (s
< e
&& *s
== '-') { /* '+-' encodes '+' */
1686 else { /* begin base64-encoded section */
1692 else if (DECODE_DIRECT(ch
)) { /* character decodes as itself */
1697 startinpos
= s
-starts
;
1699 errmsg
= "unexpected special character";
1704 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1705 endinpos
= s
-starts
;
1706 if (unicode_decode_call_errorhandler(
1707 errors
, &errorHandler
,
1709 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1710 &unicode
, &outpos
, &p
))
1716 if (inShift
&& !consumed
) { /* in shift sequence, no more to follow */
1717 /* if we're in an inconsistent state, that's an error */
1719 (base64bits
>= 6) ||
1720 (base64bits
> 0 && base64buffer
!= 0)) {
1721 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1723 if (unicode_decode_call_errorhandler(
1724 errors
, &errorHandler
,
1725 "utf7", "unterminated shift sequence",
1726 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1727 &unicode
, &outpos
, &p
))
1735 p
= shiftOutStart
; /* back off output */
1736 *consumed
= startinpos
;
1739 *consumed
= s
-starts
;
1743 if (_PyUnicode_Resize(&unicode
, p
- PyUnicode_AS_UNICODE(unicode
)) < 0)
1746 Py_XDECREF(errorHandler
);
1748 return (PyObject
*)unicode
;
1751 Py_XDECREF(errorHandler
);
1758 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
1761 int base64WhiteSpace
,
1765 /* It might be possible to tighten this worst case */
1766 Py_ssize_t allocated
= 8 * size
;
1769 unsigned int base64bits
= 0;
1770 unsigned long base64buffer
= 0;
1774 if (allocated
/ 8 != size
)
1775 return PyErr_NoMemory();
1778 return PyString_FromStringAndSize(NULL
, 0);
1780 v
= PyString_FromStringAndSize(NULL
, allocated
);
1784 start
= out
= PyString_AS_STRING(v
);
1785 for (;i
< size
; ++i
) {
1786 Py_UNICODE ch
= s
[i
];
1789 if (ENCODE_DIRECT(ch
, !base64SetO
, !base64WhiteSpace
)) {
1791 if (base64bits
) { /* output remaining bits */
1792 *out
++ = TO_BASE64(base64buffer
<< (6-base64bits
));
1797 /* Characters not in the BASE64 set implicitly unshift the sequence
1798 so no '-' is required, except if the character is itself a '-' */
1799 if (IS_BASE64(ch
) || ch
== '-') {
1808 else { /* not in a shift sequence */
1813 else if (ENCODE_DIRECT(ch
, !base64SetO
, !base64WhiteSpace
)) {
1824 #ifdef Py_UNICODE_WIDE
1825 if (ch
>= 0x10000) {
1826 /* code first surrogate */
1828 base64buffer
= (base64buffer
<< 16) | 0xd800 | ((ch
-0x10000) >> 10);
1829 while (base64bits
>= 6) {
1830 *out
++ = TO_BASE64(base64buffer
>> (base64bits
-6));
1833 /* prepare second surrogate */
1834 ch
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
1838 base64buffer
= (base64buffer
<< 16) | ch
;
1839 while (base64bits
>= 6) {
1840 *out
++ = TO_BASE64(base64buffer
>> (base64bits
-6));
1845 *out
++= TO_BASE64(base64buffer
<< (6-base64bits
) );
1849 _PyString_Resize(&v
, out
- start
);
1856 #undef DECODE_DIRECT
1857 #undef ENCODE_DIRECT
1859 /* --- UTF-8 Codec -------------------------------------------------------- */
1862 char utf8_code_length
[256] = {
1863 /* Map UTF-8 encoded prefix byte to sequence length. zero means
1864 illegal prefix. see RFC 2279 for details */
1865 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1866 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1867 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1868 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1869 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1870 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1871 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1872 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1873 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1874 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1875 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1876 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1877 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1878 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1879 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1880 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
1883 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1887 return PyUnicode_DecodeUTF8Stateful(s
, size
, errors
, NULL
);
1890 PyObject
*PyUnicode_DecodeUTF8Stateful(const char *s
,
1893 Py_ssize_t
*consumed
)
1895 const char *starts
= s
;
1897 Py_ssize_t startinpos
;
1898 Py_ssize_t endinpos
;
1901 PyUnicodeObject
*unicode
;
1903 const char *errmsg
= "";
1904 PyObject
*errorHandler
= NULL
;
1905 PyObject
*exc
= NULL
;
1907 /* Note: size will always be longer than the resulting Unicode
1909 unicode
= _PyUnicode_New(size
);
1915 return (PyObject
*)unicode
;
1918 /* Unpack UTF-8 encoded data */
1923 Py_UCS4 ch
= (unsigned char)*s
;
1926 *p
++ = (Py_UNICODE
)ch
;
1931 n
= utf8_code_length
[ch
];
1937 errmsg
= "unexpected end of data";
1938 startinpos
= s
-starts
;
1947 errmsg
= "unexpected code byte";
1948 startinpos
= s
-starts
;
1949 endinpos
= startinpos
+1;
1953 errmsg
= "internal error";
1954 startinpos
= s
-starts
;
1955 endinpos
= startinpos
+1;
1959 if ((s
[1] & 0xc0) != 0x80) {
1960 errmsg
= "invalid data";
1961 startinpos
= s
-starts
;
1962 endinpos
= startinpos
+2;
1965 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
1967 startinpos
= s
-starts
;
1968 endinpos
= startinpos
+2;
1969 errmsg
= "illegal encoding";
1973 *p
++ = (Py_UNICODE
)ch
;
1977 if ((s
[1] & 0xc0) != 0x80 ||
1978 (s
[2] & 0xc0) != 0x80) {
1979 errmsg
= "invalid data";
1980 startinpos
= s
-starts
;
1981 endinpos
= startinpos
+3;
1984 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
1986 /* Note: UTF-8 encodings of surrogates are considered
1987 legal UTF-8 sequences;
1989 XXX For wide builds (UCS-4) we should probably try
1990 to recombine the surrogates into a single code
1993 errmsg
= "illegal encoding";
1994 startinpos
= s
-starts
;
1995 endinpos
= startinpos
+3;
1999 *p
++ = (Py_UNICODE
)ch
;
2003 if ((s
[1] & 0xc0) != 0x80 ||
2004 (s
[2] & 0xc0) != 0x80 ||
2005 (s
[3] & 0xc0) != 0x80) {
2006 errmsg
= "invalid data";
2007 startinpos
= s
-starts
;
2008 endinpos
= startinpos
+4;
2011 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
2012 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
2013 /* validate and convert to UTF-16 */
2014 if ((ch
< 0x10000) /* minimum value allowed for 4
2016 || (ch
> 0x10ffff)) /* maximum value allowed for
2019 errmsg
= "illegal encoding";
2020 startinpos
= s
-starts
;
2021 endinpos
= startinpos
+4;
2024 #ifdef Py_UNICODE_WIDE
2025 *p
++ = (Py_UNICODE
)ch
;
2027 /* compute and append the two surrogates: */
2029 /* translate from 10000..10FFFF to 0..FFFF */
2032 /* high surrogate = top 10 bits added to D800 */
2033 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
2035 /* low surrogate = bottom 10 bits added to DC00 */
2036 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
2041 /* Other sizes are only needed for UCS-4 */
2042 errmsg
= "unsupported Unicode code range";
2043 startinpos
= s
-starts
;
2044 endinpos
= startinpos
+n
;
2051 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2052 if (unicode_decode_call_errorhandler(
2053 errors
, &errorHandler
,
2055 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2056 &unicode
, &outpos
, &p
))
2060 *consumed
= s
-starts
;
2063 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2066 Py_XDECREF(errorHandler
);
2068 return (PyObject
*)unicode
;
2071 Py_XDECREF(errorHandler
);
2077 /* Allocation strategy: if the string is short, convert into a stack buffer
2078 and allocate exactly as much space needed at the end. Else allocate the
2079 maximum possible needed (4 result bytes per Unicode character), and return
2080 the excess memory at the end.
2083 PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
2087 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2089 Py_ssize_t i
; /* index into s of next input byte */
2090 PyObject
*v
; /* result string object */
2091 char *p
; /* next free byte in output buffer */
2092 Py_ssize_t nallocated
; /* number of result bytes allocated */
2093 Py_ssize_t nneeded
; /* number of result bytes needed */
2094 char stackbuf
[MAX_SHORT_UNICHARS
* 4];
2099 if (size
<= MAX_SHORT_UNICHARS
) {
2100 /* Write into the stack buffer; nallocated can't overflow.
2101 * At the end, we'll allocate exactly as much heap space as it
2102 * turns out we need.
2104 nallocated
= Py_SAFE_DOWNCAST(sizeof(stackbuf
), size_t, int);
2105 v
= NULL
; /* will allocate after we're done */
2109 /* Overallocate on the heap, and give the excess back at the end. */
2110 nallocated
= size
* 4;
2111 if (nallocated
/ 4 != size
) /* overflow! */
2112 return PyErr_NoMemory();
2113 v
= PyString_FromStringAndSize(NULL
, nallocated
);
2116 p
= PyString_AS_STRING(v
);
2119 for (i
= 0; i
< size
;) {
2120 Py_UCS4 ch
= s
[i
++];
2126 else if (ch
< 0x0800) {
2127 /* Encode Latin-1 */
2128 *p
++ = (char)(0xc0 | (ch
>> 6));
2129 *p
++ = (char)(0x80 | (ch
& 0x3f));
2132 /* Encode UCS2 Unicode ordinals */
2134 /* Special case: check for high surrogate */
2135 if (0xD800 <= ch
&& ch
<= 0xDBFF && i
!= size
) {
2137 /* Check for low surrogate and combine the two to
2138 form a UCS4 value */
2139 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2140 ch
= ((ch
- 0xD800) << 10 | (ch2
- 0xDC00)) + 0x10000;
2144 /* Fall through: handles isolated high surrogates */
2146 *p
++ = (char)(0xe0 | (ch
>> 12));
2147 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2148 *p
++ = (char)(0x80 | (ch
& 0x3f));
2152 /* Encode UCS4 Unicode ordinals */
2153 *p
++ = (char)(0xf0 | (ch
>> 18));
2154 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
2155 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2156 *p
++ = (char)(0x80 | (ch
& 0x3f));
2161 /* This was stack allocated. */
2162 nneeded
= p
- stackbuf
;
2163 assert(nneeded
<= nallocated
);
2164 v
= PyString_FromStringAndSize(stackbuf
, nneeded
);
2167 /* Cut back to size actually needed. */
2168 nneeded
= p
- PyString_AS_STRING(v
);
2169 assert(nneeded
<= nallocated
);
2170 _PyString_Resize(&v
, nneeded
);
2174 #undef MAX_SHORT_UNICHARS
2177 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
2179 if (!PyUnicode_Check(unicode
)) {
2180 PyErr_BadArgument();
2183 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
2184 PyUnicode_GET_SIZE(unicode
),
2188 /* --- UTF-32 Codec ------------------------------------------------------- */
2191 PyUnicode_DecodeUTF32(const char *s
,
2196 return PyUnicode_DecodeUTF32Stateful(s
, size
, errors
, byteorder
, NULL
);
2200 PyUnicode_DecodeUTF32Stateful(const char *s
,
2204 Py_ssize_t
*consumed
)
2206 const char *starts
= s
;
2207 Py_ssize_t startinpos
;
2208 Py_ssize_t endinpos
;
2210 PyUnicodeObject
*unicode
;
2212 #ifndef Py_UNICODE_WIDE
2215 const int pairs
= 0;
2217 const unsigned char *q
, *e
;
2218 int bo
= 0; /* assume native ordering by default */
2219 const char *errmsg
= "";
2220 /* Offsets from q for retrieving bytes in the right order. */
2221 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2222 int iorder
[] = {0, 1, 2, 3};
2224 int iorder
[] = {3, 2, 1, 0};
2226 PyObject
*errorHandler
= NULL
;
2227 PyObject
*exc
= NULL
;
2228 /* On narrow builds we split characters outside the BMP into two
2229 codepoints => count how much extra space we need. */
2230 #ifndef Py_UNICODE_WIDE
2231 for (i
= pairs
= 0; i
< size
/4; i
++)
2232 if (((Py_UCS4
*)s
)[i
] >= 0x10000)
/* This might be one too many, because of a BOM */
2237 unicode
= _PyUnicode_New((size
+3)/4+pairs
);
2241 return (PyObject
*)unicode
;
2243 /* Unpack UTF-32 encoded data */
2245 q
= (unsigned char *)s
;
2251 /* Check for BOM marks (U+FEFF) in the input and adjust current
2252 byte order setting accordingly. In native mode, the leading BOM
2253 mark is skipped, in all other modes, it is copied to the output
2254 stream as-is (giving a ZWNBSP character). */
2257 const Py_UCS4 bom
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2258 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2259 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2260 if (bom
== 0x0000FEFF) {
2264 else if (bom
== 0xFFFE0000) {
2269 if (bom
== 0x0000FEFF) {
2273 else if (bom
== 0xFFFE0000) {
2298 /* remaining bytes at the end? (size should be divisible by 4) */
2302 errmsg
= "truncated data";
2303 startinpos
= ((const char *)q
)-starts
;
2304 endinpos
= ((const char *)e
)-starts
;
2306 /* The remaining input chars are ignored if the callback
2307 chooses to skip the input */
2309 ch
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2310 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2314 errmsg
= "codepoint not in range(0x110000)";
2315 startinpos
= ((const char *)q
)-starts
;
2316 endinpos
= startinpos
+4;
2319 #ifndef Py_UNICODE_WIDE
2322 *p
++ = 0xD800 | ((ch
-0x10000) >> 10);
2323 *p
++ = 0xDC00 | ((ch
-0x10000) & 0x3FF);
2331 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2332 if (unicode_decode_call_errorhandler(
2333 errors
, &errorHandler
,
2335 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
2336 &unicode
, &outpos
, &p
))
2344 *consumed
= (const char *)q
-starts
;
2347 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2350 Py_XDECREF(errorHandler
);
2352 return (PyObject
*)unicode
;
2356 Py_XDECREF(errorHandler
);
2362 PyUnicode_EncodeUTF32(const Py_UNICODE
*s
,
2369 Py_ssize_t nsize
, bytesize
;
2370 #ifndef Py_UNICODE_WIDE
2371 Py_ssize_t i
, pairs
;
2373 const int pairs
= 0;
2375 /* Offsets from p for storing byte pairs in the right order. */
2376 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2377 int iorder
[] = {0, 1, 2, 3};
2379 int iorder
[] = {3, 2, 1, 0};
2382 #define STORECHAR(CH) \
2384 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2385 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2386 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2387 p[iorder[0]] = (CH) & 0xff; \
2391 /* In narrow builds we can output surrogate pairs as one codepoint,
2392 so we need less space. */
2393 #ifndef Py_UNICODE_WIDE
2394 for (i
= pairs
= 0; i
< size
-1; i
++)
2395 if (0xD800 <= s
[i
] && s
[i
] <= 0xDBFF &&
2396 0xDC00 <= s
[i
+1] && s
[i
+1] <= 0xDFFF)
2399 nsize
= (size
- pairs
+ (byteorder
== 0));
2400 bytesize
= nsize
* 4;
2401 if (bytesize
/ 4 != nsize
)
2402 return PyErr_NoMemory();
2403 v
= PyString_FromStringAndSize(NULL
, bytesize
);
2407 p
= (unsigned char *)PyString_AS_STRING(v
);
2413 if (byteorder
== -1) {
2420 else if (byteorder
== 1) {
2428 while (size
-- > 0) {
2430 #ifndef Py_UNICODE_WIDE
2431 if (0xD800 <= ch
&& ch
<= 0xDBFF && size
> 0) {
2433 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2434 ch
= (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
2446 PyObject
*PyUnicode_AsUTF32String(PyObject
*unicode
)
2448 if (!PyUnicode_Check(unicode
)) {
2449 PyErr_BadArgument();
2452 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode
),
2453 PyUnicode_GET_SIZE(unicode
),
2458 /* --- UTF-16 Codec ------------------------------------------------------- */
2461 PyUnicode_DecodeUTF16(const char *s
,
2466 return PyUnicode_DecodeUTF16Stateful(s
, size
, errors
, byteorder
, NULL
);
2470 PyUnicode_DecodeUTF16Stateful(const char *s
,
2474 Py_ssize_t
*consumed
)
2476 const char *starts
= s
;
2477 Py_ssize_t startinpos
;
2478 Py_ssize_t endinpos
;
2480 PyUnicodeObject
*unicode
;
2482 const unsigned char *q
, *e
;
2483 int bo
= 0; /* assume native ordering by default */
2484 const char *errmsg
= "";
2485 /* Offsets from q for retrieving byte pairs in the right order. */
2486 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2487 int ihi
= 1, ilo
= 0;
2489 int ihi
= 0, ilo
= 1;
2491 PyObject
*errorHandler
= NULL
;
2492 PyObject
*exc
= NULL
;
2494 /* Note: size will always be longer than the resulting Unicode
2496 unicode
= _PyUnicode_New(size
);
2500 return (PyObject
*)unicode
;
2502 /* Unpack UTF-16 encoded data */
2504 q
= (unsigned char *)s
;
2510 /* Check for BOM marks (U+FEFF) in the input and adjust current
2511 byte order setting accordingly. In native mode, the leading BOM
2512 mark is skipped, in all other modes, it is copied to the output
2513 stream as-is (giving a ZWNBSP character). */
2516 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
2517 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2518 if (bom
== 0xFEFF) {
2522 else if (bom
== 0xFFFE) {
2527 if (bom
== 0xFEFF) {
2531 else if (bom
== 0xFFFE) {
2552 /* remaining bytes at the end? (size should be even) */
2556 errmsg
= "truncated data";
2557 startinpos
= ((const char *)q
)-starts
;
2558 endinpos
= ((const char *)e
)-starts
;
2560 /* The remaining input chars are ignored if the callback
2561 chooses to skip the input */
2563 ch
= (q
[ihi
] << 8) | q
[ilo
];
2567 if (ch
< 0xD800 || ch
> 0xDFFF) {
2572 /* UTF-16 code pair: */
2574 errmsg
= "unexpected end of data";
2575 startinpos
= (((const char *)q
)-2)-starts
;
2576 endinpos
= ((const char *)e
)-starts
;
2579 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
2580 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
2582 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2583 #ifndef Py_UNICODE_WIDE
2587 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
2592 errmsg
= "illegal UTF-16 surrogate";
2593 startinpos
= (((const char *)q
)-4)-starts
;
2594 endinpos
= startinpos
+2;
2599 errmsg
= "illegal encoding";
2600 startinpos
= (((const char *)q
)-2)-starts
;
2601 endinpos
= startinpos
+2;
2602 /* Fall through to report the error */
2605 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2606 if (unicode_decode_call_errorhandler(
2607 errors
, &errorHandler
,
2609 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
2610 &unicode
, &outpos
, &p
))
2618 *consumed
= (const char *)q
-starts
;
2621 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2624 Py_XDECREF(errorHandler
);
2626 return (PyObject
*)unicode
;
2630 Py_XDECREF(errorHandler
);
2636 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
2643 Py_ssize_t nsize
, bytesize
;
2644 #ifdef Py_UNICODE_WIDE
2645 Py_ssize_t i
, pairs
;
2647 const int pairs
= 0;
2649 /* Offsets from p for storing byte pairs in the right order. */
2650 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2651 int ihi
= 1, ilo
= 0;
2653 int ihi
= 0, ilo
= 1;
2656 #define STORECHAR(CH) \
2658 p[ihi] = ((CH) >> 8) & 0xff; \
2659 p[ilo] = (CH) & 0xff; \
2663 #ifdef Py_UNICODE_WIDE
2664 for (i
= pairs
= 0; i
< size
; i
++)
2665 if (s
[i
] >= 0x10000)
2668 /* 2 * (size + pairs + (byteorder == 0)) */
2669 if (size
> PY_SSIZE_T_MAX
||
2670 size
> PY_SSIZE_T_MAX
- pairs
- (byteorder
== 0))
2671 return PyErr_NoMemory();
2672 nsize
= size
+ pairs
+ (byteorder
== 0);
2673 bytesize
= nsize
* 2;
2674 if (bytesize
/ 2 != nsize
)
2675 return PyErr_NoMemory();
2676 v
= PyString_FromStringAndSize(NULL
, bytesize
);
2680 p
= (unsigned char *)PyString_AS_STRING(v
);
2686 if (byteorder
== -1) {
2691 else if (byteorder
== 1) {
2697 while (size
-- > 0) {
2698 Py_UNICODE ch
= *s
++;
2700 #ifdef Py_UNICODE_WIDE
2701 if (ch
>= 0x10000) {
2702 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
2703 ch
= 0xD800 | ((ch
-0x10000) >> 10);
2714 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
2716 if (!PyUnicode_Check(unicode
)) {
2717 PyErr_BadArgument();
2720 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
2721 PyUnicode_GET_SIZE(unicode
),
2726 /* --- Unicode Escape Codec ----------------------------------------------- */
2728 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
2730 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
2734 const char *starts
= s
;
2735 Py_ssize_t startinpos
;
2736 Py_ssize_t endinpos
;
2743 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
2744 PyObject
*errorHandler
= NULL
;
2745 PyObject
*exc
= NULL
;
2747 /* Escaped strings will always be longer than the resulting
2748 Unicode string, so we start with size here and then reduce the
2749 length after conversion to the true value.
2750 (but if the error callback returns a long replacement string
2751 we'll have to allocate more space) */
2752 v
= _PyUnicode_New(size
);
2756 return (PyObject
*)v
;
2758 p
= PyUnicode_AS_UNICODE(v
);
2766 /* Non-escape characters are interpreted as Unicode ordinals */
2768 *p
++ = (unsigned char) *s
++;
2772 startinpos
= s
-starts
;
2777 c
= '\0'; /* Invalid after \ */
2782 case '\\': *p
++ = '\\'; break;
2783 case '\'': *p
++ = '\''; break;
2784 case '\"': *p
++ = '\"'; break;
2785 case 'b': *p
++ = '\b'; break;
2786 case 'f': *p
++ = '\014'; break; /* FF */
2787 case 't': *p
++ = '\t'; break;
2788 case 'n': *p
++ = '\n'; break;
2789 case 'r': *p
++ = '\r'; break;
2790 case 'v': *p
++ = '\013'; break; /* VT */
2791 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
2793 /* \OOO (octal) escapes */
2794 case '0': case '1': case '2': case '3':
2795 case '4': case '5': case '6': case '7':
2797 if (s
< end
&& '0' <= *s
&& *s
<= '7') {
2798 x
= (x
<<3) + *s
++ - '0';
2799 if (s
< end
&& '0' <= *s
&& *s
<= '7')
2800 x
= (x
<<3) + *s
++ - '0';
2809 message
= "truncated \\xXX escape";
2815 message
= "truncated \\uXXXX escape";
2821 message
= "truncated \\UXXXXXXXX escape";
2824 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2827 if (unicode_decode_call_errorhandler(
2828 errors
, &errorHandler
,
2829 "unicodeescape", "end of string in escape sequence",
2830 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2835 for (i
= 0; i
< digits
; ++i
) {
2836 c
= (unsigned char) s
[i
];
2838 endinpos
= (s
+i
+1)-starts
;
2839 if (unicode_decode_call_errorhandler(
2840 errors
, &errorHandler
,
2841 "unicodeescape", message
,
2842 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2847 chr
= (chr
<<4) & ~0xF;
2848 if (c
>= '0' && c
<= '9')
2850 else if (c
>= 'a' && c
<= 'f')
2851 chr
+= 10 + c
- 'a';
2853 chr
+= 10 + c
- 'A';
2856 if (chr
== 0xffffffff && PyErr_Occurred())
2857 /* _decoding_error will have already written into the
2861 /* when we get here, chr is a 32-bit unicode character */
2863 /* UCS-2 character */
2864 *p
++ = (Py_UNICODE
) chr
;
2865 else if (chr
<= 0x10ffff) {
2866 /* UCS-4 character. Either store directly, or as
2868 #ifdef Py_UNICODE_WIDE
2872 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
2873 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
2876 endinpos
= s
-starts
;
2877 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2878 if (unicode_decode_call_errorhandler(
2879 errors
, &errorHandler
,
2880 "unicodeescape", "illegal Unicode character",
2881 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2889 message
= "malformed \\N character escape";
2890 if (ucnhash_CAPI
== NULL
) {
2891 /* load the unicode data module */
2893 m
= PyImport_ImportModuleNoBlock("unicodedata");
2896 api
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
2900 ucnhash_CAPI
= (_PyUnicode_Name_CAPI
*)PyCObject_AsVoidPtr(api
);
2902 if (ucnhash_CAPI
== NULL
)
2906 const char *start
= s
+1;
2907 /* look for the closing brace */
2908 while (*s
!= '}' && s
< end
)
2910 if (s
> start
&& s
< end
&& *s
== '}') {
2911 /* found a name. look it up in the unicode database */
2912 message
= "unknown Unicode character name";
2914 if (ucnhash_CAPI
->getcode(NULL
, start
, (int)(s
-start
-1), &chr
))
2918 endinpos
= s
-starts
;
2919 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2920 if (unicode_decode_call_errorhandler(
2921 errors
, &errorHandler
,
2922 "unicodeescape", message
,
2923 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2930 message
= "\\ at end of string";
2932 endinpos
= s
-starts
;
2933 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2934 if (unicode_decode_call_errorhandler(
2935 errors
, &errorHandler
,
2936 "unicodeescape", message
,
2937 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2943 *p
++ = (unsigned char)s
[-1];
2950 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2952 Py_XDECREF(errorHandler
);
2954 return (PyObject
*)v
;
2959 "\\N escapes not supported (can't load unicodedata module)"
2962 Py_XDECREF(errorHandler
);
2968 Py_XDECREF(errorHandler
);
2973 /* Return a Unicode-Escape string version of the Unicode object.
2975 If quotes is true, the string is enclosed in u"" or u'' quotes as
2980 Py_LOCAL_INLINE(const Py_UNICODE
*) findchar(const Py_UNICODE
*s
,
/* like wcschr, but doesn't stop at NUL characters */
2986 while (size
-- > 0) {
2996 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
3003 static const char *hexdigit
= "0123456789abcdef";
3004 #ifdef Py_UNICODE_WIDE
3005 const Py_ssize_t expandsize
= 10;
3007 const Py_ssize_t expandsize
= 6;
3010 /* XXX(nnorwitz): rather than over-allocating, it would be
3011 better to choose a different scheme. Perhaps scan the
3012 first N-chars of the string and allocate based on that size.
3014 /* Initial allocation is based on the longest-possible unichr
3017 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3018 unichr, so in this case it's the longest unichr escape. In
3019 narrow (UTF-16) builds this is five chars per source unichr
3020 since there are two unichrs in the surrogate pair, so in narrow
3021 (UTF-16) builds it's not the longest unichr escape.
3023 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3024 so in the narrow (UTF-16) build case it's the longest unichr
3028 if (size
> (PY_SSIZE_T_MAX
- 2 - 1) / expandsize
)
3029 return PyErr_NoMemory();
3031 repr
= PyString_FromStringAndSize(NULL
,
3038 p
= PyString_AS_STRING(repr
);
3042 *p
++ = (findchar(s
, size
, '\'') &&
3043 !findchar(s
, size
, '"')) ? '"' : '\'';
3045 while (size
-- > 0) {
3046 Py_UNICODE ch
= *s
++;
3048 /* Escape quotes and backslashes */
3050 ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1]) || ch
== '\\') {
3056 #ifdef Py_UNICODE_WIDE
3057 /* Map 21-bit characters to '\U00xxxxxx' */
3058 else if (ch
>= 0x10000) {
3061 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
3062 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
3063 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
3064 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
3065 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
3066 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
3067 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
3068 *p
++ = hexdigit
[ch
& 0x0000000F];
3072 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3073 else if (ch
>= 0xD800 && ch
< 0xDC00) {
3079 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
3080 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
3083 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
3084 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
3085 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
3086 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
3087 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
3088 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
3089 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
3090 *p
++ = hexdigit
[ucs
& 0x0000000F];
3093 /* Fall through: isolated surrogates are copied as-is */
3099 /* Map 16-bit characters to '\uxxxx' */
3103 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
3104 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
3105 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
3106 *p
++ = hexdigit
[ch
& 0x000F];
/* Map special whitespace to '\t', '\n', '\r' */
3110 else if (ch
== '\t') {
3114 else if (ch
== '\n') {
3118 else if (ch
== '\r') {
3123 /* Map non-printable US ASCII to '\xhh' */
3124 else if (ch
< ' ' || ch
>= 0x7F) {
3127 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
3128 *p
++ = hexdigit
[ch
& 0x000F];
3131 /* Copy everything else as-is */
3136 *p
++ = PyString_AS_STRING(repr
)[1];
3139 _PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
));
3143 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
3146 return unicodeescape_string(s
, size
, 0);
3149 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
3151 if (!PyUnicode_Check(unicode
)) {
3152 PyErr_BadArgument();
3155 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3156 PyUnicode_GET_SIZE(unicode
));
3159 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3161 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
3165 const char *starts
= s
;
3166 Py_ssize_t startinpos
;
3167 Py_ssize_t endinpos
;
3173 PyObject
*errorHandler
= NULL
;
3174 PyObject
*exc
= NULL
;
3176 /* Escaped strings will always be longer than the resulting
3177 Unicode string, so we start with size here and then reduce the
3178 length after conversion to the true value. (But decoding error
3179 handler might have to resize the string) */
3180 v
= _PyUnicode_New(size
);
3184 return (PyObject
*)v
;
3185 p
= PyUnicode_AS_UNICODE(v
);
3193 /* Non-escape characters are interpreted as Unicode ordinals */
3195 *p
++ = (unsigned char)*s
++;
3198 startinpos
= s
-starts
;
3200 /* \u-escapes are only interpreted iff the number of leading
3201 backslashes is odd */
3206 *p
++ = (unsigned char)*s
++;
3208 if (((s
- bs
) & 1) == 0 ||
3210 (*s
!= 'u' && *s
!= 'U')) {
3214 count
= *s
=='u' ? 4 : 8;
3217 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3218 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3219 for (x
= 0, i
= 0; i
< count
; ++i
, ++s
) {
3220 c
= (unsigned char)*s
;
3222 endinpos
= s
-starts
;
3223 if (unicode_decode_call_errorhandler(
3224 errors
, &errorHandler
,
3225 "rawunicodeescape", "truncated \\uXXXX",
3226 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3232 if (c
>= '0' && c
<= '9')
3234 else if (c
>= 'a' && c
<= 'f')
3240 /* UCS-2 character */
3241 *p
++ = (Py_UNICODE
) x
;
3242 else if (x
<= 0x10ffff) {
3243 /* UCS-4 character. Either store directly, or as
3245 #ifdef Py_UNICODE_WIDE
3246 *p
++ = (Py_UNICODE
) x
;
3249 *p
++ = 0xD800 + (Py_UNICODE
) (x
>> 10);
3250 *p
++ = 0xDC00 + (Py_UNICODE
) (x
& 0x03FF);
3253 endinpos
= s
-starts
;
3254 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3255 if (unicode_decode_call_errorhandler(
3256 errors
, &errorHandler
,
3257 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3258 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3265 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3267 Py_XDECREF(errorHandler
);
3269 return (PyObject
*)v
;
3273 Py_XDECREF(errorHandler
);
3278 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
3285 static const char *hexdigit
= "0123456789abcdef";
3286 #ifdef Py_UNICODE_WIDE
3287 const Py_ssize_t expandsize
= 10;
3289 const Py_ssize_t expandsize
= 6;
3292 if (size
> PY_SSIZE_T_MAX
/ expandsize
)
3293 return PyErr_NoMemory();
3295 repr
= PyString_FromStringAndSize(NULL
, expandsize
* size
);
3301 p
= q
= PyString_AS_STRING(repr
);
3302 while (size
-- > 0) {
3303 Py_UNICODE ch
= *s
++;
3304 #ifdef Py_UNICODE_WIDE
3305 /* Map 32-bit characters to '\Uxxxxxxxx' */
3306 if (ch
>= 0x10000) {
3309 *p
++ = hexdigit
[(ch
>> 28) & 0xf];
3310 *p
++ = hexdigit
[(ch
>> 24) & 0xf];
3311 *p
++ = hexdigit
[(ch
>> 20) & 0xf];
3312 *p
++ = hexdigit
[(ch
>> 16) & 0xf];
3313 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3314 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3315 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3316 *p
++ = hexdigit
[ch
& 15];
3320 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3321 if (ch
>= 0xD800 && ch
< 0xDC00) {
3327 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
3328 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
3331 *p
++ = hexdigit
[(ucs
>> 28) & 0xf];
3332 *p
++ = hexdigit
[(ucs
>> 24) & 0xf];
3333 *p
++ = hexdigit
[(ucs
>> 20) & 0xf];
3334 *p
++ = hexdigit
[(ucs
>> 16) & 0xf];
3335 *p
++ = hexdigit
[(ucs
>> 12) & 0xf];
3336 *p
++ = hexdigit
[(ucs
>> 8) & 0xf];
3337 *p
++ = hexdigit
[(ucs
>> 4) & 0xf];
3338 *p
++ = hexdigit
[ucs
& 0xf];
3341 /* Fall through: isolated surrogates are copied as-is */
3346 /* Map 16-bit characters to '\uxxxx' */
3350 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3351 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3352 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3353 *p
++ = hexdigit
[ch
& 15];
3355 /* Copy everything else as-is */
3360 _PyString_Resize(&repr
, p
- q
);
3364 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
3366 if (!PyUnicode_Check(unicode
)) {
3367 PyErr_BadArgument();
3370 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3371 PyUnicode_GET_SIZE(unicode
));
3374 /* --- Unicode Internal Codec ------------------------------------------- */
3376 PyObject
*_PyUnicode_DecodeUnicodeInternal(const char *s
,
3380 const char *starts
= s
;
3381 Py_ssize_t startinpos
;
3382 Py_ssize_t endinpos
;
3388 PyObject
*errorHandler
= NULL
;
3389 PyObject
*exc
= NULL
;
3391 #ifdef Py_UNICODE_WIDE
3392 Py_UNICODE unimax
= PyUnicode_GetMax();
3395 /* XXX overflow detection missing */
3396 v
= _PyUnicode_New((size
+Py_UNICODE_SIZE
-1)/ Py_UNICODE_SIZE
);
3399 if (PyUnicode_GetSize((PyObject
*)v
) == 0)
3400 return (PyObject
*)v
;
3401 p
= PyUnicode_AS_UNICODE(v
);
3405 memcpy(p
, s
, sizeof(Py_UNICODE
));
3406 /* We have to sanity check the raw data, otherwise doom looms for
3407 some malformed UCS-4 data. */
3409 #ifdef Py_UNICODE_WIDE
3410 *p
> unimax
|| *p
< 0 ||
3412 end
-s
< Py_UNICODE_SIZE
3415 startinpos
= s
- starts
;
3416 if (end
-s
< Py_UNICODE_SIZE
) {
3417 endinpos
= end
-starts
;
3418 reason
= "truncated input";
3421 endinpos
= s
- starts
+ Py_UNICODE_SIZE
;
3422 reason
= "illegal code point (> 0x10FFFF)";
3424 outpos
= p
- PyUnicode_AS_UNICODE(v
);
3425 if (unicode_decode_call_errorhandler(
3426 errors
, &errorHandler
,
3427 "unicode_internal", reason
,
3428 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3435 s
+= Py_UNICODE_SIZE
;
3439 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3441 Py_XDECREF(errorHandler
);
3443 return (PyObject
*)v
;
3447 Py_XDECREF(errorHandler
);
3452 /* --- Latin-1 Codec ------------------------------------------------------ */
3454 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
3461 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3463 Py_UNICODE r
= *(unsigned char*)s
;
3464 return PyUnicode_FromUnicode(&r
, 1);
3467 v
= _PyUnicode_New(size
);
3471 return (PyObject
*)v
;
3472 p
= PyUnicode_AS_UNICODE(v
);
3474 *p
++ = (unsigned char)*s
++;
3475 return (PyObject
*)v
;
3482 /* create or adjust a UnicodeEncodeError */
3483 static void make_encode_exception(PyObject
**exceptionObject
,
3484 const char *encoding
,
3485 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3486 Py_ssize_t startpos
, Py_ssize_t endpos
,
3489 if (*exceptionObject
== NULL
) {
3490 *exceptionObject
= PyUnicodeEncodeError_Create(
3491 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3494 if (PyUnicodeEncodeError_SetStart(*exceptionObject
, startpos
))
3496 if (PyUnicodeEncodeError_SetEnd(*exceptionObject
, endpos
))
3498 if (PyUnicodeEncodeError_SetReason(*exceptionObject
, reason
))
3502 Py_DECREF(*exceptionObject
);
3503 *exceptionObject
= NULL
;
3507 /* raises a UnicodeEncodeError */
3508 static void raise_encode_exception(PyObject
**exceptionObject
,
3509 const char *encoding
,
3510 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3511 Py_ssize_t startpos
, Py_ssize_t endpos
,
3514 make_encode_exception(exceptionObject
,
3515 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3516 if (*exceptionObject
!= NULL
)
3517 PyCodec_StrictErrors(*exceptionObject
);
3520 /* error handling callback helper:
3521 build arguments, call the callback and check the arguments,
3522 put the result into newpos and return the replacement string, which
3523 has to be freed by the caller */
3524 static PyObject
*unicode_encode_call_errorhandler(const char *errors
,
3525 PyObject
**errorHandler
,
3526 const char *encoding
, const char *reason
,
3527 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
3528 Py_ssize_t startpos
, Py_ssize_t endpos
,
3531 static char *argparse
= "O!n;encoding error handler must return (unicode, int) tuple";
3534 PyObject
*resunicode
;
3536 if (*errorHandler
== NULL
) {
3537 *errorHandler
= PyCodec_LookupError(errors
);
3538 if (*errorHandler
== NULL
)
3542 make_encode_exception(exceptionObject
,
3543 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3544 if (*exceptionObject
== NULL
)
3547 restuple
= PyObject_CallFunctionObjArgs(
3548 *errorHandler
, *exceptionObject
, NULL
);
3549 if (restuple
== NULL
)
3551 if (!PyTuple_Check(restuple
)) {
3552 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
3553 Py_DECREF(restuple
);
3556 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
3557 &resunicode
, newpos
)) {
3558 Py_DECREF(restuple
);
3562 *newpos
= size
+*newpos
;
3563 if (*newpos
<0 || *newpos
>size
) {
3564 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
3565 Py_DECREF(restuple
);
3568 Py_INCREF(resunicode
);
3569 Py_DECREF(restuple
);
3573 static PyObject
*unicode_encode_ucs1(const Py_UNICODE
*p
,
3580 /* pointers to the beginning and end+1 of input */
3581 const Py_UNICODE
*startp
= p
;
3582 const Py_UNICODE
*endp
= p
+ size
;
3583 /* pointer to the beginning of the unencodable characters */
3584 /* const Py_UNICODE *badp = NULL; */
3585 /* pointer into the output */
3587 /* current output position */
3588 Py_ssize_t respos
= 0;
3590 const char *encoding
= (limit
== 256) ? "latin-1" : "ascii";
3591 const char *reason
= (limit
== 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3592 PyObject
*errorHandler
= NULL
;
3593 PyObject
*exc
= NULL
;
3594 /* the following variable is used for caching string comparisons
3595 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3596 int known_errorHandler
= -1;
3598 /* allocate enough for a simple encoding without
3599 replacements, if we need more, we'll resize */
3600 res
= PyString_FromStringAndSize(NULL
, size
);
3605 str
= PyString_AS_STRING(res
);
3611 /* can we encode this? */
3613 /* no overflow check, because we know that the space is enough */
3618 Py_ssize_t unicodepos
= p
-startp
;
3619 Py_ssize_t requiredsize
;
3620 PyObject
*repunicode
;
3625 /* startpos for collecting unencodable chars */
3626 const Py_UNICODE
*collstart
= p
;
3627 const Py_UNICODE
*collend
= p
;
3628 /* find all unencodable characters */
3629 while ((collend
< endp
) && ((*collend
)>=limit
))
3631 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3632 if (known_errorHandler
==-1) {
3633 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3634 known_errorHandler
= 1;
3635 else if (!strcmp(errors
, "replace"))
3636 known_errorHandler
= 2;
3637 else if (!strcmp(errors
, "ignore"))
3638 known_errorHandler
= 3;
3639 else if (!strcmp(errors
, "xmlcharrefreplace"))
3640 known_errorHandler
= 4;
3642 known_errorHandler
= 0;
3644 switch (known_errorHandler
) {
3645 case 1: /* strict */
3646 raise_encode_exception(&exc
, encoding
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
3648 case 2: /* replace */
3649 while (collstart
++<collend
)
3650 *str
++ = '?'; /* fall through */
3651 case 3: /* ignore */
3654 case 4: /* xmlcharrefreplace */
3655 respos
= str
-PyString_AS_STRING(res
);
3656 /* determine replacement size (temporarily (mis)uses p) */
3657 for (p
= collstart
, repsize
= 0; p
< collend
; ++p
) {
3666 #ifndef Py_UNICODE_WIDE
3672 else if (*p
<1000000)
3678 requiredsize
= respos
+repsize
+(endp
-collend
);
3679 if (requiredsize
> ressize
) {
3680 if (requiredsize
<2*ressize
)
3681 requiredsize
= 2*ressize
;
3682 if (_PyString_Resize(&res
, requiredsize
))
3684 str
= PyString_AS_STRING(res
) + respos
;
3685 ressize
= requiredsize
;
3687 /* generate replacement (temporarily (mis)uses p) */
3688 for (p
= collstart
; p
< collend
; ++p
) {
3689 str
+= sprintf(str
, "&#%d;", (int)*p
);
3694 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
3695 encoding
, reason
, startp
, size
, &exc
,
3696 collstart
-startp
, collend
-startp
, &newpos
);
3697 if (repunicode
== NULL
)
3699 /* need more space? (at least enough for what we have+the
3700 replacement+the rest of the string, so we won't have to
3701 check space for encodable characters) */
3702 respos
= str
-PyString_AS_STRING(res
);
3703 repsize
= PyUnicode_GET_SIZE(repunicode
);
3704 requiredsize
= respos
+repsize
+(endp
-collend
);
3705 if (requiredsize
> ressize
) {
3706 if (requiredsize
<2*ressize
)
3707 requiredsize
= 2*ressize
;
3708 if (_PyString_Resize(&res
, requiredsize
)) {
3709 Py_DECREF(repunicode
);
3712 str
= PyString_AS_STRING(res
) + respos
;
3713 ressize
= requiredsize
;
3715 /* check if there is anything unencodable in the replacement
3716 and copy it to the output */
3717 for (uni2
= PyUnicode_AS_UNICODE(repunicode
);repsize
-->0; ++uni2
, ++str
) {
3720 raise_encode_exception(&exc
, encoding
, startp
, size
,
3721 unicodepos
, unicodepos
+1, reason
);
3722 Py_DECREF(repunicode
);
3727 p
= startp
+ newpos
;
3728 Py_DECREF(repunicode
);
3732 /* Resize if we allocated too much */
3733 respos
= str
-PyString_AS_STRING(res
);
3735 /* If this fails, res will be NULL */
3736 _PyString_Resize(&res
, respos
);
3737 Py_XDECREF(errorHandler
);
3743 Py_XDECREF(errorHandler
);
3748 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
3752 return unicode_encode_ucs1(p
, size
, errors
, 256);
3755 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
3757 if (!PyUnicode_Check(unicode
)) {
3758 PyErr_BadArgument();
3761 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
3762 PyUnicode_GET_SIZE(unicode
),
3766 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3768 PyObject
*PyUnicode_DecodeASCII(const char *s
,
3772 const char *starts
= s
;
3775 Py_ssize_t startinpos
;
3776 Py_ssize_t endinpos
;
3779 PyObject
*errorHandler
= NULL
;
3780 PyObject
*exc
= NULL
;
3782 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3783 if (size
== 1 && *(unsigned char*)s
< 128) {
3784 Py_UNICODE r
= *(unsigned char*)s
;
3785 return PyUnicode_FromUnicode(&r
, 1);
3788 v
= _PyUnicode_New(size
);
3792 return (PyObject
*)v
;
3793 p
= PyUnicode_AS_UNICODE(v
);
3796 register unsigned char c
= (unsigned char)*s
;
3802 startinpos
= s
-starts
;
3803 endinpos
= startinpos
+ 1;
3804 outpos
= p
- (Py_UNICODE
*)PyUnicode_AS_UNICODE(v
);
3805 if (unicode_decode_call_errorhandler(
3806 errors
, &errorHandler
,
3807 "ascii", "ordinal not in range(128)",
3808 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3813 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
3814 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3816 Py_XDECREF(errorHandler
);
3818 return (PyObject
*)v
;
3822 Py_XDECREF(errorHandler
);
3827 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
3831 return unicode_encode_ucs1(p
, size
, errors
, 128);
3834 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
3836 if (!PyUnicode_Check(unicode
)) {
3837 PyErr_BadArgument();
3840 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
3841 PyUnicode_GET_SIZE(unicode
),
3845 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3847 /* --- MBCS codecs for Windows -------------------------------------------- */
3849 #if SIZEOF_INT < SIZEOF_SIZE_T
3853 /* XXX This code is limited to "true" double-byte encodings, as
3854 a) it assumes an incomplete character consists of a single byte, and
3855 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3856 encodings, see IsDBCSLeadByteEx documentation. */
3858 static int is_dbcs_lead_byte(const char *s
, int offset
)
3860 const char *curr
= s
+ offset
;
3862 if (IsDBCSLeadByte(*curr
)) {
3863 const char *prev
= CharPrev(s
, curr
);
3864 return (prev
== curr
) || !IsDBCSLeadByte(*prev
) || (curr
- prev
== 2);
3870 * Decode MBCS string into unicode object. If 'final' is set, converts
3871 * trailing lead-byte too. Returns the consumed size on success, -1 otherwise.
3873 static int decode_mbcs(PyUnicodeObject
**v
,
3874 const char *s
, /* MBCS string */
3875 int size
, /* sizeof MBCS string */
3884 /* Skip trailing lead-byte unless 'final' is set */
3885 if (!final
&& size
>= 1 && is_dbcs_lead_byte(s
, size
- 1))
3888 /* First get the size of the result */
3890 usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
3892 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3898 /* Create unicode object */
3899 *v
= _PyUnicode_New(usize
);
3904 /* Extend unicode object */
3905 n
= PyUnicode_GET_SIZE(*v
);
3906 if (_PyUnicode_Resize(v
, n
+ usize
) < 0)
3910 /* Do the conversion */
3912 p
= PyUnicode_AS_UNICODE(*v
) + n
;
3913 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
3914 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3922 PyObject
*PyUnicode_DecodeMBCSStateful(const char *s
,
3925 Py_ssize_t
*consumed
)
3927 PyUnicodeObject
*v
= NULL
;
3936 done
= decode_mbcs(&v
, s
, INT_MAX
, 0);
3939 done
= decode_mbcs(&v
, s
, (int)size
, !consumed
);
3950 if (size
> INT_MAX
) {
3957 return (PyObject
*)v
;
3960 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
3964 return PyUnicode_DecodeMBCSStateful(s
, size
, errors
, NULL
);
3968 * Convert unicode into string object (MBCS).
3969 * Returns 0 on success, -1 otherwise.
3971 static int encode_mbcs(PyObject
**repr
,
3972 const Py_UNICODE
*p
, /* unicode */
3973 int size
) /* size of unicode */
3980 /* First get the size of the result */
3982 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
3983 if (mbcssize
== 0) {
3984 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3989 if (*repr
== NULL
) {
3990 /* Create string object */
3991 *repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
3996 /* Extend string object */
3997 n
= PyString_Size(*repr
);
3998 if (_PyString_Resize(repr
, n
+ mbcssize
) < 0)
4002 /* Do the conversion */
4004 char *s
= PyString_AS_STRING(*repr
) + n
;
4005 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
4006 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
4014 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
4018 PyObject
*repr
= NULL
;
4024 ret
= encode_mbcs(&repr
, p
, INT_MAX
);
4027 ret
= encode_mbcs(&repr
, p
, (int)size
);
4035 if (size
> INT_MAX
) {
4045 PyObject
*PyUnicode_AsMBCSString(PyObject
*unicode
)
4047 if (!PyUnicode_Check(unicode
)) {
4048 PyErr_BadArgument();
4051 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode
),
4052 PyUnicode_GET_SIZE(unicode
),
4058 #endif /* MS_WINDOWS */
4060 /* --- Character Mapping Codec -------------------------------------------- */
4062 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
4067 const char *starts
= s
;
4068 Py_ssize_t startinpos
;
4069 Py_ssize_t endinpos
;
4074 Py_ssize_t extrachars
= 0;
4075 PyObject
*errorHandler
= NULL
;
4076 PyObject
*exc
= NULL
;
4077 Py_UNICODE
*mapstring
= NULL
;
4078 Py_ssize_t maplen
= 0;
4080 /* Default to Latin-1 */
4081 if (mapping
== NULL
)
4082 return PyUnicode_DecodeLatin1(s
, size
, errors
);
4084 v
= _PyUnicode_New(size
);
4088 return (PyObject
*)v
;
4089 p
= PyUnicode_AS_UNICODE(v
);
4091 if (PyUnicode_CheckExact(mapping
)) {
4092 mapstring
= PyUnicode_AS_UNICODE(mapping
);
4093 maplen
= PyUnicode_GET_SIZE(mapping
);
4095 unsigned char ch
= *s
;
4096 Py_UNICODE x
= 0xfffe; /* illegal value */
4102 /* undefined mapping */
4103 outpos
= p
-PyUnicode_AS_UNICODE(v
);
4104 startinpos
= s
-starts
;
4105 endinpos
= startinpos
+1;
4106 if (unicode_decode_call_errorhandler(
4107 errors
, &errorHandler
,
4108 "charmap", "character maps to <undefined>",
4109 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
4121 unsigned char ch
= *s
;
4124 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4125 w
= PyInt_FromLong((long)ch
);
4128 x
= PyObject_GetItem(mapping
, w
);
4131 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4132 /* No mapping found means: mapping is undefined. */
4141 if (PyInt_Check(x
)) {
4142 long value
= PyInt_AS_LONG(x
);
4143 if (value
< 0 || value
> 65535) {
4144 PyErr_SetString(PyExc_TypeError
,
4145 "character mapping must be in range(65536)");
4149 *p
++ = (Py_UNICODE
)value
;
4151 else if (x
== Py_None
) {
4152 /* undefined mapping */
4153 outpos
= p
-PyUnicode_AS_UNICODE(v
);
4154 startinpos
= s
-starts
;
4155 endinpos
= startinpos
+1;
4156 if (unicode_decode_call_errorhandler(
4157 errors
, &errorHandler
,
4158 "charmap", "character maps to <undefined>",
4159 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
4167 else if (PyUnicode_Check(x
)) {
4168 Py_ssize_t targetsize
= PyUnicode_GET_SIZE(x
);
4170 if (targetsize
== 1)
4172 *p
++ = *PyUnicode_AS_UNICODE(x
);
4174 else if (targetsize
> 1) {
4176 if (targetsize
> extrachars
) {
4178 Py_ssize_t oldpos
= p
- PyUnicode_AS_UNICODE(v
);
4179 Py_ssize_t needed
= (targetsize
- extrachars
) + \
4181 extrachars
+= needed
;
4182 /* XXX overflow detection missing */
4183 if (_PyUnicode_Resize(&v
,
4184 PyUnicode_GET_SIZE(v
) + needed
) < 0) {
4188 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
4191 PyUnicode_AS_UNICODE(x
),
4194 extrachars
-= targetsize
;
4196 /* 1-0 mapping: skip the character */
4199 /* wrong return value */
4200 PyErr_SetString(PyExc_TypeError
,
4201 "character mapping must return integer, None or unicode");
4209 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
4210 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
4212 Py_XDECREF(errorHandler
);
4214 return (PyObject
*)v
;
4217 Py_XDECREF(errorHandler
);
4223 /* Charmap encoding: the lookup table */
4225 struct encoding_map
{
4227 unsigned char level1
[32];
4229 unsigned char level23
[1];
4233 encoding_map_size(PyObject
*obj
, PyObject
* args
)
4235 struct encoding_map
*map
= (struct encoding_map
*)obj
;
4236 return PyInt_FromLong(sizeof(*map
) - 1 + 16*map
->count2
+
4240 static PyMethodDef encoding_map_methods
[] = {
4241 {"size", encoding_map_size
, METH_NOARGS
,
4242 PyDoc_STR("Return the size (in bytes) of this object") },
4247 encoding_map_dealloc(PyObject
* o
)
4252 static PyTypeObject EncodingMapType
= {
4253 PyVarObject_HEAD_INIT(NULL
, 0)
4254 "EncodingMap", /*tp_name*/
4255 sizeof(struct encoding_map
), /*tp_basicsize*/
4258 encoding_map_dealloc
, /*tp_dealloc*/
4265 0, /*tp_as_sequence*/
4266 0, /*tp_as_mapping*/
4273 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
4277 0, /*tp_richcompare*/
4278 0, /*tp_weaklistoffset*/
4281 encoding_map_methods
, /*tp_methods*/
4288 0, /*tp_dictoffset*/
4297 PyUnicode_BuildEncodingMap(PyObject
* string
)
4301 struct encoding_map
*mresult
;
4304 unsigned char level1
[32];
4305 unsigned char level2
[512];
4306 unsigned char *mlevel1
, *mlevel2
, *mlevel3
;
4307 int count2
= 0, count3
= 0;
4309 if (!PyUnicode_Check(string
) || PyUnicode_GetSize(string
) != 256) {
4310 PyErr_BadArgument();
4313 decode
= PyUnicode_AS_UNICODE(string
);
4314 memset(level1
, 0xFF, sizeof level1
);
4315 memset(level2
, 0xFF, sizeof level2
);
4317 /* If there isn't a one-to-one mapping of NULL to \0,
4318 or if there are non-BMP characters, we need to use
4319 a mapping dictionary. */
4322 for (i
= 1; i
< 256; i
++) {
4325 #ifdef Py_UNICODE_WIDE
4326 || decode
[i
] > 0xFFFF
4332 if (decode
[i
] == 0xFFFE)
4333 /* unmapped character */
4335 l1
= decode
[i
] >> 11;
4336 l2
= decode
[i
] >> 7;
4337 if (level1
[l1
] == 0xFF)
4338 level1
[l1
] = count2
++;
4339 if (level2
[l2
] == 0xFF)
4340 level2
[l2
] = count3
++;
4343 if (count2
>= 0xFF || count3
>= 0xFF)
4347 PyObject
*result
= PyDict_New();
4348 PyObject
*key
, *value
;
4351 for (i
= 0; i
< 256; i
++) {
4353 key
= PyInt_FromLong(decode
[i
]);
4354 value
= PyInt_FromLong(i
);
4357 if (PyDict_SetItem(result
, key
, value
) == -1)
4370 /* Create a three-level trie */
4371 result
= PyObject_MALLOC(sizeof(struct encoding_map
) +
4372 16*count2
+ 128*count3
- 1);
4374 return PyErr_NoMemory();
4375 PyObject_Init(result
, &EncodingMapType
);
4376 mresult
= (struct encoding_map
*)result
;
4377 mresult
->count2
= count2
;
4378 mresult
->count3
= count3
;
4379 mlevel1
= mresult
->level1
;
4380 mlevel2
= mresult
->level23
;
4381 mlevel3
= mresult
->level23
+ 16*count2
;
4382 memcpy(mlevel1
, level1
, 32);
4383 memset(mlevel2
, 0xFF, 16*count2
);
4384 memset(mlevel3
, 0, 128*count3
);
4386 for (i
= 1; i
< 256; i
++) {
4387 int o1
, o2
, o3
, i2
, i3
;
4388 if (decode
[i
] == 0xFFFE)
4389 /* unmapped character */
4392 o2
= (decode
[i
]>>7) & 0xF;
4393 i2
= 16*mlevel1
[o1
] + o2
;
4394 if (mlevel2
[i2
] == 0xFF)
4395 mlevel2
[i2
] = count3
++;
4396 o3
= decode
[i
] & 0x7F;
4397 i3
= 128*mlevel2
[i2
] + o3
;
4404 encoding_map_lookup(Py_UNICODE c
, PyObject
*mapping
)
4406 struct encoding_map
*map
= (struct encoding_map
*)mapping
;
4408 int l2
= (c
>>7) & 0xF;
4412 #ifdef Py_UNICODE_WIDE
4420 i
= map
->level1
[l1
];
4425 i
= map
->level23
[16*i
+l2
];
4430 i
= map
->level23
[16*map
->count2
+ 128*i
+ l3
];
4437 /* Look up the character c in the mapping. If the character
4438 can't be found, Py_None is returned (or NULL, if another
4440 static PyObject
*charmapencode_lookup(Py_UNICODE c
, PyObject
*mapping
)
4442 PyObject
*w
= PyInt_FromLong((long)c
);
4447 x
= PyObject_GetItem(mapping
, w
);
4450 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4451 /* No mapping found means: mapping is undefined. */
4459 else if (x
== Py_None
)
4461 else if (PyInt_Check(x
)) {
4462 long value
= PyInt_AS_LONG(x
);
4463 if (value
< 0 || value
> 255) {
4464 PyErr_SetString(PyExc_TypeError
,
4465 "character mapping must be in range(256)");
4471 else if (PyString_Check(x
))
4474 /* wrong return value */
4475 PyErr_SetString(PyExc_TypeError
,
4476 "character mapping must return integer, None or str");
4483 charmapencode_resize(PyObject
**outobj
, Py_ssize_t
*outpos
, Py_ssize_t requiredsize
)
4485 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4486 /* exponentially overallocate to minimize reallocations */
4487 if (requiredsize
< 2*outsize
)
4488 requiredsize
= 2*outsize
;
4489 if (_PyString_Resize(outobj
, requiredsize
)) {
4495 typedef enum charmapencode_result
{
4496 enc_SUCCESS
, enc_FAILED
, enc_EXCEPTION
4497 }charmapencode_result
;
4498 /* Look up the character, put the result in the output string and adjust
4499 various state variables. Reallocate the output string if not enough
4500 space is available. Return enc_SUCCESS if the character was written,
4501 enc_FAILED if the character is unencodable with this mapping
4502 (in which case no character was written), or enc_EXCEPTION if an
4503 error occurred (for example, the reallocation failed). */
4505 charmapencode_result
charmapencode_output(Py_UNICODE c
, PyObject
*mapping
,
4506 PyObject
**outobj
, Py_ssize_t
*outpos
)
4510 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4512 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4513 int res
= encoding_map_lookup(c
, mapping
);
4514 Py_ssize_t requiredsize
= *outpos
+1;
4517 if (outsize
<requiredsize
)
4518 if (!charmapencode_resize(outobj
, outpos
, requiredsize
))
4519 return enc_EXCEPTION
;
4520 outstart
= PyString_AS_STRING(*outobj
);
4521 outstart
[(*outpos
)++] = (char)res
;
4525 rep
= charmapencode_lookup(c
, mapping
);
4527 return enc_EXCEPTION
;
4528 else if (rep
==Py_None
) {
4532 if (PyInt_Check(rep
)) {
4533 Py_ssize_t requiredsize
= *outpos
+1;
4534 if (outsize
<requiredsize
)
4535 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4537 return enc_EXCEPTION
;
4539 outstart
= PyString_AS_STRING(*outobj
);
4540 outstart
[(*outpos
)++] = (char)PyInt_AS_LONG(rep
);
4543 const char *repchars
= PyString_AS_STRING(rep
);
4544 Py_ssize_t repsize
= PyString_GET_SIZE(rep
);
4545 Py_ssize_t requiredsize
= *outpos
+repsize
;
4546 if (outsize
<requiredsize
)
4547 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4549 return enc_EXCEPTION
;
4551 outstart
= PyString_AS_STRING(*outobj
);
4552 memcpy(outstart
+ *outpos
, repchars
, repsize
);
4560 /* handle an error in PyUnicode_EncodeCharmap
4561 Return 0 on success, -1 on error */
4563 int charmap_encoding_error(
4564 const Py_UNICODE
*p
, Py_ssize_t size
, Py_ssize_t
*inpos
, PyObject
*mapping
,
4565 PyObject
**exceptionObject
,
4566 int *known_errorHandler
, PyObject
**errorHandler
, const char *errors
,
4567 PyObject
**res
, Py_ssize_t
*respos
)
4569 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
4573 /* startpos for collecting unencodable chars */
4574 Py_ssize_t collstartpos
= *inpos
;
4575 Py_ssize_t collendpos
= *inpos
+1;
4577 char *encoding
= "charmap";
4578 char *reason
= "character maps to <undefined>";
4579 charmapencode_result x
;
4581 /* find all unencodable characters */
4582 while (collendpos
< size
) {
4584 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4585 int res
= encoding_map_lookup(p
[collendpos
], mapping
);
4592 rep
= charmapencode_lookup(p
[collendpos
], mapping
);
4595 else if (rep
!=Py_None
) {
4602 /* cache callback name lookup
4603 * (if not done yet, i.e. it's the first error) */
4604 if (*known_errorHandler
==-1) {
4605 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4606 *known_errorHandler
= 1;
4607 else if (!strcmp(errors
, "replace"))
4608 *known_errorHandler
= 2;
4609 else if (!strcmp(errors
, "ignore"))
4610 *known_errorHandler
= 3;
4611 else if (!strcmp(errors
, "xmlcharrefreplace"))
4612 *known_errorHandler
= 4;
4614 *known_errorHandler
= 0;
4616 switch (*known_errorHandler
) {
4617 case 1: /* strict */
4618 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4620 case 2: /* replace */
4621 for (collpos
= collstartpos
; collpos
<collendpos
; ++collpos
) {
4622 x
= charmapencode_output('?', mapping
, res
, respos
);
4623 if (x
==enc_EXCEPTION
) {
4626 else if (x
==enc_FAILED
) {
4627 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4632 case 3: /* ignore */
4633 *inpos
= collendpos
;
4635 case 4: /* xmlcharrefreplace */
4636 /* generate replacement */
4637 for (collpos
= collstartpos
; collpos
< collendpos
; ++collpos
) {
4638 char buffer
[2+29+1+1];
4640 sprintf(buffer
, "&#%d;", (int)p
[collpos
]);
4641 for (cp
= buffer
; *cp
; ++cp
) {
4642 x
= charmapencode_output(*cp
, mapping
, res
, respos
);
4643 if (x
==enc_EXCEPTION
)
4645 else if (x
==enc_FAILED
) {
4646 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4651 *inpos
= collendpos
;
4654 repunicode
= unicode_encode_call_errorhandler(errors
, errorHandler
,
4655 encoding
, reason
, p
, size
, exceptionObject
,
4656 collstartpos
, collendpos
, &newpos
);
4657 if (repunicode
== NULL
)
4659 /* generate replacement */
4660 repsize
= PyUnicode_GET_SIZE(repunicode
);
4661 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
4662 x
= charmapencode_output(*uni2
, mapping
, res
, respos
);
4663 if (x
==enc_EXCEPTION
) {
4666 else if (x
==enc_FAILED
) {
4667 Py_DECREF(repunicode
);
4668 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4673 Py_DECREF(repunicode
);
4678 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
4684 PyObject
*res
= NULL
;
4685 /* current input position */
4686 Py_ssize_t inpos
= 0;
4687 /* current output position */
4688 Py_ssize_t respos
= 0;
4689 PyObject
*errorHandler
= NULL
;
4690 PyObject
*exc
= NULL
;
4691 /* the following variable is used for caching string comparisons
4692 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4693 * 3=ignore, 4=xmlcharrefreplace */
4694 int known_errorHandler
= -1;
4696 /* Default to Latin-1 */
4697 if (mapping
== NULL
)
4698 return PyUnicode_EncodeLatin1(p
, size
, errors
);
4700 /* allocate enough for a simple encoding without
4701 replacements, if we need more, we'll resize */
4702 res
= PyString_FromStringAndSize(NULL
, size
);
4708 while (inpos
<size
) {
4709 /* try to encode it */
4710 charmapencode_result x
= charmapencode_output(p
[inpos
], mapping
, &res
, &respos
);
4711 if (x
==enc_EXCEPTION
) /* error */
4713 if (x
==enc_FAILED
) { /* unencodable character */
4714 if (charmap_encoding_error(p
, size
, &inpos
, mapping
,
4716 &known_errorHandler
, &errorHandler
, errors
,
4722 /* done with this character => adjust input position */
4726 /* Resize if we allocated too much */
4727 if (respos
<PyString_GET_SIZE(res
)) {
4728 if (_PyString_Resize(&res
, respos
))
4732 Py_XDECREF(errorHandler
);
4738 Py_XDECREF(errorHandler
);
4742 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
4745 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
4746 PyErr_BadArgument();
4749 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
4750 PyUnicode_GET_SIZE(unicode
),
4755 /* create or adjust a UnicodeTranslateError */
4756 static void make_translate_exception(PyObject
**exceptionObject
,
4757 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4758 Py_ssize_t startpos
, Py_ssize_t endpos
,
4761 if (*exceptionObject
== NULL
) {
4762 *exceptionObject
= PyUnicodeTranslateError_Create(
4763 unicode
, size
, startpos
, endpos
, reason
);
4766 if (PyUnicodeTranslateError_SetStart(*exceptionObject
, startpos
))
4768 if (PyUnicodeTranslateError_SetEnd(*exceptionObject
, endpos
))
4770 if (PyUnicodeTranslateError_SetReason(*exceptionObject
, reason
))
4774 Py_DECREF(*exceptionObject
);
4775 *exceptionObject
= NULL
;
4779 /* raises a UnicodeTranslateError */
4780 static void raise_translate_exception(PyObject
**exceptionObject
,
4781 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4782 Py_ssize_t startpos
, Py_ssize_t endpos
,
4785 make_translate_exception(exceptionObject
,
4786 unicode
, size
, startpos
, endpos
, reason
);
4787 if (*exceptionObject
!= NULL
)
4788 PyCodec_StrictErrors(*exceptionObject
);
4791 /* error handling callback helper:
4792 build arguments, call the callback and check the arguments,
4793 put the result into newpos and return the replacement string, which
4794 has to be freed by the caller */
4795 static PyObject
*unicode_translate_call_errorhandler(const char *errors
,
4796 PyObject
**errorHandler
,
4798 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
4799 Py_ssize_t startpos
, Py_ssize_t endpos
,
4802 static char *argparse
= "O!n;translating error handler must return (unicode, int) tuple";
4804 Py_ssize_t i_newpos
;
4806 PyObject
*resunicode
;
4808 if (*errorHandler
== NULL
) {
4809 *errorHandler
= PyCodec_LookupError(errors
);
4810 if (*errorHandler
== NULL
)
4814 make_translate_exception(exceptionObject
,
4815 unicode
, size
, startpos
, endpos
, reason
);
4816 if (*exceptionObject
== NULL
)
4819 restuple
= PyObject_CallFunctionObjArgs(
4820 *errorHandler
, *exceptionObject
, NULL
);
4821 if (restuple
== NULL
)
4823 if (!PyTuple_Check(restuple
)) {
4824 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
4825 Py_DECREF(restuple
);
4828 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
4829 &resunicode
, &i_newpos
)) {
4830 Py_DECREF(restuple
);
4834 *newpos
= size
+i_newpos
;
4837 if (*newpos
<0 || *newpos
>size
) {
4838 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
4839 Py_DECREF(restuple
);
4842 Py_INCREF(resunicode
);
4843 Py_DECREF(restuple
);
4847 /* Look up the character c in the mapping and put the result in *result,
4848 which must be decrefed by the caller.
4849 Return 0 on success, -1 on error */
4851 int charmaptranslate_lookup(Py_UNICODE c
, PyObject
*mapping
, PyObject
**result
)
4853 PyObject
*w
= PyInt_FromLong((long)c
);
4858 x
= PyObject_GetItem(mapping
, w
);
4861 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4862 /* No mapping found means: use 1:1 mapping. */
4869 else if (x
== Py_None
) {
4873 else if (PyInt_Check(x
)) {
4874 long value
= PyInt_AS_LONG(x
);
4875 long max
= PyUnicode_GetMax();
4876 if (value
< 0 || value
> max
) {
4877 PyErr_Format(PyExc_TypeError
,
4878 "character mapping must be in range(0x%lx)", max
+1);
4885 else if (PyUnicode_Check(x
)) {
4890 /* wrong return value */
4891 PyErr_SetString(PyExc_TypeError
,
4892 "character mapping must return integer, None or unicode");
4897 /* ensure that *outobj is at least requiredsize characters long,
4898 if not reallocate and adjust various state variables.
4899 Return 0 on success, -1 on error */
4901 int charmaptranslate_makespace(PyObject
**outobj
, Py_UNICODE
**outp
,
4902 Py_ssize_t requiredsize
)
4904 Py_ssize_t oldsize
= PyUnicode_GET_SIZE(*outobj
);
4905 if (requiredsize
> oldsize
) {
4906 /* remember old output position */
4907 Py_ssize_t outpos
= *outp
-PyUnicode_AS_UNICODE(*outobj
);
4908 /* exponentially overallocate to minimize reallocations */
4909 if (requiredsize
< 2 * oldsize
)
4910 requiredsize
= 2 * oldsize
;
4911 if (PyUnicode_Resize(outobj
, requiredsize
) < 0)
4913 *outp
= PyUnicode_AS_UNICODE(*outobj
) + outpos
;
4917 /* lookup the character, put the result in the output string and adjust
4918 various state variables. Return a new reference to the object that
4919 was put in the output buffer in *result, or Py_None, if the mapping was
4920 undefined (in which case no character was written).
4921 The caller must decref result.
4922 Return 0 on success, -1 on error. */
4924 int charmaptranslate_output(const Py_UNICODE
*startinp
, const Py_UNICODE
*curinp
,
4925 Py_ssize_t insize
, PyObject
*mapping
, PyObject
**outobj
, Py_UNICODE
**outp
,
4928 if (charmaptranslate_lookup(*curinp
, mapping
, res
))
4931 /* not found => default to 1:1 mapping */
4932 *(*outp
)++ = *curinp
;
4934 else if (*res
==Py_None
)
4936 else if (PyInt_Check(*res
)) {
4937 /* no overflow check, because we know that the space is enough */
4938 *(*outp
)++ = (Py_UNICODE
)PyInt_AS_LONG(*res
);
4940 else if (PyUnicode_Check(*res
)) {
4941 Py_ssize_t repsize
= PyUnicode_GET_SIZE(*res
);
4943 /* no overflow check, because we know that the space is enough */
4944 *(*outp
)++ = *PyUnicode_AS_UNICODE(*res
);
4946 else if (repsize
!=0) {
4947 /* more than one character */
4948 Py_ssize_t requiredsize
= (*outp
-PyUnicode_AS_UNICODE(*outobj
)) +
4949 (insize
- (curinp
-startinp
)) +
4951 if (charmaptranslate_makespace(outobj
, outp
, requiredsize
))
4953 memcpy(*outp
, PyUnicode_AS_UNICODE(*res
), sizeof(Py_UNICODE
)*repsize
);
4962 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*p
,
4968 PyObject
*res
= NULL
;
4969 /* pointers to the beginning and end+1 of input */
4970 const Py_UNICODE
*startp
= p
;
4971 const Py_UNICODE
*endp
= p
+ size
;
4972 /* pointer into the output */
4974 /* current output position */
4975 Py_ssize_t respos
= 0;
4976 char *reason
= "character maps to <undefined>";
4977 PyObject
*errorHandler
= NULL
;
4978 PyObject
*exc
= NULL
;
4979 /* the following variable is used for caching string comparisons
4980 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4981 * 3=ignore, 4=xmlcharrefreplace */
4982 int known_errorHandler
= -1;
4984 if (mapping
== NULL
) {
4985 PyErr_BadArgument();
4989 /* allocate enough for a simple 1:1 translation without
4990 replacements, if we need more, we'll resize */
4991 res
= PyUnicode_FromUnicode(NULL
, size
);
4996 str
= PyUnicode_AS_UNICODE(res
);
4999 /* try to translate it */
5001 if (charmaptranslate_output(startp
, p
, size
, mapping
, &res
, &str
, &x
)) {
5006 if (x
!=Py_None
) /* it worked => adjust input pointer */
5008 else { /* untranslatable character */
5009 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
5013 /* startpos for collecting untranslatable chars */
5014 const Py_UNICODE
*collstart
= p
;
5015 const Py_UNICODE
*collend
= p
+1;
5016 const Py_UNICODE
*coll
;
5018 /* find all untranslatable characters */
5019 while (collend
< endp
) {
5020 if (charmaptranslate_lookup(*collend
, mapping
, &x
))
5027 /* cache callback name lookup
5028 * (if not done yet, i.e. it's the first error) */
5029 if (known_errorHandler
==-1) {
5030 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
5031 known_errorHandler
= 1;
5032 else if (!strcmp(errors
, "replace"))
5033 known_errorHandler
= 2;
5034 else if (!strcmp(errors
, "ignore"))
5035 known_errorHandler
= 3;
5036 else if (!strcmp(errors
, "xmlcharrefreplace"))
5037 known_errorHandler
= 4;
5039 known_errorHandler
= 0;
5041 switch (known_errorHandler
) {
5042 case 1: /* strict */
5043 raise_translate_exception(&exc
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
5045 case 2: /* replace */
5046 /* No need to check for space, this is a 1:1 replacement */
5047 for (coll
= collstart
; coll
<collend
; ++coll
)
5050 case 3: /* ignore */
5053 case 4: /* xmlcharrefreplace */
5054 /* generate replacement (temporarily (mis)uses p) */
5055 for (p
= collstart
; p
< collend
; ++p
) {
5056 char buffer
[2+29+1+1];
5058 sprintf(buffer
, "&#%d;", (int)*p
);
5059 if (charmaptranslate_makespace(&res
, &str
,
5060 (str
-PyUnicode_AS_UNICODE(res
))+strlen(buffer
)+(endp
-collend
)))
5062 for (cp
= buffer
; *cp
; ++cp
)
5068 repunicode
= unicode_translate_call_errorhandler(errors
, &errorHandler
,
5069 reason
, startp
, size
, &exc
,
5070 collstart
-startp
, collend
-startp
, &newpos
);
5071 if (repunicode
== NULL
)
5073 /* generate replacement */
5074 repsize
= PyUnicode_GET_SIZE(repunicode
);
5075 if (charmaptranslate_makespace(&res
, &str
,
5076 (str
-PyUnicode_AS_UNICODE(res
))+repsize
+(endp
-collend
))) {
5077 Py_DECREF(repunicode
);
5080 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
)
5082 p
= startp
+ newpos
;
5083 Py_DECREF(repunicode
);
5087 /* Resize if we allocated too much */
5088 respos
= str
-PyUnicode_AS_UNICODE(res
);
5089 if (respos
<PyUnicode_GET_SIZE(res
)) {
5090 if (PyUnicode_Resize(&res
, respos
) < 0)
5094 Py_XDECREF(errorHandler
);
5100 Py_XDECREF(errorHandler
);
5104 PyObject
*PyUnicode_Translate(PyObject
*str
,
5110 str
= PyUnicode_FromObject(str
);
5113 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
5114 PyUnicode_GET_SIZE(str
),
5125 /* --- Decimal Encoder ---------------------------------------------------- */
5127 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
5132 Py_UNICODE
*p
, *end
;
5133 PyObject
*errorHandler
= NULL
;
5134 PyObject
*exc
= NULL
;
5135 const char *encoding
= "decimal";
5136 const char *reason
= "invalid decimal Unicode string";
5137 /* the following variable is used for caching string comparisons
5138 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5139 int known_errorHandler
= -1;
5141 if (output
== NULL
) {
5142 PyErr_BadArgument();
5149 register Py_UNICODE ch
= *p
;
5151 PyObject
*repunicode
;
5155 Py_UNICODE
*collstart
;
5156 Py_UNICODE
*collend
;
5158 if (Py_UNICODE_ISSPACE(ch
)) {
5163 decimal
= Py_UNICODE_TODECIMAL(ch
);
5165 *output
++ = '0' + decimal
;
5169 if (0 < ch
&& ch
< 256) {
5170 *output
++ = (char)ch
;
5174 /* All other characters are considered unencodable */
5177 while (collend
< end
) {
5178 if ((0 < *collend
&& *collend
< 256) ||
5179 !Py_UNICODE_ISSPACE(*collend
) ||
5180 Py_UNICODE_TODECIMAL(*collend
))
5183 /* cache callback name lookup
5184 * (if not done yet, i.e. it's the first error) */
5185 if (known_errorHandler
==-1) {
5186 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
5187 known_errorHandler
= 1;
5188 else if (!strcmp(errors
, "replace"))
5189 known_errorHandler
= 2;
5190 else if (!strcmp(errors
, "ignore"))
5191 known_errorHandler
= 3;
5192 else if (!strcmp(errors
, "xmlcharrefreplace"))
5193 known_errorHandler
= 4;
5195 known_errorHandler
= 0;
5197 switch (known_errorHandler
) {
5198 case 1: /* strict */
5199 raise_encode_exception(&exc
, encoding
, s
, length
, collstart
-s
, collend
-s
, reason
);
5201 case 2: /* replace */
5202 for (p
= collstart
; p
< collend
; ++p
)
5205 case 3: /* ignore */
5208 case 4: /* xmlcharrefreplace */
5209 /* generate replacement (temporarily (mis)uses p) */
5210 for (p
= collstart
; p
< collend
; ++p
)
5211 output
+= sprintf(output
, "&#%d;", (int)*p
);
5215 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
5216 encoding
, reason
, s
, length
, &exc
,
5217 collstart
-s
, collend
-s
, &newpos
);
5218 if (repunicode
== NULL
)
5220 /* generate replacement */
5221 repsize
= PyUnicode_GET_SIZE(repunicode
);
5222 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
5223 Py_UNICODE ch
= *uni2
;
5224 if (Py_UNICODE_ISSPACE(ch
))
5227 decimal
= Py_UNICODE_TODECIMAL(ch
);
5229 *output
++ = '0' + decimal
;
5230 else if (0 < ch
&& ch
< 256)
5231 *output
++ = (char)ch
;
5233 Py_DECREF(repunicode
);
5234 raise_encode_exception(&exc
, encoding
,
5235 s
, length
, collstart
-s
, collend
-s
, reason
);
5241 Py_DECREF(repunicode
);
5244 /* 0-terminate the output string */
5247 Py_XDECREF(errorHandler
);
5252 Py_XDECREF(errorHandler
);
5256 /* --- Helpers ------------------------------------------------------------ */
5258 #include "stringlib/unicodedefs.h"
5259 #include "stringlib/fastsearch.h"
5261 #include "stringlib/count.h"
5262 #include "stringlib/find.h"
5263 #include "stringlib/partition.h"
5264 #include "stringlib/split.h"
5266 /* helper macro to fixup start/end slice values */
5267 #define ADJUST_INDICES(start, end, len) \
5270 else if (end < 0) { \
5281 Py_ssize_t
PyUnicode_Count(PyObject
*str
,
5287 PyUnicodeObject
* str_obj
;
5288 PyUnicodeObject
* sub_obj
;
5290 str_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(str
);
5293 sub_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(substr
);
5299 ADJUST_INDICES(start
, end
, str_obj
->length
);
5300 result
= stringlib_count(
5301 str_obj
->str
+ start
, end
- start
, sub_obj
->str
, sub_obj
->length
,
5311 Py_ssize_t
PyUnicode_Find(PyObject
*str
,
5319 str
= PyUnicode_FromObject(str
);
5322 sub
= PyUnicode_FromObject(sub
);
5329 result
= stringlib_find_slice(
5330 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5331 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5335 result
= stringlib_rfind_slice(
5336 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5337 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5348 int tailmatch(PyUnicodeObject
*self
,
5349 PyUnicodeObject
*substring
,
5354 if (substring
->length
== 0)
5357 ADJUST_INDICES(start
, end
, self
->length
);
5358 end
-= substring
->length
;
5362 if (direction
> 0) {
5363 if (Py_UNICODE_MATCH(self
, end
, substring
))
5366 if (Py_UNICODE_MATCH(self
, start
, substring
))
5373 Py_ssize_t
PyUnicode_Tailmatch(PyObject
*str
,
5381 str
= PyUnicode_FromObject(str
);
5384 substr
= PyUnicode_FromObject(substr
);
5385 if (substr
== NULL
) {
5390 result
= tailmatch((PyUnicodeObject
*)str
,
5391 (PyUnicodeObject
*)substr
,
5392 start
, end
, direction
);
5398 /* Apply fixfct filter to the Unicode object self and return a
5399 reference to the modified object */
5402 PyObject
*fixup(PyUnicodeObject
*self
,
5403 int (*fixfct
)(PyUnicodeObject
*s
))
5408 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5412 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5414 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
5415 /* fixfct should return TRUE if it modified the buffer. If
5416 FALSE, return a reference to the original buffer instead
5417 (to save space, not time) */
5420 return (PyObject
*) self
;
5422 return (PyObject
*) u
;
5426 int fixupper(PyUnicodeObject
*self
)
5428 Py_ssize_t len
= self
->length
;
5429 Py_UNICODE
*s
= self
->str
;
5433 register Py_UNICODE ch
;
5435 ch
= Py_UNICODE_TOUPPER(*s
);
5447 int fixlower(PyUnicodeObject
*self
)
5449 Py_ssize_t len
= self
->length
;
5450 Py_UNICODE
*s
= self
->str
;
5454 register Py_UNICODE ch
;
5456 ch
= Py_UNICODE_TOLOWER(*s
);
5468 int fixswapcase(PyUnicodeObject
*self
)
5470 Py_ssize_t len
= self
->length
;
5471 Py_UNICODE
*s
= self
->str
;
5475 if (Py_UNICODE_ISUPPER(*s
)) {
5476 *s
= Py_UNICODE_TOLOWER(*s
);
5478 } else if (Py_UNICODE_ISLOWER(*s
)) {
5479 *s
= Py_UNICODE_TOUPPER(*s
);
5489 int fixcapitalize(PyUnicodeObject
*self
)
5491 Py_ssize_t len
= self
->length
;
5492 Py_UNICODE
*s
= self
->str
;
5497 if (Py_UNICODE_ISLOWER(*s
)) {
5498 *s
= Py_UNICODE_TOUPPER(*s
);
5503 if (Py_UNICODE_ISUPPER(*s
)) {
5504 *s
= Py_UNICODE_TOLOWER(*s
);
5513 int fixtitle(PyUnicodeObject
*self
)
5515 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5516 register Py_UNICODE
*e
;
5517 int previous_is_cased
;
5519 /* Shortcut for single character strings */
5520 if (PyUnicode_GET_SIZE(self
) == 1) {
5521 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
5530 e
= p
+ PyUnicode_GET_SIZE(self
);
5531 previous_is_cased
= 0;
5532 for (; p
< e
; p
++) {
5533 register const Py_UNICODE ch
= *p
;
5535 if (previous_is_cased
)
5536 *p
= Py_UNICODE_TOLOWER(ch
);
5538 *p
= Py_UNICODE_TOTITLE(ch
);
5540 if (Py_UNICODE_ISLOWER(ch
) ||
5541 Py_UNICODE_ISUPPER(ch
) ||
5542 Py_UNICODE_ISTITLE(ch
))
5543 previous_is_cased
= 1;
5545 previous_is_cased
= 0;
5551 PyUnicode_Join(PyObject
*separator
, PyObject
*seq
)
5553 PyObject
*internal_separator
= NULL
;
5554 const Py_UNICODE blank
= ' ';
5555 const Py_UNICODE
*sep
= &blank
;
5556 Py_ssize_t seplen
= 1;
5557 PyUnicodeObject
*res
= NULL
; /* the result */
5558 Py_ssize_t res_alloc
= 100; /* # allocated bytes for string in res */
5559 Py_ssize_t res_used
; /* # used bytes */
5560 Py_UNICODE
*res_p
; /* pointer to free byte in res's string area */
5561 PyObject
*fseq
; /* PySequence_Fast(seq) */
5562 Py_ssize_t seqlen
; /* len(fseq) -- number of items in sequence */
5566 fseq
= PySequence_Fast(seq
, "");
5571 /* Grrrr. A codec may be invoked to convert str objects to
5572 * Unicode, and so it's possible to call back into Python code
5573 * during PyUnicode_FromObject(), and so it's possible for a sick
5574 * codec to change the size of fseq (if seq is a list). Therefore
5575 * we have to keep refetching the size -- can't assume seqlen
5578 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5579 /* If empty sequence, return u"". */
5581 res
= _PyUnicode_New(0); /* empty sequence; return u"" */
5584 /* If singleton sequence with an exact Unicode, return that. */
5586 item
= PySequence_Fast_GET_ITEM(fseq
, 0);
5587 if (PyUnicode_CheckExact(item
)) {
5589 res
= (PyUnicodeObject
*)item
;
5594 /* At least two items to join, or one that isn't exact Unicode. */
5596 /* Set up sep and seplen -- they're needed. */
5597 if (separator
== NULL
) {
5602 internal_separator
= PyUnicode_FromObject(separator
);
5603 if (internal_separator
== NULL
)
5605 sep
= PyUnicode_AS_UNICODE(internal_separator
);
5606 seplen
= PyUnicode_GET_SIZE(internal_separator
);
5607 /* In case PyUnicode_FromObject() mutated seq. */
5608 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5613 res
= _PyUnicode_New(res_alloc
);
5616 res_p
= PyUnicode_AS_UNICODE(res
);
5619 for (i
= 0; i
< seqlen
; ++i
) {
5621 Py_ssize_t new_res_used
;
5623 item
= PySequence_Fast_GET_ITEM(fseq
, i
);
5624 /* Convert item to Unicode. */
5625 if (! PyUnicode_Check(item
) && ! PyString_Check(item
)) {
5626 PyErr_Format(PyExc_TypeError
,
5627 "sequence item %zd: expected string or Unicode,"
5629 i
, Py_TYPE(item
)->tp_name
);
5632 item
= PyUnicode_FromObject(item
);
5635 /* We own a reference to item from here on. */
5637 /* In case PyUnicode_FromObject() mutated seq. */
5638 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5640 /* Make sure we have enough space for the separator and the item. */
5641 itemlen
= PyUnicode_GET_SIZE(item
);
5642 new_res_used
= res_used
+ itemlen
;
5643 if (new_res_used
< 0)
5645 if (i
< seqlen
- 1) {
5646 new_res_used
+= seplen
;
5647 if (new_res_used
< 0)
5650 if (new_res_used
> res_alloc
) {
5651 /* double allocated size until it's big enough */
5653 res_alloc
+= res_alloc
;
5656 } while (new_res_used
> res_alloc
);
5657 if (_PyUnicode_Resize(&res
, res_alloc
) < 0) {
5661 res_p
= PyUnicode_AS_UNICODE(res
) + res_used
;
5664 /* Copy item, and maybe the separator. */
5665 Py_UNICODE_COPY(res_p
, PyUnicode_AS_UNICODE(item
), itemlen
);
5667 if (i
< seqlen
- 1) {
5668 Py_UNICODE_COPY(res_p
, sep
, seplen
);
5672 res_used
= new_res_used
;
5675 /* Shrink res to match the used area; this probably can't fail,
5676 * but it's cheap to check.
5678 if (_PyUnicode_Resize(&res
, res_used
) < 0)
5682 Py_XDECREF(internal_separator
);
5684 return (PyObject
*)res
;
5687 PyErr_SetString(PyExc_OverflowError
,
5688 "join() result is too long for a Python string");
5693 Py_XDECREF(internal_separator
);
5700 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
5712 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
5717 if (left
> PY_SSIZE_T_MAX
- self
->length
||
5718 right
> PY_SSIZE_T_MAX
- (left
+ self
->length
)) {
5719 PyErr_SetString(PyExc_OverflowError
, "padded string is too long");
5722 u
= _PyUnicode_New(left
+ self
->length
+ right
);
5725 Py_UNICODE_FILL(u
->str
, fill
, left
);
5726 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
5728 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
5734 PyObject
*PyUnicode_Splitlines(PyObject
*string
, int keepends
)
5738 string
= PyUnicode_FromObject(string
);
5742 list
= stringlib_splitlines(
5743 (PyObject
*) string
, PyUnicode_AS_UNICODE(string
),
5744 PyUnicode_GET_SIZE(string
), keepends
);
5751 PyObject
*split(PyUnicodeObject
*self
,
5752 PyUnicodeObject
*substring
,
5753 Py_ssize_t maxcount
)
5756 maxcount
= PY_SSIZE_T_MAX
;
5758 if (substring
== NULL
)
5759 return stringlib_split_whitespace(
5760 (PyObject
*) self
, self
->str
, self
->length
, maxcount
5763 return stringlib_split(
5764 (PyObject
*) self
, self
->str
, self
->length
,
5765 substring
->str
, substring
->length
,
5771 PyObject
*rsplit(PyUnicodeObject
*self
,
5772 PyUnicodeObject
*substring
,
5773 Py_ssize_t maxcount
)
5776 maxcount
= PY_SSIZE_T_MAX
;
5778 if (substring
== NULL
)
5779 return stringlib_rsplit_whitespace(
5780 (PyObject
*) self
, self
->str
, self
->length
, maxcount
5783 return stringlib_rsplit(
5784 (PyObject
*) self
, self
->str
, self
->length
,
5785 substring
->str
, substring
->length
,
5791 PyObject
*replace(PyUnicodeObject
*self
,
5792 PyUnicodeObject
*str1
,
5793 PyUnicodeObject
*str2
,
5794 Py_ssize_t maxcount
)
5799 maxcount
= PY_SSIZE_T_MAX
;
5800 else if (maxcount
== 0 || self
->length
== 0)
5803 if (str1
->length
== str2
->length
) {
5806 if (str1
->length
== 0)
5808 if (str1
->length
== 1) {
5809 /* replace characters */
5811 if (!findchar(self
->str
, self
->length
, str1
->str
[0]))
5813 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5816 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5819 for (i
= 0; i
< u
->length
; i
++)
5820 if (u
->str
[i
] == u1
) {
5827 self
->str
, self
->length
, str1
->str
, str1
->length
, 0
5831 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5834 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5836 /* change everything in-place, starting with this one */
5837 Py_UNICODE_COPY(u
->str
+i
, str2
->str
, str2
->length
);
5840 while ( --maxcount
> 0) {
5841 i
= stringlib_find(self
->str
+i
, self
->length
-i
,
5842 str1
->str
, str1
->length
,
5846 Py_UNICODE_COPY(u
->str
+i
, str2
->str
, str2
->length
);
5852 Py_ssize_t n
, i
, j
, e
;
5853 Py_ssize_t product
, new_size
, delta
;
5856 /* replace strings */
5857 n
= stringlib_count(self
->str
, self
->length
, str1
->str
, str1
->length
,
5861 /* new_size = self->length + n * (str2->length - str1->length)); */
5862 delta
= (str2
->length
- str1
->length
);
5864 new_size
= self
->length
;
5866 product
= n
* (str2
->length
- str1
->length
);
5867 if ((product
/ (str2
->length
- str1
->length
)) != n
) {
5868 PyErr_SetString(PyExc_OverflowError
,
5869 "replace string is too long");
5872 new_size
= self
->length
+ product
;
5874 PyErr_SetString(PyExc_OverflowError
,
5875 "replace string is too long");
5879 u
= _PyUnicode_New(new_size
);
5884 e
= self
->length
- str1
->length
;
5885 if (str1
->length
> 0) {
5887 /* look for next match */
5888 j
= stringlib_find(self
->str
+i
, self
->length
-i
,
5889 str1
->str
, str1
->length
,
5894 /* copy unchanged part [i:j] */
5895 Py_UNICODE_COPY(p
, self
->str
+i
, j
-i
);
5898 /* copy substitution string */
5899 if (str2
->length
> 0) {
5900 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
5903 i
= j
+ str1
->length
;
5905 if (i
< self
->length
)
5906 /* copy tail [i:] */
5907 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
5911 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
5915 *p
++ = self
->str
[i
++];
5917 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
5920 return (PyObject
*) u
;
5923 /* nothing to replace; return original string (when possible) */
5924 if (PyUnicode_CheckExact(self
)) {
5926 return (PyObject
*) self
;
5928 return PyUnicode_FromUnicode(self
->str
, self
->length
);
5931 /* --- Unicode Object Methods --------------------------------------------- */
5933 PyDoc_STRVAR(title__doc__
,
5934 "S.title() -> unicode\n\
5936 Return a titlecased version of S, i.e. words start with title case\n\
5937 characters, all remaining cased characters have lower case.");
5940 unicode_title(PyUnicodeObject
*self
)
5942 return fixup(self
, fixtitle
);
5945 PyDoc_STRVAR(capitalize__doc__
,
5946 "S.capitalize() -> unicode\n\
5948 Return a capitalized version of S, i.e. make the first character\n\
5952 unicode_capitalize(PyUnicodeObject
*self
)
5954 return fixup(self
, fixcapitalize
);
5958 PyDoc_STRVAR(capwords__doc__
,
5959 "S.capwords() -> unicode\n\
5961 Apply .capitalize() to all words in S and return the result with\n\
5962 normalized whitespace (all whitespace strings are replaced by ' ').");
5965 unicode_capwords(PyUnicodeObject
*self
)
5971 /* Split into words */
5972 list
= split(self
, NULL
, -1);
5976 /* Capitalize each word */
5977 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
5978 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
5982 Py_DECREF(PyList_GET_ITEM(list
, i
));
5983 PyList_SET_ITEM(list
, i
, item
);
5986 /* Join the words to form a new string */
5987 item
= PyUnicode_Join(NULL
, list
);
5991 return (PyObject
*)item
;
5995 /* Argument converter. Coerces to a single unicode character */
5998 convert_uc(PyObject
*obj
, void *addr
)
6000 Py_UNICODE
*fillcharloc
= (Py_UNICODE
*)addr
;
6004 uniobj
= PyUnicode_FromObject(obj
);
6005 if (uniobj
== NULL
) {
6006 PyErr_SetString(PyExc_TypeError
,
6007 "The fill character cannot be converted to Unicode");
6010 if (PyUnicode_GET_SIZE(uniobj
) != 1) {
6011 PyErr_SetString(PyExc_TypeError
,
6012 "The fill character must be exactly one character long");
6016 unistr
= PyUnicode_AS_UNICODE(uniobj
);
6017 *fillcharloc
= unistr
[0];
6022 PyDoc_STRVAR(center__doc__
,
6023 "S.center(width[, fillchar]) -> unicode\n\
6025 Return S centered in a Unicode string of length width. Padding is\n\
6026 done using the specified fill character (default is a space)");
6029 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
6031 Py_ssize_t marg
, left
;
6033 Py_UNICODE fillchar
= ' ';
6035 if (!PyArg_ParseTuple(args
, "n|O&:center", &width
, convert_uc
, &fillchar
))
6038 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6040 return (PyObject
*) self
;
6043 marg
= width
- self
->length
;
6044 left
= marg
/ 2 + (marg
& width
& 1);
6046 return (PyObject
*) pad(self
, left
, marg
- left
, fillchar
);
6051 /* This code should go into some future Unicode collation support
6052 module. The basic comparison should compare ordinals on a naive
6053 basis (this is what Java does and thus Jython too). */
6055 /* speedy UTF-16 code point order comparison */
6057 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6059 static short utf16Fixup
[32] =
6061 0, 0, 0, 0, 0, 0, 0, 0,
6062 0, 0, 0, 0, 0, 0, 0, 0,
6063 0, 0, 0, 0, 0, 0, 0, 0,
6064 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6068 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6070 Py_ssize_t len1
, len2
;
6072 Py_UNICODE
*s1
= str1
->str
;
6073 Py_UNICODE
*s2
= str2
->str
;
6075 len1
= str1
->length
;
6076 len2
= str2
->length
;
6078 while (len1
> 0 && len2
> 0) {
6084 if (c1
> (1<<11) * 26)
6085 c1
+= utf16Fixup
[c1
>>11];
6086 if (c2
> (1<<11) * 26)
6087 c2
+= utf16Fixup
[c2
>>11];
6088 /* now c1 and c2 are in UTF-32-compatible order */
6091 return (c1
< c2
) ? -1 : 1;
6096 return (len1
< len2
) ? -1 : (len1
!= len2
);
6102 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6104 register Py_ssize_t len1
, len2
;
6106 Py_UNICODE
*s1
= str1
->str
;
6107 Py_UNICODE
*s2
= str2
->str
;
6109 len1
= str1
->length
;
6110 len2
= str2
->length
;
6112 while (len1
> 0 && len2
> 0) {
6119 return (c1
< c2
) ? -1 : 1;
6124 return (len1
< len2
) ? -1 : (len1
!= len2
);
6129 int PyUnicode_Compare(PyObject
*left
,
6132 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
6135 /* Coerce the two arguments */
6136 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6139 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6143 /* Shortcut for empty or interned objects */
6150 result
= unicode_compare(u
, v
);
6162 PyObject
*PyUnicode_RichCompare(PyObject
*left
,
6168 result
= PyUnicode_Compare(left
, right
);
6169 if (result
== -1 && PyErr_Occurred())
6172 /* Convert the return value to a Boolean */
6175 result
= (result
== 0);
6178 result
= (result
!= 0);
6181 result
= (result
<= 0);
6184 result
= (result
>= 0);
6187 result
= (result
== -1);
6190 result
= (result
== 1);
6193 return PyBool_FromLong(result
);
6199 Type errors mean that PyUnicode_FromObject() could not convert
6200 one of the arguments (usually the right hand side) to Unicode,
6201 i.e. we can't handle the comparison request. However, it is
6202 possible that the other object knows a comparison method, which
6203 is why we return Py_NotImplemented to give the other object a
6207 if (PyErr_ExceptionMatches(PyExc_TypeError
)) {
6209 Py_INCREF(Py_NotImplemented
);
6210 return Py_NotImplemented
;
6212 if (op
!= Py_EQ
&& op
!= Py_NE
)
6215 /* Equality comparison.
6217 This is a special case: we silence any PyExc_UnicodeDecodeError
6218 and instead turn it into a PyErr_UnicodeWarning.
6221 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError
))
6224 if (PyErr_Warn(PyExc_UnicodeWarning
,
6226 "Unicode equal comparison "
6227 "failed to convert both arguments to Unicode - "
6228 "interpreting them as being unequal" :
6229 "Unicode unequal comparison "
6230 "failed to convert both arguments to Unicode - "
6231 "interpreting them as being unequal"
6234 result
= (op
== Py_NE
);
6235 return PyBool_FromLong(result
);
6238 int PyUnicode_Contains(PyObject
*container
,
6241 PyObject
*str
, *sub
;
6244 /* Coerce the two arguments */
6245 sub
= PyUnicode_FromObject(element
);
6250 str
= PyUnicode_FromObject(container
);
6256 result
= stringlib_contains_obj(str
, sub
);
6264 /* Concat to string or Unicode object giving a new Unicode object. */
6266 PyObject
*PyUnicode_Concat(PyObject
*left
,
6269 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
6271 /* Coerce the two arguments */
6272 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6275 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6280 if (v
== unicode_empty
) {
6282 return (PyObject
*)u
;
6284 if (u
== unicode_empty
) {
6286 return (PyObject
*)v
;
6289 /* Concat the two Unicode strings */
6290 w
= _PyUnicode_New(u
->length
+ v
->length
);
6293 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
6294 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
6298 return (PyObject
*)w
;
6306 PyDoc_STRVAR(count__doc__
,
6307 "S.count(sub[, start[, end]]) -> int\n\
6309 Return the number of non-overlapping occurrences of substring sub in\n\
6310 Unicode string S[start:end]. Optional arguments start and end are\n\
6311 interpreted as in slice notation.");
6314 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
6316 PyUnicodeObject
*substring
;
6317 Py_ssize_t start
= 0;
6318 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6321 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
6322 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6325 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
6326 (PyObject
*)substring
);
6327 if (substring
== NULL
)
6330 ADJUST_INDICES(start
, end
, self
->length
);
6331 result
= PyInt_FromSsize_t(
6332 stringlib_count(self
->str
+ start
, end
- start
,
6333 substring
->str
, substring
->length
,
6337 Py_DECREF(substring
);
/* unicode_encode(): implements S.encode().  `encoding` and `errors` are
   optional C strings (positional or keyword, format "|ss:encode") that
   default to NULL.  The work is delegated to PyUnicode_AsEncodedObject();
   a result that is neither a str (PyString_Check) nor a unicode object
   (PyUnicode_Check) is reported as a TypeError naming the offending type
   via Py_TYPE(v)->tp_name. */
6342 PyDoc_STRVAR(encode__doc__
,
6343 "S.encode([encoding[,errors]]) -> string or unicode\n\
6345 Encodes S using the codec registered for encoding. encoding defaults\n\
6346 to the default encoding. errors may be given to set a different error\n\
6347 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6348 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6349 'xmlcharrefreplace' as well as any other name registered with\n\
6350 codecs.register_error that can handle UnicodeEncodeErrors.");
6353 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
, PyObject
*kwargs
)
6355 static char *kwlist
[] = {"encoding", "errors", 0};
6356 char *encoding
= NULL
;
6357 char *errors
= NULL
;
6360 if (!PyArg_ParseTupleAndKeywords(args
, kwargs
, "|ss:encode",
6361 kwlist
, &encoding
, &errors
))
6363 v
= PyUnicode_AsEncodedObject((PyObject
*)self
, encoding
, errors
);
6366 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6367 PyErr_Format(PyExc_TypeError
,
6368 "encoder did not return a string/unicode object "
6370 Py_TYPE(v
)->tp_name
);
/* unicode_decode(): implements S.decode().  `encoding` and `errors` are
   optional C strings (positional or keyword, format "|ss:decode") that
   default to NULL.  The work is delegated to PyUnicode_AsDecodedObject();
   a result that is neither a str (PyString_Check) nor a unicode object
   (PyUnicode_Check) is reported as a TypeError naming the offending type
   via Py_TYPE(v)->tp_name.
   The docstring below had the typo "registerd"; it now reads "registered",
   matching the wording of the encode docstring. */
6380 PyDoc_STRVAR(decode__doc__
,
6381 "S.decode([encoding[,errors]]) -> string or unicode\n\
6383 Decodes S using the codec registered for encoding. encoding defaults\n\
6384 to the default encoding. errors may be given to set a different error\n\
6385 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6386 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6387 as well as any other name registered with codecs.register_error that is\n\
6388 able to handle UnicodeDecodeErrors.");
6391 unicode_decode(PyUnicodeObject
*self
, PyObject
*args
, PyObject
*kwargs
)
6393 static char *kwlist
[] = {"encoding", "errors", 0};
6394 char *encoding
= NULL
;
6395 char *errors
= NULL
;
6398 if (!PyArg_ParseTupleAndKeywords(args
, kwargs
, "|ss:decode",
6399 kwlist
, &encoding
, &errors
))
6401 v
= PyUnicode_AsDecodedObject((PyObject
*)self
, encoding
, errors
);
6404 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6405 PyErr_Format(PyExc_TypeError
,
6406 "decoder did not return a string/unicode object "
6408 Py_TYPE(v
)->tp_name
);
/* unicode_expandtabs(): implements S.expandtabs() in two passes over
   self->str.  tabsize is parsed as an optional C int ("|i:expandtabs").
   Pass 1 sizes the output: i counts characters up to and including the most
   recent line break, j counts characters since then, and a tab advances by
   incr = tabsize - (j % tabsize).  Every addition is guarded against
   PY_SSIZE_T_MAX first.
   Pass 2 allocates i + j characters with _PyUnicode_New() and writes through
   q, with qe marking the end of the output buffer.
   The trailing PyErr_SetString() raises OverflowError ("new string is too
   long"); presumably it is the target of the size guards -- the jumps
   themselves are not visible in this listing. */
6418 PyDoc_STRVAR(expandtabs__doc__
,
6419 "S.expandtabs([tabsize]) -> unicode\n\
6421 Return a copy of S where all tab characters are expanded using spaces.\n\
6422 If tabsize is not given, a tab size of 8 characters is assumed.");
6425 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
6431 Py_ssize_t i
, j
, incr
;
6435 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
6438 /* First pass: determine size of output string */
6439 i
= 0; /* chars up to and including most recent \n or \r */
6440 j
= 0; /* chars since most recent \n or \r (use in tab calculations) */
6441 e
= self
->str
+ self
->length
; /* end of input */
6442 for (p
= self
->str
; p
< e
; p
++)
6445 incr
= tabsize
- (j
% tabsize
); /* cannot overflow */
6446 if (j
> PY_SSIZE_T_MAX
- incr
)
6452 if (j
> PY_SSIZE_T_MAX
- 1)
6455 if (*p
== '\n' || *p
== '\r') {
6456 if (i
> PY_SSIZE_T_MAX
- j
)
6463 if (i
> PY_SSIZE_T_MAX
- j
)
6466 /* Second pass: create output string and fill it */
6467 u
= _PyUnicode_New(i
+ j
);
6471 j
= 0; /* same as in first pass */
6472 q
= u
->str
; /* next output char */
6473 qe
= u
->str
+ u
->length
; /* end of output */
6475 for (p
= self
->str
; p
< e
; p
++)
6478 i
= tabsize
- (j
% tabsize
);
6492 if (*p
== '\n' || *p
== '\r')
6496 return (PyObject
*) u
;
6501 PyErr_SetString(PyExc_OverflowError
, "new string is too long");
/* unicode_find(): implements S.find().  _ParseTupleFinds() fills in
   substring, start and end from args.  The search itself is
   stringlib_find_slice() over self's buffer/size and the substring's
   buffer/size.  The substring reference is dropped with Py_DECREF() and the
   index is returned as a Python int via PyInt_FromSsize_t(). */
6505 PyDoc_STRVAR(find__doc__
,
6506 "S.find(sub [,start [,end]]) -> int\n\
6508 Return the lowest index in S where substring sub is found,\n\
6509 such that sub is contained within s[start:end]. Optional\n\
6510 arguments start and end are interpreted as in slice notation.\n\
6512 Return -1 on failure.");
6515 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
6517 PyObject
*substring
;
6522 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
6525 result
= stringlib_find_slice(
6526 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6527 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6531 Py_DECREF(substring
);
6533 return PyInt_FromSsize_t(result
);
/* unicode_getitem(): item access by index.  An index below 0 or at/after
   self->length raises IndexError ("string index out of range"); otherwise a
   new one-character Unicode object is built from &self->str[index] with
   PyUnicode_FromUnicode(). */
6537 unicode_getitem(PyUnicodeObject
*self
, Py_ssize_t index
)
6539 if (index
< 0 || index
>= self
->length
) {
6540 PyErr_SetString(PyExc_IndexError
, "string index out of range");
6544 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
/* unicode_hash(): hash of a Unicode object.  self->hash is compared with -1
   first (looks like a cached-value check; the early return is not visible
   in this listing -- confirm).  The value is accumulated one character at a
   time as x = (1000003*x) ^ *p++ and finally XORed with the string's
   length. */
6548 unicode_hash(PyUnicodeObject
*self
)
6550 /* Since Unicode objects compare equal to their ASCII string
6551 counterparts, they should use the individual character values
6552 as basis for their hash value. This is needed to assure that
6553 strings and Unicode objects behave in the same way as
6556 register Py_ssize_t len
;
6557 register Py_UNICODE
*p
;
6560 if (self
->hash
!= -1)
6562 len
= PyUnicode_GET_SIZE(self
);
6563 p
= PyUnicode_AS_UNICODE(self
);
6566 x
= (1000003*x
) ^ *p
++;
6567 x
^= PyUnicode_GET_SIZE(self
);
/* unicode_index(): implements S.index().  Same search as S.find():
   _ParseTupleFinds() parses the arguments and stringlib_find_slice()
   locates the substring, which is then released with Py_DECREF().
   ValueError ("substring not found") is raised on the failure path
   (presumably when the result is negative; the test itself is not visible
   in this listing); otherwise the index is returned via
   PyInt_FromSsize_t(). */
6574 PyDoc_STRVAR(index__doc__
,
6575 "S.index(sub [,start [,end]]) -> int\n\
6577 Like S.find() but raise ValueError when the substring is not found.");
6580 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
6583 PyObject
*substring
;
6587 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
6590 result
= stringlib_find_slice(
6591 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6592 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6596 Py_DECREF(substring
);
6599 PyErr_SetString(PyExc_ValueError
, "substring not found");
6603 return PyInt_FromSsize_t(result
);
/* unicode_islower(): implements S.islower().  A one-character string is
   answered directly from Py_UNICODE_ISLOWER(); an empty string is False.
   Otherwise the characters are scanned: any uppercase or titlecase
   character yields False at once, and a lowercase character is noted in
   `cased` (the assignment is not visible in this listing).  The final
   answer is PyBool_FromLong(cased). */
6606 PyDoc_STRVAR(islower__doc__
,
6607 "S.islower() -> bool\n\
6609 Return True if all cased characters in S are lowercase and there is\n\
6610 at least one cased character in S, False otherwise.");
6613 unicode_islower(PyUnicodeObject
*self
)
6615 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6616 register const Py_UNICODE
*e
;
6619 /* Shortcut for single character strings */
6620 if (PyUnicode_GET_SIZE(self
) == 1)
6621 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p
));
6623 /* Special case for empty strings */
6624 if (PyUnicode_GET_SIZE(self
) == 0)
6625 return PyBool_FromLong(0);
6627 e
= p
+ PyUnicode_GET_SIZE(self
);
6629 for (; p
< e
; p
++) {
6630 register const Py_UNICODE ch
= *p
;
6632 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
6633 return PyBool_FromLong(0);
6634 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
6637 return PyBool_FromLong(cased
);
/* unicode_isupper(): implements S.isupper().  A one-character string is
   answered directly from Py_UNICODE_ISUPPER() (normalised with != 0); an
   empty string is False.  Otherwise the characters are scanned: any
   lowercase or titlecase character yields False at once, and an uppercase
   character is noted in `cased` (the assignment is not visible in this
   listing).  The final answer is PyBool_FromLong(cased). */
6640 PyDoc_STRVAR(isupper__doc__
,
6641 "S.isupper() -> bool\n\
6643 Return True if all cased characters in S are uppercase and there is\n\
6644 at least one cased character in S, False otherwise.");
6647 unicode_isupper(PyUnicodeObject
*self
)
6649 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6650 register const Py_UNICODE
*e
;
6653 /* Shortcut for single character strings */
6654 if (PyUnicode_GET_SIZE(self
) == 1)
6655 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
6657 /* Special case for empty strings */
6658 if (PyUnicode_GET_SIZE(self
) == 0)
6659 return PyBool_FromLong(0);
6661 e
= p
+ PyUnicode_GET_SIZE(self
);
6663 for (; p
< e
; p
++) {
6664 register const Py_UNICODE ch
= *p
;
6666 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
6667 return PyBool_FromLong(0);
6668 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
6671 return PyBool_FromLong(cased
);
/* unicode_istitle(): implements S.istitle().  A one-character string is
   True when the character is titlecase or uppercase; an empty string is
   False.  Otherwise the scan tracks previous_is_cased: an upper/titlecase
   character directly after a cased one, or a lowercase character after an
   uncased one, yields False; any other character resets previous_is_cased
   to 0.  The final answer is PyBool_FromLong(cased); the statements that
   set `cased` are not visible in this listing. */
6674 PyDoc_STRVAR(istitle__doc__
,
6675 "S.istitle() -> bool\n\
6677 Return True if S is a titlecased string and there is at least one\n\
6678 character in S, i.e. upper- and titlecase characters may only\n\
6679 follow uncased characters and lowercase characters only cased ones.\n\
6680 Return False otherwise.");
6683 unicode_istitle(PyUnicodeObject
*self
)
6685 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6686 register const Py_UNICODE
*e
;
6687 int cased
, previous_is_cased
;
6689 /* Shortcut for single character strings */
6690 if (PyUnicode_GET_SIZE(self
) == 1)
6691 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
6692 (Py_UNICODE_ISUPPER(*p
) != 0));
6694 /* Special case for empty strings */
6695 if (PyUnicode_GET_SIZE(self
) == 0)
6696 return PyBool_FromLong(0);
6698 e
= p
+ PyUnicode_GET_SIZE(self
);
6700 previous_is_cased
= 0;
6701 for (; p
< e
; p
++) {
6702 register const Py_UNICODE ch
= *p
;
6704 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
6705 if (previous_is_cased
)
6706 return PyBool_FromLong(0);
6707 previous_is_cased
= 1;
6710 else if (Py_UNICODE_ISLOWER(ch
)) {
6711 if (!previous_is_cased
)
6712 return PyBool_FromLong(0);
6713 previous_is_cased
= 1;
6717 previous_is_cased
= 0;
6719 return PyBool_FromLong(cased
);
/* unicode_isspace(): implements S.isspace().  True immediately for a
   one-character string whose character passes Py_UNICODE_ISSPACE(); False
   for an empty string; otherwise False at the first character that fails
   the test and True when the scan reaches the end. */
6722 PyDoc_STRVAR(isspace__doc__
,
6723 "S.isspace() -> bool\n\
6725 Return True if all characters in S are whitespace\n\
6726 and there is at least one character in S, False otherwise.");
6729 unicode_isspace(PyUnicodeObject
*self
)
6731 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6732 register const Py_UNICODE
*e
;
6734 /* Shortcut for single character strings */
6735 if (PyUnicode_GET_SIZE(self
) == 1 &&
6736 Py_UNICODE_ISSPACE(*p
))
6737 return PyBool_FromLong(1);
6739 /* Special case for empty strings */
6740 if (PyUnicode_GET_SIZE(self
) == 0)
6741 return PyBool_FromLong(0);
6743 e
= p
+ PyUnicode_GET_SIZE(self
);
6744 for (; p
< e
; p
++) {
6745 if (!Py_UNICODE_ISSPACE(*p
))
6746 return PyBool_FromLong(0);
6748 return PyBool_FromLong(1);
/* unicode_isalpha(): implements S.isalpha().  True immediately for a
   one-character string whose character passes Py_UNICODE_ISALPHA(); False
   for an empty string; otherwise False at the first character that fails
   the test and True when the scan reaches the end. */
6751 PyDoc_STRVAR(isalpha__doc__
,
6752 "S.isalpha() -> bool\n\
6754 Return True if all characters in S are alphabetic\n\
6755 and there is at least one character in S, False otherwise.");
6758 unicode_isalpha(PyUnicodeObject
*self
)
6760 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6761 register const Py_UNICODE
*e
;
6763 /* Shortcut for single character strings */
6764 if (PyUnicode_GET_SIZE(self
) == 1 &&
6765 Py_UNICODE_ISALPHA(*p
))
6766 return PyBool_FromLong(1);
6768 /* Special case for empty strings */
6769 if (PyUnicode_GET_SIZE(self
) == 0)
6770 return PyBool_FromLong(0);
6772 e
= p
+ PyUnicode_GET_SIZE(self
);
6773 for (; p
< e
; p
++) {
6774 if (!Py_UNICODE_ISALPHA(*p
))
6775 return PyBool_FromLong(0);
6777 return PyBool_FromLong(1);
/* unicode_isalnum(): implements S.isalnum().  True immediately for a
   one-character string whose character passes Py_UNICODE_ISALNUM(); False
   for an empty string; otherwise False at the first character that fails
   the test and True when the scan reaches the end. */
6780 PyDoc_STRVAR(isalnum__doc__
,
6781 "S.isalnum() -> bool\n\
6783 Return True if all characters in S are alphanumeric\n\
6784 and there is at least one character in S, False otherwise.");
6787 unicode_isalnum(PyUnicodeObject
*self
)
6789 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6790 register const Py_UNICODE
*e
;
6792 /* Shortcut for single character strings */
6793 if (PyUnicode_GET_SIZE(self
) == 1 &&
6794 Py_UNICODE_ISALNUM(*p
))
6795 return PyBool_FromLong(1);
6797 /* Special case for empty strings */
6798 if (PyUnicode_GET_SIZE(self
) == 0)
6799 return PyBool_FromLong(0);
6801 e
= p
+ PyUnicode_GET_SIZE(self
);
6802 for (; p
< e
; p
++) {
6803 if (!Py_UNICODE_ISALNUM(*p
))
6804 return PyBool_FromLong(0);
6806 return PyBool_FromLong(1);
/* unicode_isdecimal(): implements S.isdecimal().  True immediately for a
   one-character string whose character passes Py_UNICODE_ISDECIMAL();
   False for an empty string; otherwise False at the first character that
   fails the test and True when the scan reaches the end. */
6809 PyDoc_STRVAR(isdecimal__doc__
,
6810 "S.isdecimal() -> bool\n\
6812 Return True if there are only decimal characters in S,\n\
6816 unicode_isdecimal(PyUnicodeObject
*self
)
6818 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6819 register const Py_UNICODE
*e
;
6821 /* Shortcut for single character strings */
6822 if (PyUnicode_GET_SIZE(self
) == 1 &&
6823 Py_UNICODE_ISDECIMAL(*p
))
6824 return PyBool_FromLong(1);
6826 /* Special case for empty strings */
6827 if (PyUnicode_GET_SIZE(self
) == 0)
6828 return PyBool_FromLong(0);
6830 e
= p
+ PyUnicode_GET_SIZE(self
);
6831 for (; p
< e
; p
++) {
6832 if (!Py_UNICODE_ISDECIMAL(*p
))
6833 return PyBool_FromLong(0);
6835 return PyBool_FromLong(1);
/* unicode_isdigit(): implements S.isdigit().  True immediately for a
   one-character string whose character passes Py_UNICODE_ISDIGIT(); False
   for an empty string; otherwise False at the first character that fails
   the test and True when the scan reaches the end. */
6838 PyDoc_STRVAR(isdigit__doc__
,
6839 "S.isdigit() -> bool\n\
6841 Return True if all characters in S are digits\n\
6842 and there is at least one character in S, False otherwise.");
6845 unicode_isdigit(PyUnicodeObject
*self
)
6847 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6848 register const Py_UNICODE
*e
;
6850 /* Shortcut for single character strings */
6851 if (PyUnicode_GET_SIZE(self
) == 1 &&
6852 Py_UNICODE_ISDIGIT(*p
))
6853 return PyBool_FromLong(1);
6855 /* Special case for empty strings */
6856 if (PyUnicode_GET_SIZE(self
) == 0)
6857 return PyBool_FromLong(0);
6859 e
= p
+ PyUnicode_GET_SIZE(self
);
6860 for (; p
< e
; p
++) {
6861 if (!Py_UNICODE_ISDIGIT(*p
))
6862 return PyBool_FromLong(0);
6864 return PyBool_FromLong(1);
/* unicode_isnumeric(): implements S.isnumeric().  True immediately for a
   one-character string whose character passes Py_UNICODE_ISNUMERIC();
   False for an empty string; otherwise False at the first character that
   fails the test and True when the scan reaches the end. */
6867 PyDoc_STRVAR(isnumeric__doc__
,
6868 "S.isnumeric() -> bool\n\
6870 Return True if there are only numeric characters in S,\n\
6874 unicode_isnumeric(PyUnicodeObject
*self
)
6876 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6877 register const Py_UNICODE
*e
;
6879 /* Shortcut for single character strings */
6880 if (PyUnicode_GET_SIZE(self
) == 1 &&
6881 Py_UNICODE_ISNUMERIC(*p
))
6882 return PyBool_FromLong(1);
6884 /* Special case for empty strings */
6885 if (PyUnicode_GET_SIZE(self
) == 0)
6886 return PyBool_FromLong(0);
6888 e
= p
+ PyUnicode_GET_SIZE(self
);
6889 for (; p
< e
; p
++) {
6890 if (!Py_UNICODE_ISNUMERIC(*p
))
6891 return PyBool_FromLong(0);
6893 return PyBool_FromLong(1);
/* unicode_join(): implements S.join() by forwarding self (the separator)
   and the iterable `data` unchanged to PyUnicode_Join(). */
6896 PyDoc_STRVAR(join__doc__
,
6897 "S.join(iterable) -> unicode\n\
6899 Return a string which is the concatenation of the strings in the\n\
6900 iterable. The separator between elements is S.");
6903 unicode_join(PyObject
*self
, PyObject
*data
)
6905 return PyUnicode_Join(self
, data
);
/* unicode_length(): returns the stored length field, self->length. */
6909 unicode_length(PyUnicodeObject
*self
)
6911 return self
->length
;
/* unicode_ljust(): implements S.ljust().  `width` is parsed as a
   Py_ssize_t ("n") and the optional fill character through convert_uc,
   defaulting to a space.  An exact unicode object that is already at least
   `width` long is returned as is; otherwise pad() appends
   width - self->length fill characters on the right (left padding is 0).
   The docstring signature used to advertise "-> int"; the function returns
   the padded string, so it now says "-> unicode" like S.rjust(). */
6914 PyDoc_STRVAR(ljust__doc__
,
6915 "S.ljust(width[, fillchar]) -> unicode\n\
6917 Return S left-justified in a Unicode string of length width. Padding is\n\
6918 done using the specified fill character (default is a space).");
6921 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
6924 Py_UNICODE fillchar
= ' ';
6926 if (!PyArg_ParseTuple(args
, "n|O&:ljust", &width
, convert_uc
, &fillchar
))
6929 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6931 return (PyObject
*) self
;
6934 return (PyObject
*) pad(self
, 0, width
- self
->length
, fillchar
);
/* unicode_lower(): implements S.lower() by running the fixlower callback
   over self through the shared fixup() helper. */
6937 PyDoc_STRVAR(lower__doc__
,
6938 "S.lower() -> unicode\n\
6940 Return a copy of the string S converted to lowercase.");
6943 unicode_lower(PyUnicodeObject
*self
)
6945 return fixup(self
, fixlower
);
/* Strip-mode support.  stripformat[] holds one PyArg_ParseTuple format per
   strip mode and is indexed by the mode constant.  STRIPNAME(i) skips the
   first three characters ("|O:") of that format, leaving the bare method
   name ("lstrip", "rstrip" or "strip"). */
6949 #define RIGHTSTRIP 1
6952 /* Arrays indexed by above */
6953 static const char *stripformat
[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
6955 #define STRIPNAME(i) (stripformat[i]+3)
/* _PyUnicode_XStrip(): strips characters that occur in `sepobj` from
   `self`.  A bloom mask over the separator characters is built once with
   make_bloom_mask().  Unless striptype is RIGHTSTRIP, i is advanced while
   s[i] is a BLOOM_MEMBER; unless striptype is LEFTSTRIP, j is moved back
   while s[j] is a BLOOM_MEMBER (and j >= i).  When nothing was stripped
   (i == 0 and j == len) and self is an exact unicode object, self is
   returned; otherwise a new object is created from s+i with length j-i. */
6957 /* externally visible for str.strip(unicode) */
6959 _PyUnicode_XStrip(PyUnicodeObject
*self
, int striptype
, PyObject
*sepobj
)
6961 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
6962 Py_ssize_t len
= PyUnicode_GET_SIZE(self
);
6963 Py_UNICODE
*sep
= PyUnicode_AS_UNICODE(sepobj
);
6964 Py_ssize_t seplen
= PyUnicode_GET_SIZE(sepobj
);
6967 BLOOM_MASK sepmask
= make_bloom_mask(sep
, seplen
);
6970 if (striptype
!= RIGHTSTRIP
) {
6971 while (i
< len
&& BLOOM_MEMBER(sepmask
, s
[i
], sep
, seplen
)) {
6977 if (striptype
!= LEFTSTRIP
) {
6980 } while (j
>= i
&& BLOOM_MEMBER(sepmask
, s
[j
], sep
, seplen
));
6984 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
6986 return (PyObject
*)self
;
6989 return PyUnicode_FromUnicode(s
+i
, j
-i
);
/* do_strip(): whitespace variant of the strip helpers.  Unless striptype is
   RIGHTSTRIP, i is advanced while s[i] passes Py_UNICODE_ISSPACE(); unless
   striptype is LEFTSTRIP, j is moved back while s[j] is whitespace (and
   j >= i).  When nothing was stripped and self is an exact unicode object,
   self is returned; otherwise a new object is created from s+i with length
   j-i. */
6994 do_strip(PyUnicodeObject
*self
, int striptype
)
6996 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
6997 Py_ssize_t len
= PyUnicode_GET_SIZE(self
), i
, j
;
7000 if (striptype
!= RIGHTSTRIP
) {
7001 while (i
< len
&& Py_UNICODE_ISSPACE(s
[i
])) {
7007 if (striptype
!= LEFTSTRIP
) {
7010 } while (j
>= i
&& Py_UNICODE_ISSPACE(s
[j
]));
7014 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
7016 return (PyObject
*)self
;
7019 return PyUnicode_FromUnicode(s
+i
, j
-i
);
/* do_argstrip(): strip entry point used when arguments were supplied.  One
   optional object is parsed using the format stripformat[striptype].  If it
   is present and not None: a unicode object goes straight to
   _PyUnicode_XStrip(); a str is first converted with PyUnicode_FromObject()
   and then stripped the same way; any other type raises TypeError with the
   method name supplied by STRIPNAME(striptype).  With no usable separator
   the call falls back to do_strip(). */
7024 do_argstrip(PyUnicodeObject
*self
, int striptype
, PyObject
*args
)
7026 PyObject
*sep
= NULL
;
7028 if (!PyArg_ParseTuple(args
, (char *)stripformat
[striptype
], &sep
))
7031 if (sep
!= NULL
&& sep
!= Py_None
) {
7032 if (PyUnicode_Check(sep
))
7033 return _PyUnicode_XStrip(self
, striptype
, sep
);
7034 else if (PyString_Check(sep
)) {
7036 sep
= PyUnicode_FromObject(sep
);
7039 res
= _PyUnicode_XStrip(self
, striptype
, sep
);
7044 PyErr_Format(PyExc_TypeError
,
7045 "%s arg must be None, unicode or str",
7046 STRIPNAME(striptype
));
7051 return do_strip(self
, striptype
);
/* unicode_strip(): implements S.strip().  An empty argument tuple takes the
   fast path do_strip(self, BOTHSTRIP); otherwise the arguments are handled
   by do_argstrip() with the same mode. */
7055 PyDoc_STRVAR(strip__doc__
,
7056 "S.strip([chars]) -> unicode\n\
7058 Return a copy of the string S with leading and trailing\n\
7059 whitespace removed.\n\
7060 If chars is given and not None, remove characters in chars instead.\n\
7061 If chars is a str, it will be converted to unicode before stripping");
7064 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
7066 if (PyTuple_GET_SIZE(args
) == 0)
7067 return do_strip(self
, BOTHSTRIP
); /* Common case */
7069 return do_argstrip(self
, BOTHSTRIP
, args
);
/* unicode_lstrip(): implements S.lstrip().  An empty argument tuple takes
   the fast path do_strip(self, LEFTSTRIP); otherwise the arguments are
   handled by do_argstrip() with the same mode. */
7073 PyDoc_STRVAR(lstrip__doc__
,
7074 "S.lstrip([chars]) -> unicode\n\
7076 Return a copy of the string S with leading whitespace removed.\n\
7077 If chars is given and not None, remove characters in chars instead.\n\
7078 If chars is a str, it will be converted to unicode before stripping");
7081 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
7083 if (PyTuple_GET_SIZE(args
) == 0)
7084 return do_strip(self
, LEFTSTRIP
); /* Common case */
7086 return do_argstrip(self
, LEFTSTRIP
, args
);
/* unicode_rstrip(): implements S.rstrip().  An empty argument tuple takes
   the fast path do_strip(self, RIGHTSTRIP); otherwise the arguments are
   handled by do_argstrip() with the same mode. */
7090 PyDoc_STRVAR(rstrip__doc__
,
7091 "S.rstrip([chars]) -> unicode\n\
7093 Return a copy of the string S with trailing whitespace removed.\n\
7094 If chars is given and not None, remove characters in chars instead.\n\
7095 If chars is a str, it will be converted to unicode before stripping");
7098 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
7100 if (PyTuple_GET_SIZE(args
) == 0)
7101 return do_strip(self
, RIGHTSTRIP
); /* Common case */
7103 return do_argstrip(self
, RIGHTSTRIP
, args
);
/* unicode_repeat(): sequence repeat (str * len).  A repeat count of 1 on an
   exact unicode object returns the original string.  The character count
   nchars = len * str->length is validated by dividing back by len, and the
   byte count (nchars + 1) * sizeof(Py_UNICODE) by dividing back by the
   element size; either mismatch raises OverflowError ("repeated string is
   too long").  The result is allocated with _PyUnicode_New(nchars).  A
   one-character source is written with Py_UNICODE_FILL(); otherwise the
   source is copied once and the filled prefix is then copied onto the
   remainder in chunks of min(done, nchars - done) characters until nchars
   is reached (the updates of `done` are not visible in this listing). */
7108 unicode_repeat(PyUnicodeObject
*str
, Py_ssize_t len
)
7118 if (len
== 1 && PyUnicode_CheckExact(str
)) {
7119 /* no repeat, return original string */
7121 return (PyObject
*) str
;
7124 /* ensure # of chars needed doesn't overflow int and # of bytes
7125 * needed doesn't overflow size_t
7127 nchars
= len
* str
->length
;
7128 if (len
&& nchars
/ len
!= str
->length
) {
7129 PyErr_SetString(PyExc_OverflowError
,
7130 "repeated string is too long");
7133 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
7134 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
7135 PyErr_SetString(PyExc_OverflowError
,
7136 "repeated string is too long");
7139 u
= _PyUnicode_New(nchars
);
7145 if (str
->length
== 1 && len
> 0) {
7146 Py_UNICODE_FILL(p
, str
->str
[0], len
);
7148 Py_ssize_t done
= 0; /* number of characters copied this far */
7149 if (done
< nchars
) {
7150 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
7153 while (done
< nchars
) {
7154 Py_ssize_t n
= (done
<= nchars
-done
) ? done
: nchars
-done
;
7155 Py_UNICODE_COPY(p
+done
, p
, n
);
7160 return (PyObject
*) u
;
/* PyUnicode_Replace(): public replace entry point.  The three object
   arguments (obj, subobj, replobj) are each coerced with
   PyUnicode_FromObject() into self, str1 and str2, which are then passed to
   the internal replace() helper; its value is stored in `result`.
   NOTE(review): error checks and release of the coerced objects are not
   visible in this listing -- confirm against the full source. */
7163 PyObject
*PyUnicode_Replace(PyObject
*obj
,
7166 Py_ssize_t maxcount
)
7173 self
= PyUnicode_FromObject(obj
);
7176 str1
= PyUnicode_FromObject(subobj
);
7181 str2
= PyUnicode_FromObject(replobj
);
7187 result
= replace((PyUnicodeObject
*)self
,
7188 (PyUnicodeObject
*)str1
,
7189 (PyUnicodeObject
*)str2
,
/* unicode_replace(): implements S.replace().  Arguments are parsed with
   "OO|n:replace"; maxcount defaults to -1.  Both the old and the new
   substring are coerced with PyUnicode_FromObject() before being handed,
   together with self and maxcount, to the internal replace() helper.
   NOTE(review): error checks and release of str1/str2 are not visible in
   this listing -- confirm against the full source. */
7197 PyDoc_STRVAR(replace__doc__
,
7198 "S.replace (old, new[, count]) -> unicode\n\
7200 Return a copy of S with all occurrences of substring\n\
7201 old replaced by new. If the optional argument count is\n\
7202 given, only the first count occurrences are replaced.");
7205 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
7207 PyUnicodeObject
*str1
;
7208 PyUnicodeObject
*str2
;
7209 Py_ssize_t maxcount
= -1;
7212 if (!PyArg_ParseTuple(args
, "OO|n:replace", &str1
, &str2
, &maxcount
))
7214 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
7217 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
7223 result
= replace(self
, str1
, str2
, maxcount
);
/* unicode_repr(): passes the object's character buffer and size to
   unicodeescape_string().  Any further arguments of that call are not
   visible in this listing. */
7231 PyObject
*unicode_repr(PyObject
*unicode
)
7233 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
7234 PyUnicode_GET_SIZE(unicode
),
/* unicode_rfind(): implements S.rfind().  _ParseTupleFinds() fills in
   substring, start and end from args.  The search itself is
   stringlib_rfind_slice() over self's buffer/size and the substring's
   buffer/size.  The substring reference is dropped with Py_DECREF() and the
   index is returned as a Python int via PyInt_FromSsize_t(). */
7238 PyDoc_STRVAR(rfind__doc__
,
7239 "S.rfind(sub [,start [,end]]) -> int\n\
7241 Return the highest index in S where substring sub is found,\n\
7242 such that sub is contained within s[start:end]. Optional\n\
7243 arguments start and end are interpreted as in slice notation.\n\
7245 Return -1 on failure.");
7248 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
7250 PyObject
*substring
;
7255 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
7258 result
= stringlib_rfind_slice(
7259 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7260 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7264 Py_DECREF(substring
);
7266 return PyInt_FromSsize_t(result
);
/* unicode_rindex(): implements S.rindex().  Same search as S.rfind():
   _ParseTupleFinds() parses the arguments and stringlib_rfind_slice()
   locates the substring, which is then released with Py_DECREF().
   ValueError ("substring not found") is raised on the failure path
   (presumably when the result is negative; the test itself is not visible
   in this listing); otherwise the index is returned via
   PyInt_FromSsize_t(). */
7269 PyDoc_STRVAR(rindex__doc__
,
7270 "S.rindex(sub [,start [,end]]) -> int\n\
7272 Like S.rfind() but raise ValueError when the substring is not found.");
7275 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
7277 PyObject
*substring
;
7282 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
7285 result
= stringlib_rfind_slice(
7286 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7287 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7291 Py_DECREF(substring
);
7294 PyErr_SetString(PyExc_ValueError
, "substring not found");
7297 return PyInt_FromSsize_t(result
);
/* unicode_rjust(): implements S.rjust().  `width` is parsed as a
   Py_ssize_t ("n") and the optional fill character through convert_uc,
   defaulting to a space.  An exact unicode object that is already at least
   `width` long is returned as is; otherwise pad() prepends
   width - self->length fill characters on the left (right padding is 0). */
7300 PyDoc_STRVAR(rjust__doc__
,
7301 "S.rjust(width[, fillchar]) -> unicode\n\
7303 Return S right-justified in a Unicode string of length width. Padding is\n\
7304 done using the specified fill character (default is a space).");
7307 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
7310 Py_UNICODE fillchar
= ' ';
7312 if (!PyArg_ParseTuple(args
, "n|O&:rjust", &width
, convert_uc
, &fillchar
))
7315 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
7317 return (PyObject
*) self
;
7320 return (PyObject
*) pad(self
, width
- self
->length
, 0, fillchar
);
/* unicode_slice(): simple slice S[start:end].  `end` is compared against
   self->length as part of the clamping (the other clamping steps are not
   visible in this listing).  A full slice (start == 0 and
   end == self->length) of an exact unicode object returns the original
   string; anything else is built with PyUnicode_FromUnicode() starting at
   self->str + start. */
7324 unicode_slice(PyUnicodeObject
*self
, Py_ssize_t start
, Py_ssize_t end
)
7326 /* standard clamping */
7331 if (end
> self
->length
)
7333 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
7334 /* full slice, return original string */
7336 return (PyObject
*) self
;
7341 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
/* PyUnicode_Split(): public split entry point.  Both `s` and `sep` are
   coerced with PyUnicode_FromObject() and then passed, with maxsplit, to
   the internal split() helper; its value is stored in `result`.
   NOTE(review): any NULL handling for sep and the release of the coerced
   objects are not visible in this listing -- confirm. */
7345 PyObject
*PyUnicode_Split(PyObject
*s
,
7347 Py_ssize_t maxsplit
)
7351 s
= PyUnicode_FromObject(s
);
7355 sep
= PyUnicode_FromObject(sep
);
7362 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
/* unicode_split(): implements S.split().  Arguments are parsed with
   "|On:split"; the separator defaults to Py_None and maxcount to -1.
   A None separator calls split(self, NULL, maxcount); a unicode separator
   is passed to split() directly; any other object goes through
   PyUnicode_Split(). */
7369 PyDoc_STRVAR(split__doc__
,
7370 "S.split([sep [,maxsplit]]) -> list of strings\n\
7372 Return a list of the words in S, using sep as the\n\
7373 delimiter string. If maxsplit is given, at most maxsplit\n\
7374 splits are done. If sep is not specified or is None, any\n\
7375 whitespace string is a separator and empty strings are\n\
7376 removed from the result.");
7379 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
7381 PyObject
*substring
= Py_None
;
7382 Py_ssize_t maxcount
= -1;
7384 if (!PyArg_ParseTuple(args
, "|On:split", &substring
, &maxcount
))
7387 if (substring
== Py_None
)
7388 return split(self
, NULL
, maxcount
);
7389 else if (PyUnicode_Check(substring
))
7390 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
7392 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
/* PyUnicode_Partition(): public partition entry point.  str_in and sep_in
   are coerced with PyUnicode_FromObject() into str_obj and sep_obj; each
   object is then passed, together with its character buffer and size, to
   stringlib_partition(), whose value is stored in `out`.
   NOTE(review): error checks and release of the coerced objects are not
   visible in this listing -- confirm against the full source. */
7396 PyUnicode_Partition(PyObject
*str_in
, PyObject
*sep_in
)
7402 str_obj
= PyUnicode_FromObject(str_in
);
7405 sep_obj
= PyUnicode_FromObject(sep_in
);
7411 out
= stringlib_partition(
7412 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7413 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
/* PyUnicode_RPartition(): public rpartition entry point.  str_in and sep_in
   are coerced with PyUnicode_FromObject() into str_obj and sep_obj; each
   object is then passed, together with its character buffer and size, to
   stringlib_rpartition(), whose value is stored in `out`.
   NOTE(review): error checks and release of the coerced objects are not
   visible in this listing -- confirm against the full source. */
7424 PyUnicode_RPartition(PyObject
*str_in
, PyObject
*sep_in
)
7430 str_obj
= PyUnicode_FromObject(str_in
);
7433 sep_obj
= PyUnicode_FromObject(sep_in
);
7439 out
= stringlib_rpartition(
7440 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7441 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
/* unicode_partition(): implements S.partition() by forwarding self and the
   separator object to PyUnicode_Partition(). */
7450 PyDoc_STRVAR(partition__doc__
,
7451 "S.partition(sep) -> (head, sep, tail)\n\
7453 Search for the separator sep in S, and return the part before it,\n\
7454 the separator itself, and the part after it. If the separator is not\n\
7455 found, return S and two empty strings.");
7458 unicode_partition(PyUnicodeObject
*self
, PyObject
*separator
)
7460 return PyUnicode_Partition((PyObject
*)self
, separator
);
/* unicode_rpartition(): implements S.rpartition() by forwarding self and
   the separator object to PyUnicode_RPartition().
   The docstring signature used to label the result "(tail, sep, head)",
   contradicting its own text, which lists the part before the separator
   first.  It now reads "(head, sep, tail)", the same labels S.partition()
   uses for that ordering. */
7463 PyDoc_STRVAR(rpartition__doc__
,
7464 "S.rpartition(sep) -> (head, sep, tail)\n\
7466 Search for the separator sep in S, starting at the end of S, and return\n\
7467 the part before it, the separator itself, and the part after it. If the\n\
7468 separator is not found, return two empty strings and S.");
7471 unicode_rpartition(PyUnicodeObject
*self
, PyObject
*separator
)
7473 return PyUnicode_RPartition((PyObject
*)self
, separator
);
/* PyUnicode_RSplit(): public rsplit entry point.  Both `s` and `sep` are
   coerced with PyUnicode_FromObject() and then passed, with maxsplit, to
   the internal rsplit() helper; its value is stored in `result`.
   NOTE(review): any NULL handling for sep and the release of the coerced
   objects are not visible in this listing -- confirm. */
7476 PyObject
*PyUnicode_RSplit(PyObject
*s
,
7478 Py_ssize_t maxsplit
)
7482 s
= PyUnicode_FromObject(s
);
7486 sep
= PyUnicode_FromObject(sep
);
7493 result
= rsplit((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
/* unicode_rsplit(): implements S.rsplit().  Arguments are parsed with
   "|On:rsplit"; the separator defaults to Py_None and maxcount to -1.
   A None separator calls rsplit(self, NULL, maxcount); a unicode separator
   is passed to rsplit() directly; any other object goes through
   PyUnicode_RSplit(). */
7500 PyDoc_STRVAR(rsplit__doc__
,
7501 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7503 Return a list of the words in S, using sep as the\n\
7504 delimiter string, starting at the end of the string and\n\
7505 working to the front. If maxsplit is given, at most maxsplit\n\
7506 splits are done. If sep is not specified, any whitespace string\n\
7510 unicode_rsplit(PyUnicodeObject
*self
, PyObject
*args
)
7512 PyObject
*substring
= Py_None
;
7513 Py_ssize_t maxcount
= -1;
7515 if (!PyArg_ParseTuple(args
, "|On:rsplit", &substring
, &maxcount
))
7518 if (substring
== Py_None
)
7519 return rsplit(self
, NULL
, maxcount
);
7520 else if (PyUnicode_Check(substring
))
7521 return rsplit(self
, (PyUnicodeObject
*)substring
, maxcount
);
7523 return PyUnicode_RSplit((PyObject
*)self
, substring
, maxcount
);
/* unicode_splitlines(): implements S.splitlines().  The optional keepends
   flag is parsed as a C int ("|i:splitlines") and forwarded with self to
   PyUnicode_Splitlines(). */
7526 PyDoc_STRVAR(splitlines__doc__
,
7527 "S.splitlines([keepends]) -> list of strings\n\
7529 Return a list of the lines in S, breaking at line boundaries.\n\
7530 Line breaks are not included in the resulting list unless keepends\n\
7531 is given and true.");
7534 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
7538 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
7541 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
/* unicode_str(): returns PyUnicode_AsEncodedString() of self with both the
   encoding and the errors argument passed as NULL. */
7545 PyObject
*unicode_str(PyUnicodeObject
*self
)
7547 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
/* unicode_swapcase(): implements S.swapcase() by running the fixswapcase
   callback over self through the shared fixup() helper. */
7550 PyDoc_STRVAR(swapcase__doc__
,
7551 "S.swapcase() -> unicode\n\
7553 Return a copy of S with uppercase characters converted to lowercase\n\
7557 unicode_swapcase(PyUnicodeObject
*self
)
7559 return fixup(self
, fixswapcase
);
/* unicode_translate(): implements S.translate() by calling
   PyUnicode_TranslateCharmap() with self->str as its first argument.  The
   remaining arguments of that call are not visible in this listing. */
7562 PyDoc_STRVAR(translate__doc__
,
7563 "S.translate(table) -> unicode\n\
7565 Return a copy of the string S, where all characters have been mapped\n\
7566 through the given translation table, which must be a mapping of\n\
7567 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7568 Unmapped characters are left untouched. Characters mapped to None\n\
7572 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
7574 return PyUnicode_TranslateCharmap(self
->str
,
/* unicode_upper(): implements S.upper() by running the fixupper callback
   over self through the shared fixup() helper. */
7580 PyDoc_STRVAR(upper__doc__
,
7581 "S.upper() -> unicode\n\
7583 Return a copy of S converted to uppercase.");
7586 unicode_upper(PyUnicodeObject
*self
)
7588 return fixup(self
, fixupper
);
/* unicode_zfill(): implements S.zfill().  `width` is parsed as a
   Py_ssize_t ("n:zfill").  When self is already at least `width` long, an
   exact unicode object is returned as is and any other object is copied
   with PyUnicode_FromUnicode().  Otherwise fill = width - self->length
   zeros are prepended with pad(self, fill, 0, '0').  If the character now
   at index `fill` (the first character of the original text) is '+' or
   '-', it is copied to index 0 so the sign leads the zeros; the statement
   that overwrites the old sign position is not visible in this listing. */
7591 PyDoc_STRVAR(zfill__doc__
,
7592 "S.zfill(width) -> unicode\n\
7594 Pad a numeric string S with zeros on the left, to fill a field\n\
7595 of the specified width. The string S is never truncated.");
7598 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
7604 if (!PyArg_ParseTuple(args
, "n:zfill", &width
))
7607 if (self
->length
>= width
) {
7608 if (PyUnicode_CheckExact(self
)) {
7610 return (PyObject
*) self
;
7613 return PyUnicode_FromUnicode(
7614 PyUnicode_AS_UNICODE(self
),
7615 PyUnicode_GET_SIZE(self
)
7619 fill
= width
- self
->length
;
7621 u
= pad(self
, fill
, 0, '0');
7626 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
7627 /* move sign to beginning of string */
7628 u
->str
[0] = u
->str
[fill
];
7632 return (PyObject
*) u
;
/* free_listsize(): returns the file-level counter `numfree` as a Python int
   via PyInt_FromLong(); `self` is not used in the visible lines. */
7637 free_listsize(PyUnicodeObject
*self
)
7639 return PyInt_FromLong(numfree
);
/* unicode_startswith(): implements S.startswith().  Arguments are parsed
   with "O|O&O&:startswith"; the optional start/end go through
   _PyEval_SliceIndex and default to 0 and PY_SSIZE_T_MAX.  For a tuple
   prefix, each item is coerced with PyUnicode_FromObject(), tested with
   tailmatch(self, substring, start, end, -1) and released again.  For any
   other object a single coercion and tailmatch() call is made and its
   result returned via PyBool_FromLong().  The handling of a match inside
   the tuple loop is not visible in this listing. */
7643 PyDoc_STRVAR(startswith__doc__
,
7644 "S.startswith(prefix[, start[, end]]) -> bool\n\
7646 Return True if S starts with the specified prefix, False otherwise.\n\
7647 With optional start, test S beginning at that position.\n\
7648 With optional end, stop comparing S at that position.\n\
7649 prefix can also be a tuple of strings to try.");
7652 unicode_startswith(PyUnicodeObject
*self
,
7656 PyUnicodeObject
*substring
;
7657 Py_ssize_t start
= 0;
7658 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7661 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &subobj
,
7662 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
7664 if (PyTuple_Check(subobj
)) {
7666 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7667 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7668 PyTuple_GET_ITEM(subobj
, i
));
7669 if (substring
== NULL
)
7671 result
= tailmatch(self
, substring
, start
, end
, -1);
7672 Py_DECREF(substring
);
7677 /* nothing matched */
7680 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7681 if (substring
== NULL
)
7683 result
= tailmatch(self
, substring
, start
, end
, -1);
7684 Py_DECREF(substring
);
7685 return PyBool_FromLong(result
);
/* unicode_endswith(): implements S.endswith().  Arguments are parsed with
   "O|O&O&:endswith"; the optional start/end go through _PyEval_SliceIndex
   and default to 0 and PY_SSIZE_T_MAX.  For a tuple suffix, each item is
   coerced with PyUnicode_FromObject(), tested with
   tailmatch(self, substring, start, end, +1) and released again.  For any
   other object a single coercion and tailmatch() call is made and its
   result returned via PyBool_FromLong().  The handling of a match inside
   the tuple loop is not visible in this listing. */
7689 PyDoc_STRVAR(endswith__doc__
,
7690 "S.endswith(suffix[, start[, end]]) -> bool\n\
7692 Return True if S ends with the specified suffix, False otherwise.\n\
7693 With optional start, test S beginning at that position.\n\
7694 With optional end, stop comparing S at that position.\n\
7695 suffix can also be a tuple of strings to try.");
7698 unicode_endswith(PyUnicodeObject
*self
,
7702 PyUnicodeObject
*substring
;
7703 Py_ssize_t start
= 0;
7704 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7707 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &subobj
,
7708 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
7710 if (PyTuple_Check(subobj
)) {
7712 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7713 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7714 PyTuple_GET_ITEM(subobj
, i
));
7715 if (substring
== NULL
)
7717 result
= tailmatch(self
, substring
, start
, end
, +1);
7718 Py_DECREF(substring
);
7725 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7726 if (substring
== NULL
)
7729 result
= tailmatch(self
, substring
, start
, end
, +1);
7730 Py_DECREF(substring
);
7731 return PyBool_FromLong(result
);
7735 /* Implements do_string_format, which is unicode because of stringlib */
7736 #include "stringlib/string_format.h"
/* format__doc__ is the docstring for S.format(); p_format__doc__, after the
   function, is the docstring for S.__format__().
   unicode__format__(): parses a single format_spec object
   ("O:__format__").  A spec that is neither a byte string (PyBytes_Check)
   nor a unicode object raises TypeError naming its type.  The spec is
   converted with PyObject_Unicode() into tmp, and the result comes from
   _PyUnicode_FormatAdvanced() called with self and the spec's character
   buffer and size.
   NOTE(review): how tmp is substituted for format_spec and later released
   is not visible in this listing -- confirm. */
7738 PyDoc_STRVAR(format__doc__
,
7739 "S.format(*args, **kwargs) -> unicode\n\
7744 unicode__format__(PyObject
*self
, PyObject
*args
)
7746 PyObject
*format_spec
;
7747 PyObject
*result
= NULL
;
7748 PyObject
*tmp
= NULL
;
7750 /* If 2.x, convert format_spec to the same type as value */
7751 /* This is to allow things like u''.format('') */
7752 if (!PyArg_ParseTuple(args
, "O:__format__", &format_spec
))
7754 if (!(PyBytes_Check(format_spec
) || PyUnicode_Check(format_spec
))) {
7755 PyErr_Format(PyExc_TypeError
, "__format__ arg must be str "
7756 "or unicode, not %s", Py_TYPE(format_spec
)->tp_name
);
7759 tmp
= PyObject_Unicode(format_spec
);
7764 result
= _PyUnicode_FormatAdvanced(self
,
7765 PyUnicode_AS_UNICODE(format_spec
),
7766 PyUnicode_GET_SIZE(format_spec
));
7772 PyDoc_STRVAR(p_format__doc__
,
7773 "S.__format__(format_spec) -> unicode\n\
7778 unicode__sizeof__(PyUnicodeObject
*v
)
/* unicode__sizeof__(): reports the size in bytes as
   sizeof(PyUnicodeObject) plus sizeof(Py_UNICODE) * (v->length + 1), i.e.
   the object header plus the character buffer with one extra element,
   returned via PyInt_FromSsize_t().  sizeof__doc__ below is its
   docstring. */
7780 return PyInt_FromSsize_t(sizeof(PyUnicodeObject
) +
7781 sizeof(Py_UNICODE
) * (v
->length
+ 1));
7784 PyDoc_STRVAR(sizeof__doc__
,
7785 "S.__sizeof__() -> size of S in memory, in bytes\n\
7790 unicode_getnewargs(PyUnicodeObject
*v
)
/* unicode_getnewargs(): builds a one-element tuple with Py_BuildValue()
   using the format "(u#)", passing the character buffer v->str and the
   length v->length. */
7792 return Py_BuildValue("(u#)", v
->str
, v
->length
);
/* Method table of the unicode type; PyUnicode_Type.tp_methods points at it.
 *
 * Each entry is {Python-level name, C implementation cast to PyCFunction,
 * calling-convention flags, docstring}.  The entries for
 * _formatter_field_name_split, _formatter_parser, freelistsize and
 * __getnewargs__ supply no docstring.
 *
 * NOTE(review): "capwords" and "freelistsize" (the latter commented as
 * debugging-only) may sit behind preprocessor guards that are not visible
 * in this listing -- confirm before relying on them.  The terminating
 * sentinel entry is likewise not visible here.
 */
7796 static PyMethodDef unicode_methods
[] = {
7798 /* Order is according to common usage: often used methods should
7799 appear first, since lookup is done sequentially. */
7801 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
| METH_KEYWORDS
, encode__doc__
},
7802 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
7803 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
7804 {"rsplit", (PyCFunction
) unicode_rsplit
, METH_VARARGS
, rsplit__doc__
},
7805 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
7806 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
7807 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
7808 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
7809 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
7810 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
7811 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
7812 {"partition", (PyCFunction
) unicode_partition
, METH_O
, partition__doc__
},
7813 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
7814 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
7815 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
7816 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_VARARGS
, lstrip__doc__
},
7817 {"decode", (PyCFunction
) unicode_decode
, METH_VARARGS
| METH_KEYWORDS
, decode__doc__
},
7818 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7819 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
7820 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
7821 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
7822 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_VARARGS
, rstrip__doc__
},
7823 {"rpartition", (PyCFunction
) unicode_rpartition
, METH_O
, rpartition__doc__
},
7824 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
7825 {"strip", (PyCFunction
) unicode_strip
, METH_VARARGS
, strip__doc__
},
7826 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
7827 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
7828 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
7829 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
7830 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
7831 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
7832 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
7833 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
7834 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
7835 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
7836 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
7837 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
7838 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
7839 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
7840 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
7841 {"format", (PyCFunction
) do_string_format
, METH_VARARGS
| METH_KEYWORDS
, format__doc__
},
7842 {"__format__", (PyCFunction
) unicode__format__
, METH_VARARGS
, p_format__doc__
},
7843 {"_formatter_field_name_split", (PyCFunction
) formatter_field_name_split
, METH_NOARGS
},
7844 {"_formatter_parser", (PyCFunction
) formatter_parser
, METH_NOARGS
},
7845 {"__sizeof__", (PyCFunction
) unicode__sizeof__
, METH_NOARGS
, sizeof__doc__
},
7847 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
7851 /* This one is just used for debugging the implementation. */
7852 {"freelistsize", (PyCFunction
) free_listsize
, METH_NOARGS
},
7855 {"__getnewargs__", (PyCFunction
)unicode_getnewargs
, METH_NOARGS
},
/* nb_remainder slot (see unicode_as_number): implements "v % w".
   When v is not a unicode object the operation is declined by returning a
   new reference to Py_NotImplemented; otherwise the work is delegated to
   PyUnicode_Format(v, w) and its result is returned as is. */
7860 unicode_mod(PyObject
*v
, PyObject
*w
)
7862 if (!PyUnicode_Check(v
)) {
7863 Py_INCREF(Py_NotImplemented
);
7864 return Py_NotImplemented
;
7866 return PyUnicode_Format(v
, w
);
/* Number protocol table of the unicode type (PyUnicode_Type.tp_as_number).
   The only slot visible in this listing is nb_remainder, bound to
   unicode_mod so that the "%" operator performs string formatting. */
7869 static PyNumberMethods unicode_as_number
= {
7874 unicode_mod
, /*nb_remainder*/
/* Sequence protocol table of the unicode type
   (PyUnicode_Type.tp_as_sequence).  Length, concatenation, repetition,
   item access, slicing and containment are provided; the two assignment
   slots (sq_ass_item, sq_ass_slice) are 0. */
7877 static PySequenceMethods unicode_as_sequence
= {
7878 (lenfunc
) unicode_length
, /* sq_length */
7879 PyUnicode_Concat
, /* sq_concat */
7880 (ssizeargfunc
) unicode_repeat
, /* sq_repeat */
7881 (ssizeargfunc
) unicode_getitem
, /* sq_item */
7882 (ssizessizeargfunc
) unicode_slice
, /* sq_slice */
7883 0, /* sq_ass_item */
7884 0, /* sq_ass_slice */
7885 PyUnicode_Contains
, /* sq_contains */
/* mp_subscript slot (see unicode_as_mapping): implements self[item].
 *
 * - item supports the index protocol: it is converted with
 *   PyNumber_AsSsize_t() (overflow is reported as IndexError), adjusted by
 *   the string length and passed to unicode_getitem().
 *   NOTE(review): the length adjustment is presumably applied to negative
 *   indices only; the guarding test is not visible in this listing.
 * - item is a slice: the bounds are normalised by PySlice_GetIndicesEx().
 *   An empty slice yields PyUnicode_FromUnicode(NULL, 0); a slice covering
 *   all of an exact unicode object returns self; a step of 1 copies the
 *   contiguous run starting at self->str + start; any other step gathers
 *   the selected characters into a temporary PyObject_MALLOC buffer
 *   (PyErr_NoMemory() if that allocation fails), builds the result from
 *   the buffer and frees the buffer again.
 *   NOTE(review): confirm a reference is taken before self is returned;
 *   that statement is not visible in this listing.
 * - anything else: TypeError "string indices must be integers".
 */
7889 unicode_subscript(PyUnicodeObject
* self
, PyObject
* item
)
7891 if (PyIndex_Check(item
)) {
7892 Py_ssize_t i
= PyNumber_AsSsize_t(item
, PyExc_IndexError
);
7893 if (i
== -1 && PyErr_Occurred())
7896 i
+= PyUnicode_GET_SIZE(self
);
7897 return unicode_getitem(self
, i
);
7898 } else if (PySlice_Check(item
)) {
7899 Py_ssize_t start
, stop
, step
, slicelength
, cur
, i
;
7900 Py_UNICODE
* source_buf
;
7901 Py_UNICODE
* result_buf
;
7904 if (PySlice_GetIndicesEx((PySliceObject
*)item
, PyUnicode_GET_SIZE(self
),
7905 &start
, &stop
, &step
, &slicelength
) < 0) {
7909 if (slicelength
<= 0) {
7910 return PyUnicode_FromUnicode(NULL
, 0);
7911 } else if (start
== 0 && step
== 1 && slicelength
== self
->length
&&
7912 PyUnicode_CheckExact(self
)) {
7914 return (PyObject
*)self
;
7915 } else if (step
== 1) {
7916 return PyUnicode_FromUnicode(self
->str
+ start
, slicelength
);
/* Extended slice: gather every step-th character into a scratch buffer. */
7918 source_buf
= PyUnicode_AS_UNICODE((PyObject
*)self
);
7919 result_buf
= (Py_UNICODE
*)PyObject_MALLOC(slicelength
*
7920 sizeof(Py_UNICODE
));
7922 if (result_buf
== NULL
)
7923 return PyErr_NoMemory();
7925 for (cur
= start
, i
= 0; i
< slicelength
; cur
+= step
, i
++) {
7926 result_buf
[i
] = source_buf
[cur
];
/* The scratch buffer is released once the result has been built from it. */
7929 result
= PyUnicode_FromUnicode(result_buf
, slicelength
);
7930 PyObject_FREE(result_buf
);
7934 PyErr_SetString(PyExc_TypeError
, "string indices must be integers");
/* Mapping protocol table of the unicode type
   (PyUnicode_Type.tp_as_mapping).  Provides length and subscription
   (unicode_subscript, which handles both indices and slices); item
   assignment is not supported (mp_ass_subscript is 0). */
7939 static PyMappingMethods unicode_as_mapping
= {
7940 (lenfunc
)unicode_length
, /* mp_length */
7941 (binaryfunc
)unicode_subscript
, /* mp_subscript */
7942 (objobjargproc
)0, /* mp_ass_subscript */
/* Read-buffer slot of unicode_as_buffer.  On success stores self->str in
   *ptr and returns PyUnicode_GET_DATA_SIZE(self).  The error path sets
   SystemError "accessing non-existent unicode segment".
   NOTE(review): the remaining parameters and the test that selects the
   error path are not visible in this listing; presumably a segment index
   other than 0 is rejected -- confirm. */
7946 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
7951 PyErr_SetString(PyExc_SystemError
,
7952 "accessing non-existent unicode segment");
7955 *ptr
= (void *) self
->str
;
7956 return PyUnicode_GET_DATA_SIZE(self
);
/* Write-buffer slot of unicode_as_buffer.  Sets TypeError
   "cannot use unicode as modifiable buffer".
   NOTE(review): the rest of the parameter list and the return statement
   are not visible in this listing. */
7960 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, Py_ssize_t index
,
7963 PyErr_SetString(PyExc_TypeError
,
7964 "cannot use unicode as modifiable buffer");
/* Segment-count slot of unicode_as_buffer.  Stores
   PyUnicode_GET_DATA_SIZE(self) in *lenp.
   NOTE(review): the declaration of lenp, any NULL check on it and the
   returned segment count are not visible in this listing. */
7969 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
7973 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
/* Char-buffer slot of unicode_as_buffer.  Obtains the default-encoded
   string form of self with _PyUnicode_AsDefaultEncodedString(self, NULL),
   stores its character data in *ptr and returns its size.  The error path
   sets SystemError "accessing non-existent unicode segment".
   NOTE(review): the remaining parameters, the test that selects the error
   path and the NULL check on str are not visible in this listing. */
7978 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
7985 PyErr_SetString(PyExc_SystemError
,
7986 "accessing non-existent unicode segment");
7989 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
7992 *ptr
= (void *) PyString_AS_STRING(str
);
7993 return PyString_GET_SIZE(str
);
7996 /* Helpers for PyUnicode_Format() */
/* Fetches the next positional argument for PyUnicode_Format().
   args and arglen describe the arguments and *p_argidx is the index of the
   next unused one.  While that index is below arglen the item is obtained
   with PyTuple_GetItem() (a borrowed reference); otherwise TypeError
   "not enough arguments for format string" is set.
   NOTE(review): the update of *p_argidx and the handling of a non-tuple
   args value are not visible in this listing. */
7999 getnextarg(PyObject
*args
, Py_ssize_t arglen
, Py_ssize_t
*p_argidx
)
8001 Py_ssize_t argidx
= *p_argidx
;
8002 if (argidx
< arglen
) {
8007 return PyTuple_GetItem(args
, argidx
);
8009 PyErr_SetString(PyExc_TypeError
,
8010 "not enough arguments for format string");
/* Bit flags collected in PyUnicode_Format()'s flags variable while a
   conversion specification is parsed: '-' sets F_LJUST, '+' sets F_SIGN,
   ' ' sets F_BLANK, '#' sets F_ALT and '0' sets F_ZERO. */
8014 #define F_LJUST (1<<0)
8015 #define F_SIGN (1<<1)
8016 #define F_BLANK (1<<2)
8017 #define F_ALT (1<<3)
8018 #define F_ZERO (1<<4)
/* Widens the NUL-terminated char string charbuffer into buffer, one
   Py_UNICODE element per char.  The copy runs from the last character down
   to the first, which is what lets longtounicode() call it with both
   arguments pointing at the same memory.  The loop does not copy the
   terminating NUL.
   NOTE(review): each char is cast straight to Py_UNICODE, so bytes >= 0x80
   would be sign-extended where char is signed; the only caller visible
   here passes PyOS_snprintf output for an integer -- confirm the input is
   always ASCII. */
8021 strtounicode(Py_UNICODE
*buffer
, const char *charbuffer
)
8023 register Py_ssize_t i
;
8024 Py_ssize_t len
= strlen(charbuffer
);
8025 for (i
= len
- 1; i
>= 0; i
--)
8026 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
/* Formats the long x with the printf-style format into buffer and returns
   the resulting length narrowed to int.
   The text is first written as chars into the Py_UNICODE buffer itself
   (PyOS_snprintf, limited by len) and then widened in place by
   strtounicode().
   NOTE(review): len reaches PyOS_snprintf as a byte limit although
   formatint()'s caller computes it as a count of Py_UNICODE elements; the
   limit is therefore conservative -- confirm this is intentional. */
8032 longtounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, long x
)
8036 PyOS_snprintf((char *)buffer
, len
, format
, x
);
8037 result
= strtounicode(buffer
, (char *)buffer
);
8038 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
8041 /* XXX To save some code duplication, formatfloat/long/int could have been
8042 shared with stringobject.c, converting from 8-bit to Unicode after the
8043 formatting is done. */
8045 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
/* Formats a float for PyUnicode_Format().
   v is converted with PyFloat_AsDouble(); the text is produced by
   PyOS_double_to_string(x, type, prec, ...), with Py_DTSF_ALT requested
   when F_ALT is set in flags, and wrapped in a unicode object by
   PyUnicode_FromStringAndSize().
   NOTE(review): the NULL check on p and the release of the string returned
   by PyOS_double_to_string() are not visible in this listing -- confirm. */
8048 formatfloat(PyObject
*v
, int flags
, int prec
, int type
)
8054 x
= PyFloat_AsDouble(v
);
8055 if (x
== -1.0 && PyErr_Occurred())
8061 p
= PyOS_double_to_string(x
, type
, prec
,
8062 (flags
& F_ALT
) ? Py_DTSF_ALT
: 0, NULL
);
8065 result
= PyUnicode_FromStringAndSize(p
, strlen(p
));
/* Formats a Python long for PyUnicode_Format().
   _PyString_FormatLong() does the conversion; it yields a temporary string
   object (str) and reports the text through buf and len.  The len elements
   of buf are copied one by one into a fresh unicode object of that length,
   a terminating 0 is stored at str[len], and the unicode object is
   returned.
   NOTE(review): the NULL checks and the release of the temporary str are
   not visible in this listing -- confirm. */
8071 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
8075 PyObject
*str
; /* temporary string object. */
8076 PyUnicodeObject
*result
;
8078 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
8081 result
= _PyUnicode_New(len
);
8086 for (i
= 0; i
< len
; i
++)
8087 result
->str
[i
] = buf
[i
];
8088 result
->str
[len
] = 0;
8090 return (PyObject
*)result
;
/* Formats a Python int for the integer conversions of PyUnicode_Format().
 *
 * buf/buflen is the Py_UNICODE output buffer and the value is read with
 * PyInt_AsLong(v).  A printf format is assembled in fmt[] from the sign,
 * the '#' flag, prec and type, and the text is produced by
 * longtounicode().  For %#x and %#X the "0x"/"0X" prefix is written into
 * the format by this code instead of being left to the C library; the
 * comment in the body gives the reasons.
 *
 * OverflowError "formatted integer is too long (precision too large?)" is
 * set when buflen <= 14 or buflen <= 3 + prec.
 *
 * NOTE(review): one of the two final calls passes -x; presumably this is
 * the branch for negative values whose sign is carried by the format
 * string -- the selecting test is not visible in this listing.
 */
8094 formatint(Py_UNICODE
*buf
,
8101 /* fmt = '%#.' + `prec` + 'l' + `type`
8102 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8106 char fmt
[64]; /* plenty big enough! */
8110 x
= PyInt_AsLong(v
);
8111 if (x
== -1 && PyErr_Occurred())
8113 if (x
< 0 && type
== 'u') {
8116 if (x
< 0 && (type
== 'x' || type
== 'X' || type
== 'o'))
8123 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8124 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8126 if (buflen
<= 14 || buflen
<= (size_t)3 + (size_t)prec
) {
8127 PyErr_SetString(PyExc_OverflowError
,
8128 "formatted integer is too long (precision too large?)");
8132 if ((flags
& F_ALT
) &&
8133 (type
== 'x' || type
== 'X')) {
8134 /* When converting under %#x or %#X, there are a number
8135 * of issues that cause pain:
8136 * - when 0 is being converted, the C standard leaves off
8137 * the '0x' or '0X', which is inconsistent with other
8138 * %#x/%#X conversions and inconsistent with Python's
8140 * - there are platforms that violate the standard and
8141 * convert 0 with the '0x' or '0X'
8142 * (Metrowerks, Compaq Tru64)
8143 * - there are platforms that give '0x' when converting
8144 * under %#X, but convert 0 in accordance with the
8145 * standard (OS/2 EMX)
8147 * We can achieve the desired consistency by inserting our
8148 * own '0x' or '0X' prefix, and substituting %x/%X in place
8151 * Note that this is the same approach as used in
8152 * formatint() in stringobject.c
8154 PyOS_snprintf(fmt
, sizeof(fmt
), "%s0%c%%.%dl%c",
8155 sign
, type
, prec
, type
);
8158 PyOS_snprintf(fmt
, sizeof(fmt
), "%s%%%s.%dl%c",
8159 sign
, (flags
&F_ALT
) ? "#" : "",
8163 return longtounicode(buf
, buflen
, fmt
, -x
);
8165 return longtounicode(buf
, buflen
, fmt
, x
);
/* Implements the %c conversion of PyUnicode_Format(): stores a single
 * character in buf[0].
 *
 * - unicode argument: its length is compared with 1 and its first element
 *   is used;
 * - str argument: its length is compared with 1 and its first byte is
 *   cast to Py_UNICODE;
 * - otherwise the argument is read with PyInt_AsLong() and range-checked:
 *   0..0x10ffff when Py_UNICODE_WIDE is defined, 0..0xffff otherwise, with
 *   OverflowError outside that range.
 *
 * TypeError "%c requires int or char" is set on the error path.
 * NOTE(review): presumably that path is taken for strings whose length is
 * not 1; the jumps and the return value are not visible in this listing.
 */
8169 formatchar(Py_UNICODE
*buf
,
8173 /* presume that the buffer is at least 2 characters long */
8174 if (PyUnicode_Check(v
)) {
8175 if (PyUnicode_GET_SIZE(v
) != 1)
8177 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
8180 else if (PyString_Check(v
)) {
8181 if (PyString_GET_SIZE(v
) != 1)
8183 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
8187 /* Integer input truncated to a character */
8189 x
= PyInt_AsLong(v
);
8190 if (x
== -1 && PyErr_Occurred())
8192 #ifdef Py_UNICODE_WIDE
8193 if (x
< 0 || x
> 0x10ffff) {
8194 PyErr_SetString(PyExc_OverflowError
,
8195 "%c arg not in range(0x110000) "
8196 "(wide Python build)");
8200 if (x
< 0 || x
> 0xffff) {
8201 PyErr_SetString(PyExc_OverflowError
,
8202 "%c arg not in range(0x10000) "
8203 "(narrow Python build)");
8207 buf
[0] = (Py_UNICODE
) x
;
8213 PyErr_SetString(PyExc_TypeError
,
8214 "%c requires int or char");
/* FORMATBUFLEN is a count of Py_UNICODE elements, not bytes: it sizes the
   formatbuf[] array in PyUnicode_Format(), and the callers of formatint()
   and formatchar() pass sizeof(formatbuf)/sizeof(Py_UNICODE) as the
   buffer length. */
8218 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8220 FORMATBUFLEN is the length of the buffer in which the ints &
8221 chars are formatted. XXX This is a magic number. Each formatting
8222 routine does bounds checking to ensure no overflow, but a better
8223 solution may be to malloc a buffer of appropriate size for each
8224 format. For now, the current solution is sufficient.
8226 #define FORMATBUFLEN (size_t)120
/* Implements "format % args" for unicode objects.
 *
 * Overview of what the body does:
 * - format and args must be non-NULL (PyErr_BadInternalCall() otherwise);
 *   format is converted with PyUnicode_FromObject() and scanned through
 *   fmt/fmtcnt.
 * - The result object starts at fmtcnt + 100 elements and is enlarged with
 *   _PyUnicode_Resize() when more room is needed; reslen is its current
 *   capacity and rescnt the unused remainder.
 * - args is either a tuple of positional values or, when its type has
 *   tp_as_mapping and it is neither a tuple nor a basestring, a mapping
 *   used for %(key) lookups via PyObject_GetItem().
 * - Each conversion specification is parsed as: optional (key) with
 *   balanced parentheses, flags (- + space # 0), width (digits or '*'),
 *   precision (digits or '*'), an ignored h/l/L length modifier, and the
 *   conversion character.
 * - String conversions go through PyObject_Unicode() / PyObject_Repr(),
 *   integers through formatint() or formatlong(), floats through
 *   formatfloat(), and %c through formatchar().
 * - Sign, zero/space padding and the 0x/0X prefix of alternate-form hex
 *   output are applied while the converted text is copied into the
 *   result.
 * - Finally the result is trimmed to reslen - rescnt elements.
 *
 * Errors visible in the body: TypeError for a missing mapping, a non-int
 * '*' argument, a non-number passed to a numeric conversion, a non-string
 * str() result, or unconverted arguments; ValueError for an incomplete
 * format key, an incomplete format, a width/precision that is too large,
 * or an unsupported conversion character.
 *
 * NOTE(review): the "(x*10) / 10 != x" tests on width and prec detect
 * overflow only after the signed multiplication has already overflowed,
 * which is undefined behaviour in C -- consider a pre-multiplication bound
 * check.
 * NOTE(review): the printable test in the "unsupported format character"
 * message starts at 31, which is a control character; 32 looks intended.
 * NOTE(review): several declarations, error jumps and the clean-up code
 * are not visible in this listing.
 */
8228 PyObject
*PyUnicode_Format(PyObject
*format
,
8231 Py_UNICODE
*fmt
, *res
;
8232 Py_ssize_t fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
8234 PyUnicodeObject
*result
= NULL
;
8235 PyObject
*dict
= NULL
;
8238 if (format
== NULL
|| args
== NULL
) {
8239 PyErr_BadInternalCall();
/* Work on a unicode view of the format object. */
8242 uformat
= PyUnicode_FromObject(format
);
8243 if (uformat
== NULL
)
8245 fmt
= PyUnicode_AS_UNICODE(uformat
);
8246 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
/* Start with room for the format text plus 100 elements of slack. */
8248 reslen
= rescnt
= fmtcnt
+ 100;
8249 result
= _PyUnicode_New(reslen
);
8252 res
= PyUnicode_AS_UNICODE(result
);
8254 if (PyTuple_Check(args
)) {
8255 arglen
= PyTuple_Size(args
);
/* Mapping test: the type has tp_as_mapping and args is neither a tuple
   nor a basestring. */
8262 if (Py_TYPE(args
)->tp_as_mapping
&& !PyTuple_Check(args
) &&
8263 !PyObject_TypeCheck(args
, &PyBaseString_Type
))
/* Main scan over the format text. */
8266 while (--fmtcnt
>= 0) {
8269 rescnt
= fmtcnt
+ 100;
8271 if (_PyUnicode_Resize(&result
, reslen
) < 0)
8273 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
8279 /* Got a format specifier */
8281 Py_ssize_t width
= -1;
8283 Py_UNICODE c
= '\0';
8287 PyObject
*temp
= NULL
;
8291 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{int,char}() */
8295 Py_UNICODE
*keystart
;
8301 PyErr_SetString(PyExc_TypeError
,
8302 "format requires a mapping");
8308 /* Skip over balanced parentheses */
8309 while (pcount
> 0 && --fmtcnt
>= 0) {
8312 else if (*fmt
== '(')
8316 keylen
= fmt
- keystart
- 1;
8317 if (fmtcnt
< 0 || pcount
> 0) {
8318 PyErr_SetString(PyExc_ValueError
,
8319 "incomplete format key");
8323 /* keys are converted to strings using UTF-8 and
8324 then looked up since Python uses strings to hold
8325 variables names etc. in its namespaces and we
8326 wouldn't want to break common idioms. */
8327 key
= PyUnicode_EncodeUTF8(keystart
,
8331 key
= PyUnicode_FromUnicode(keystart
, keylen
);
8339 args
= PyObject_GetItem(dict
, key
);
/* Collect the conversion flags. */
8348 while (--fmtcnt
>= 0) {
8349 switch (c
= *fmt
++) {
8350 case '-': flags
|= F_LJUST
; continue;
8351 case '+': flags
|= F_SIGN
; continue;
8352 case ' ': flags
|= F_BLANK
; continue;
8353 case '#': flags
|= F_ALT
; continue;
8354 case '0': flags
|= F_ZERO
; continue;
/* Field width: taken from the next argument or from literal digits. */
8359 v
= getnextarg(args
, arglen
, &argidx
);
8362 if (!PyInt_Check(v
)) {
8363 PyErr_SetString(PyExc_TypeError
,
8367 width
= PyInt_AsLong(v
);
8375 else if (c
>= '0' && c
<= '9') {
8377 while (--fmtcnt
>= 0) {
8379 if (c
< '0' || c
> '9')
8381 if ((width
*10) / 10 != width
) {
8382 PyErr_SetString(PyExc_ValueError
,
8386 width
= width
*10 + (c
- '0');
/* Precision: taken from the next argument or from literal digits. */
8394 v
= getnextarg(args
, arglen
, &argidx
);
8397 if (!PyInt_Check(v
)) {
8398 PyErr_SetString(PyExc_TypeError
,
8402 prec
= PyInt_AsLong(v
);
8408 else if (c
>= '0' && c
<= '9') {
8410 while (--fmtcnt
>= 0) {
8411 c
= Py_CHARMASK(*fmt
++);
8412 if (c
< '0' || c
> '9')
8414 if ((prec
*10) / 10 != prec
) {
8415 PyErr_SetString(PyExc_ValueError
,
8419 prec
= prec
*10 + (c
- '0');
8424 if (c
== 'h' || c
== 'l' || c
== 'L') {
8430 PyErr_SetString(PyExc_ValueError
,
8431 "incomplete format");
8435 v
= getnextarg(args
, arglen
, &argidx
);
8445 /* presume that buffer length is at least 1 */
8452 if (PyUnicode_Check(v
) && c
== 's') {
8459 temp
= PyObject_Unicode(v
);
8461 temp
= PyObject_Repr(v
);
8464 if (PyUnicode_Check(temp
))
8465 /* nothing to do */;
8466 else if (PyString_Check(temp
)) {
8467 /* convert the str object to Unicode */
8468 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
8469 PyString_GET_SIZE(temp
),
8479 PyErr_SetString(PyExc_TypeError
,
8480 "%s argument has non-string str()");
8484 pbuf
= PyUnicode_AS_UNICODE(temp
);
8485 len
= PyUnicode_GET_SIZE(temp
);
8486 if (prec
>= 0 && len
> prec
)
8499 if (PyNumber_Check(v
)) {
8500 PyObject
*iobj
=NULL
;
8502 if (PyInt_Check(v
) || (PyLong_Check(v
))) {
8507 iobj
= PyNumber_Int(v
);
8508 if (iobj
==NULL
) iobj
= PyNumber_Long(v
);
8511 if (PyInt_Check(iobj
)) {
8514 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
8515 flags
, prec
, c
, iobj
);
8521 else if (PyLong_Check(iobj
)) {
8523 temp
= formatlong(iobj
, flags
, prec
, c
);
8527 pbuf
= PyUnicode_AS_UNICODE(temp
);
8528 len
= PyUnicode_GET_SIZE(temp
);
8537 PyErr_Format(PyExc_TypeError
,
8538 "%%%c format: a number is required, "
8539 "not %.200s", (char)c
, Py_TYPE(v
)->tp_name
);
8552 temp
= formatfloat(v
, flags
, prec
, c
);
8555 pbuf
= PyUnicode_AS_UNICODE(temp
);
8556 len
= PyUnicode_GET_SIZE(temp
);
8564 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
8570 PyErr_Format(PyExc_ValueError
,
8571 "unsupported format character '%c' (0x%x) "
8573 (31<=c
&& c
<=126) ? (char)c
: '?',
8575 (Py_ssize_t
)(fmt
- 1 -
8576 PyUnicode_AS_UNICODE(uformat
)));
8580 if (*pbuf
== '-' || *pbuf
== '+') {
8584 else if (flags
& F_SIGN
)
8586 else if (flags
& F_BLANK
)
8593 if (rescnt
- (sign
!= 0) < width
) {
8595 rescnt
= width
+ fmtcnt
+ 100;
8602 if (_PyUnicode_Resize(&result
, reslen
) < 0) {
8606 res
= PyUnicode_AS_UNICODE(result
)
8616 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8617 assert(pbuf
[0] == '0');
8618 assert(pbuf
[1] == c
);
8629 if (width
> len
&& !(flags
& F_LJUST
)) {
8633 } while (--width
> len
);
8638 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8639 assert(pbuf
[0] == '0');
8640 assert(pbuf
[1] == c
);
8645 Py_UNICODE_COPY(res
, pbuf
, len
);
8648 while (--width
>= len
) {
8652 if (dict
&& (argidx
< arglen
) && c
!= '%') {
8653 PyErr_SetString(PyExc_TypeError
,
8654 "not all arguments converted during string formatting");
/* Positional arguments left over after the scan are an error. */
8661 if (argidx
< arglen
&& !dict
) {
8662 PyErr_SetString(PyExc_TypeError
,
8663 "not all arguments converted during string formatting");
/* Trim the result to the number of elements actually written. */
8667 if (_PyUnicode_Resize(&result
, reslen
- rescnt
) < 0)
8673 return (PyObject
*)result
;
/* Buffer protocol table of the unicode type (PyUnicode_Type.tp_as_buffer),
   in slot order: read buffer, write buffer, segment count, char buffer.
   Each slot is bound to the matching unicode_buffer_get*() function. */
8684 static PyBufferProcs unicode_as_buffer
= {
8685 (readbufferproc
) unicode_buffer_getreadbuf
,
8686 (writebufferproc
) unicode_buffer_getwritebuf
,
8687 (segcountproc
) unicode_buffer_getsegcount
,
8688 (charbufferproc
) unicode_buffer_getcharbuf
,
/* Forward declaration: unicode_new() calls unicode_subtype_new() before
   the latter is defined, and unicode_subtype_new() in turn calls
   unicode_new(). */
8692 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
/* tp_new of the unicode type: unicode(string[, encoding[, errors]]).
 *
 * A request for a subtype is delegated to unicode_subtype_new().  For the
 * exact type the arguments are parsed with "|Oss:unicode" under the
 * keyword names string, encoding and errors.  An empty unicode object is
 * returned on one path; when neither encoding nor errors was given the
 * object is converted with PyObject_Unicode(); otherwise it is decoded
 * with PyUnicode_FromEncodedObject().
 *
 * NOTE(review): the declaration of x and the test guarding the
 * empty-string return (presumably "no object was passed") are not visible
 * in this listing.
 */
8695 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8698 static char *kwlist
[] = {"string", "encoding", "errors", 0};
8699 char *encoding
= NULL
;
8700 char *errors
= NULL
;
8702 if (type
!= &PyUnicode_Type
)
8703 return unicode_subtype_new(type
, args
, kwds
);
8704 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
8705 kwlist
, &x
, &encoding
, &errors
))
8708 return (PyObject
*)_PyUnicode_New(0);
8709 if (encoding
== NULL
&& errors
== NULL
)
8710 return PyObject_Unicode(x
);
8712 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
/* tp_new for subtypes of unicode.
 *
 * Builds a temporary exact unicode object by calling
 * unicode_new(&PyUnicode_Type, args, kwds), allocates an instance of the
 * requested type through type->tp_alloc, gives that instance its own
 * buffer of n + 1 Py_UNICODE elements (n being the temporary's length),
 * copies those n + 1 elements and the cached hash from the temporary, and
 * returns the new instance.
 *
 * If the buffer allocation fails the partly built instance is detached
 * with _Py_ForgetReference() and PyErr_NoMemory() is returned.
 *
 * NOTE(review): the NULL checks on tmp and pnew, the store of pnew->length
 * and the release of tmp are not visible in this listing -- confirm.
 */
8716 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8718 PyUnicodeObject
*tmp
, *pnew
;
8721 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
8722 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
8725 assert(PyUnicode_Check(tmp
));
8726 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
/* The subtype instance owns a private copy of the character data. */
8731 pnew
->str
= (Py_UNICODE
*) PyObject_MALLOC(sizeof(Py_UNICODE
) * (n
+1));
8732 if (pnew
->str
== NULL
) {
8733 _Py_ForgetReference((PyObject
*)pnew
);
8736 return PyErr_NoMemory();
8738 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
8740 pnew
->hash
= tmp
->hash
;
8742 return (PyObject
*)pnew
;
/* Docstring of the unicode type itself (PyUnicode_Type.tp_doc). */
8745 PyDoc_STRVAR(unicode_doc
,
8746 "unicode(string [, encoding[, errors]]) -> object\n\
8748 Create a new Unicode object from the given encoded string.\n\
8749 encoding defaults to the current default string encoding.\n\
8750 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
/* Type object of the built-in "unicode" type.
 *
 * It wires together the tables and functions defined above: the number,
 * sequence, mapping and buffer protocol tables, unicode_methods,
 * unicode_new as the constructor and PyUnicode_RichCompare for
 * comparisons.  The base type is PyBaseString_Type and the type is
 * subclassable (Py_TPFLAGS_BASETYPE, Py_TPFLAGS_UNICODE_SUBCLASS).
 * tp_itemsize is 0: the character data is not stored inline in the
 * object (see the separate PyObject_MALLOC of str in
 * unicode_subtype_new()).
 *
 * NOTE(review): slots whose initialisers are not visible in this listing
 * are not described here.
 */
8752 PyTypeObject PyUnicode_Type
= {
8753 PyVarObject_HEAD_INIT(&PyType_Type
, 0)
8754 "unicode", /* tp_name */
8755 sizeof(PyUnicodeObject
), /* tp_size */
8756 0, /* tp_itemsize */
8758 (destructor
)unicode_dealloc
, /* tp_dealloc */
8763 unicode_repr
, /* tp_repr */
8764 &unicode_as_number
, /* tp_as_number */
8765 &unicode_as_sequence
, /* tp_as_sequence */
8766 &unicode_as_mapping
, /* tp_as_mapping */
8767 (hashfunc
) unicode_hash
, /* tp_hash*/
8769 (reprfunc
) unicode_str
, /* tp_str */
8770 PyObject_GenericGetAttr
, /* tp_getattro */
8771 0, /* tp_setattro */
8772 &unicode_as_buffer
, /* tp_as_buffer */
8773 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_CHECKTYPES
|
8774 Py_TPFLAGS_BASETYPE
| Py_TPFLAGS_UNICODE_SUBCLASS
, /* tp_flags */
8775 unicode_doc
, /* tp_doc */
8776 0, /* tp_traverse */
8778 PyUnicode_RichCompare
, /* tp_richcompare */
8779 0, /* tp_weaklistoffset */
8781 0, /* tp_iternext */
8782 unicode_methods
, /* tp_methods */
8785 &PyBaseString_Type
, /* tp_base */
8787 0, /* tp_descr_get */
8788 0, /* tp_descr_set */
8789 0, /* tp_dictoffset */
8792 unicode_new
, /* tp_new */
8793 PyObject_Del
, /* tp_free */
8796 /* Initialize the Unicode implementation */
/* Start-up of the unicode implementation.
 *
 * In order: creates the unicode_empty object, sets the default encoding
 * to "ascii", clears the 256 entries of unicode_latin1[], readies
 * PyUnicode_Type (a failure is fatal), builds bloom_linebreak from the
 * code points listed in linebreak[], and readies EncodingMapType.
 *
 * NOTE(review): the result of PyType_Ready(&EncodingMapType) is ignored,
 * unlike the check made for PyUnicode_Type.
 * NOTE(review): the NULL check on unicode_empty and the free-list reset
 * are not visible in this listing.
 */
8798 void _PyUnicode_Init(void)
8802 /* XXX - move this array to unicodectype.c ? */
8803 Py_UNICODE linebreak
[] = {
8804 0x000A, /* LINE FEED */
8805 0x000D, /* CARRIAGE RETURN */
8806 0x001C, /* FILE SEPARATOR */
8807 0x001D, /* GROUP SEPARATOR */
8808 0x001E, /* RECORD SEPARATOR */
8809 0x0085, /* NEXT LINE */
8810 0x2028, /* LINE SEPARATOR */
8811 0x2029, /* PARAGRAPH SEPARATOR */
8814 /* Init the implementation */
8817 unicode_empty
= _PyUnicode_New(0);
8821 strcpy(unicode_default_encoding
, "ascii");
8822 for (i
= 0; i
< 256; i
++)
8823 unicode_latin1
[i
] = NULL
;
8824 if (PyType_Ready(&PyUnicode_Type
) < 0)
8825 Py_FatalError("Can't initialize 'unicode'");
8827 /* initialize the linebreak bloom filter */
8828 bloom_linebreak
= make_bloom_mask(
8829 linebreak
, sizeof(linebreak
) / sizeof(linebreak
[0])
8832 PyType_Ready(&EncodingMapType
);
8835 /* Finalize the Unicode implementation */
/* Releases every object kept on the unicode free list and returns the
 * value numfree had on entry.
 *
 * The list is walked from free_list; the first pointer-sized word of each
 * entry is read as the link to the next entry.  For each entry the
 * character buffer is freed with PyObject_DEL() and the reference to the
 * cached defenc object, if any, is dropped.  After the walk numfree is
 * asserted to be 0.
 *
 * NOTE(review): the release of the entry itself and the updates of
 * numfree and free_list are not visible in this listing.
 */
8838 PyUnicode_ClearFreeList(void)
8840 int freelist_size
= numfree
;
8843 for (u
= free_list
; u
!= NULL
;) {
8844 PyUnicodeObject
*v
= u
;
8845 u
= *(PyUnicodeObject
**)u
;
8847 PyObject_DEL(v
->str
);
8848 Py_XDECREF(v
->defenc
);
8853 assert(numfree
== 0);
8854 return freelist_size
;
8858 _PyUnicode_Fini(void)
8862 Py_XDECREF(unicode_empty
);
8863 unicode_empty
= NULL
;
8865 for (i
= 0; i
< 256; i
++) {
8866 if (unicode_latin1
[i
]) {
8867 Py_DECREF(unicode_latin1
[i
]);
8868 unicode_latin1
[i
] = NULL
;
8871 (void)PyUnicode_ClearFreeList();