3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
45 #include "unicodeobject.h"
52 /* Limit for the Unicode object free list */
54 #define PyUnicode_MAXFREELIST 1024
56 /* Limit for the Unicode object free list stay alive optimization.
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
62 At worst this will result in PyUnicode_MAXFREELIST *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
66 Setting the limit to 0 effectively turns the feature off.
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
80 # define BYTEORDER_IS_LITTLE_ENDIAN
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
95 /* Free list for Unicode objects */
96 static PyUnicodeObject
*free_list
;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject
*unicode_empty
;
102 /* Single character Unicode strings in the Latin-1 range are being
104 static PyUnicodeObject
*unicode_latin1
[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding
[100];
115 /* Fast detection of the most frequent whitespace characters */
116 const unsigned char _Py_ascii_whitespace
[] = {
117 0, 0, 0, 0, 0, 0, 0, 0,
118 /* case 0x0009: * CHARACTER TABULATION */
119 /* case 0x000A: * LINE FEED */
120 /* case 0x000B: * LINE TABULATION */
121 /* case 0x000C: * FORM FEED */
122 /* case 0x000D: * CARRIAGE RETURN */
123 0, 1, 1, 1, 1, 1, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0,
125 /* case 0x001C: * FILE SEPARATOR */
126 /* case 0x001D: * GROUP SEPARATOR */
127 /* case 0x001E: * RECORD SEPARATOR */
128 /* case 0x001F: * UNIT SEPARATOR */
129 0, 0, 0, 0, 1, 1, 1, 1,
130 /* case 0x0020: * SPACE */
131 1, 0, 0, 0, 0, 0, 0, 0,
132 0, 0, 0, 0, 0, 0, 0, 0,
133 0, 0, 0, 0, 0, 0, 0, 0,
134 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, 0, 0, 0, 0,
137 0, 0, 0, 0, 0, 0, 0, 0,
138 0, 0, 0, 0, 0, 0, 0, 0,
139 0, 0, 0, 0, 0, 0, 0, 0,
140 0, 0, 0, 0, 0, 0, 0, 0,
141 0, 0, 0, 0, 0, 0, 0, 0,
142 0, 0, 0, 0, 0, 0, 0, 0,
143 0, 0, 0, 0, 0, 0, 0, 0
146 /* Same for linebreaks */
147 static unsigned char ascii_linebreak
[] = {
148 0, 0, 0, 0, 0, 0, 0, 0,
149 /* 0x000A, * LINE FEED */
150 /* 0x000B, * LINE TABULATION */
151 /* 0x000C, * FORM FEED */
152 /* 0x000D, * CARRIAGE RETURN */
153 0, 0, 1, 1, 1, 1, 0, 0,
154 0, 0, 0, 0, 0, 0, 0, 0,
155 /* 0x001C, * FILE SEPARATOR */
156 /* 0x001D, * GROUP SEPARATOR */
157 /* 0x001E, * RECORD SEPARATOR */
158 0, 0, 0, 0, 1, 1, 1, 0,
159 0, 0, 0, 0, 0, 0, 0, 0,
160 0, 0, 0, 0, 0, 0, 0, 0,
161 0, 0, 0, 0, 0, 0, 0, 0,
162 0, 0, 0, 0, 0, 0, 0, 0,
164 0, 0, 0, 0, 0, 0, 0, 0,
165 0, 0, 0, 0, 0, 0, 0, 0,
166 0, 0, 0, 0, 0, 0, 0, 0,
167 0, 0, 0, 0, 0, 0, 0, 0,
168 0, 0, 0, 0, 0, 0, 0, 0,
169 0, 0, 0, 0, 0, 0, 0, 0,
170 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0
176 PyUnicode_GetMax(void)
178 #ifdef Py_UNICODE_WIDE
181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
187 /* --- Bloom Filters ----------------------------------------------------- */
189 /* Stuff to implement simple "bloom filters" for Unicode characters.
190 To keep things simple, we use a single bitmask, using the low bits of
191 each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
193 /* the linebreak mask is set up by _PyUnicode_Init() below */
196 #define BLOOM_WIDTH 128
198 #define BLOOM_WIDTH 64
200 #define BLOOM_WIDTH 32
202 #error "LONG_BIT is smaller than 32"
205 #define BLOOM_MASK unsigned long
207 static BLOOM_MASK bloom_linebreak
;
/* Bloom-filter primitives.  The bit index of a character is taken from its
   low bits, ch & (BLOOM_WIDTH - 1).
   BLOOM_ADD(mask, ch) sets that bit in mask; mask must be a modifiable
   lvalue.  BLOOM(mask, ch) is nonzero when that bit is set in mask.
   Every macro argument is parenthesized so that compound expressions such
   as BLOOM(a | b, ch) group as the caller expects. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
/* Linebreak test for a single character.  Values below 128 are looked up
   directly in the ascii_linebreak table; anything else must first pass the
   bloom_linebreak mask and is then confirmed with Py_UNICODE_ISLINEBREAK().
   Note that ch is evaluated more than once. */
212 #define BLOOM_LINEBREAK(ch) \
213 ((ch) < 128U ? ascii_linebreak[(ch)] : \
214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
216 Py_LOCAL_INLINE(BLOOM_MASK
) make_bloom_mask(Py_UNICODE
* ptr
, Py_ssize_t len
)
218 /* calculate simple bloom-style bitmask for a given unicode string */
224 for (i
= 0; i
< len
; i
++)
225 BLOOM_ADD(mask
, ptr
[i
]);
230 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr
, Py_UNICODE
* set
, Py_ssize_t setlen
)
234 for (i
= 0; i
< setlen
; i
++)
/* Set-membership test: the cheap BLOOM() mask check runs first and
   unicode_member() is only consulted when the mask does not rule the
   character out.  chr is evaluated more than once.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators without changing its meaning. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
244 /* --- Unicode Object ----------------------------------------------------- */
247 int unicode_resize(register PyUnicodeObject
*unicode
,
252 /* Shortcut if there's nothing much to do. */
253 if (unicode
->length
== length
)
256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
260 if (unicode
== unicode_empty
||
261 (unicode
->length
== 1 &&
262 unicode
->str
[0] < 256U &&
263 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
264 PyErr_SetString(PyExc_SystemError
,
265 "can't resize shared unicode objects");
269 /* We allocate one extra Py_UNICODE element to make sure the string is U+0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
271 safe to look at str[length] (without making any assumptions about what
274 oldstr
= unicode
->str
;
275 unicode
->str
= PyObject_REALLOC(unicode
->str
,
276 sizeof(Py_UNICODE
) * (length
+ 1));
278 unicode
->str
= (Py_UNICODE
*)oldstr
;
282 unicode
->str
[length
] = 0;
283 unicode
->length
= length
;
286 /* Reset the object caches */
287 if (unicode
->defenc
) {
288 Py_DECREF(unicode
->defenc
);
289 unicode
->defenc
= NULL
;
296 /* We allocate one extra Py_UNICODE element to make sure the string is
297 U+0000 terminated -- XXX is this needed ?
299 XXX This allocator could further be enhanced by assuring that the
300 free list never reduces its size below 1.
305 PyUnicodeObject
*_PyUnicode_New(Py_ssize_t length
)
307 register PyUnicodeObject
*unicode
;
309 /* Optimization for empty strings */
310 if (length
== 0 && unicode_empty
!= NULL
) {
311 Py_INCREF(unicode_empty
);
312 return unicode_empty
;
315 /* Ensure we won't overflow the size. */
316 if (length
> ((PY_SSIZE_T_MAX
/ sizeof(Py_UNICODE
)) - 1)) {
317 return (PyUnicodeObject
*)PyErr_NoMemory();
320 /* Unicode freelist & memory allocation */
323 free_list
= *(PyUnicodeObject
**)unicode
;
326 /* Keep-Alive optimization: we only upsize the buffer,
327 never downsize it. */
328 if ((unicode
->length
< length
) &&
329 unicode_resize(unicode
, length
) < 0) {
330 PyObject_DEL(unicode
->str
);
335 size_t new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
336 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
338 PyObject_INIT(unicode
, &PyUnicode_Type
);
342 unicode
= PyObject_New(PyUnicodeObject
, &PyUnicode_Type
);
345 new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
346 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
353 /* Initialize the first element to guard against cases where
354 * the caller fails before initializing str -- unicode_resize()
355 * reads str[0], and the Keep-Alive optimization can keep memory
356 * allocated for str alive across a call to unicode_dealloc(unicode).
357 * We don't want unicode_resize to read uninitialized memory in
361 unicode
->str
[length
] = 0;
362 unicode
->length
= length
;
364 unicode
->defenc
= NULL
;
368 /* XXX UNREF/NEWREF interface should be more symmetrical */
370 _Py_ForgetReference((PyObject
*)unicode
);
371 PyObject_Del(unicode
);
376 void unicode_dealloc(register PyUnicodeObject
*unicode
)
378 if (PyUnicode_CheckExact(unicode
) &&
379 numfree
< PyUnicode_MAXFREELIST
) {
380 /* Keep-Alive optimization */
381 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
382 PyObject_DEL(unicode
->str
);
386 if (unicode
->defenc
) {
387 Py_DECREF(unicode
->defenc
);
388 unicode
->defenc
= NULL
;
390 /* Add to free list */
391 *(PyUnicodeObject
**)unicode
= free_list
;
396 PyObject_DEL(unicode
->str
);
397 Py_XDECREF(unicode
->defenc
);
398 Py_TYPE(unicode
)->tp_free((PyObject
*)unicode
);
403 int _PyUnicode_Resize(PyUnicodeObject
**unicode
, Py_ssize_t length
)
405 register PyUnicodeObject
*v
;
407 /* Argument checks */
408 if (unicode
== NULL
) {
409 PyErr_BadInternalCall();
413 if (v
== NULL
|| !PyUnicode_Check(v
) || Py_REFCNT(v
) != 1 || length
< 0) {
414 PyErr_BadInternalCall();
418 /* Resizing unicode_empty and single character objects is not
419 possible since these are being shared. We simply return a fresh
420 copy with the same Unicode content. */
421 if (v
->length
!= length
&&
422 (v
== unicode_empty
|| v
->length
== 1)) {
423 PyUnicodeObject
*w
= _PyUnicode_New(length
);
426 Py_UNICODE_COPY(w
->str
, v
->str
,
427 length
< v
->length
? length
: v
->length
);
433 /* Note that we don't have to modify *unicode for unshared Unicode
434 objects, since we can modify them in-place. */
435 return unicode_resize(v
, length
);
/* Public resize entry point: casts the PyObject ** argument to
   PyUnicodeObject ** and forwards it, together with length, to
   _PyUnicode_Resize(), whose result is returned unchanged. */
438 int PyUnicode_Resize(PyObject
**unicode
, Py_ssize_t length
)
440 return _PyUnicode_Resize((PyUnicodeObject
**)unicode
, length
);
443 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
446 PyUnicodeObject
*unicode
;
448 /* If the Unicode data is known at construction time, we can apply
449 some optimizations which share commonly used objects. */
452 /* Optimization for empty strings */
453 if (size
== 0 && unicode_empty
!= NULL
) {
454 Py_INCREF(unicode_empty
);
455 return (PyObject
*)unicode_empty
;
458 /* Single character Unicode objects in the Latin-1 range are
459 shared when using this constructor */
460 if (size
== 1 && *u
< 256) {
461 unicode
= unicode_latin1
[*u
];
463 unicode
= _PyUnicode_New(1);
466 unicode
->str
[0] = *u
;
467 unicode_latin1
[*u
] = unicode
;
470 return (PyObject
*)unicode
;
474 unicode
= _PyUnicode_New(size
);
478 /* Copy the Unicode data into the new object */
480 Py_UNICODE_COPY(unicode
->str
, u
, size
);
482 return (PyObject
*)unicode
;
485 PyObject
*PyUnicode_FromStringAndSize(const char *u
, Py_ssize_t size
)
487 PyUnicodeObject
*unicode
;
490 PyErr_SetString(PyExc_SystemError
,
491 "Negative size passed to PyUnicode_FromStringAndSize");
495 /* If the Unicode data is known at construction time, we can apply
496 some optimizations which share commonly used objects.
497 Also, this means the input must be UTF-8, so fall back to the
498 UTF-8 decoder at the end. */
501 /* Optimization for empty strings */
502 if (size
== 0 && unicode_empty
!= NULL
) {
503 Py_INCREF(unicode_empty
);
504 return (PyObject
*)unicode_empty
;
507 /* Single characters are shared when using this constructor.
508 Restrict to ASCII, since the input must be UTF-8. */
509 if (size
== 1 && Py_CHARMASK(*u
) < 128) {
510 unicode
= unicode_latin1
[Py_CHARMASK(*u
)];
512 unicode
= _PyUnicode_New(1);
515 unicode
->str
[0] = Py_CHARMASK(*u
);
516 unicode_latin1
[Py_CHARMASK(*u
)] = unicode
;
519 return (PyObject
*)unicode
;
522 return PyUnicode_DecodeUTF8(u
, size
, NULL
);
525 unicode
= _PyUnicode_New(size
);
529 return (PyObject
*)unicode
;
/* Create a Unicode object from a NUL-terminated C string.  The length is
   measured with strlen(), so u must not be NULL.  A length greater than
   PY_SSIZE_T_MAX sets OverflowError ("input too long"); otherwise the
   work is delegated to PyUnicode_FromStringAndSize().
   NOTE(review): the statement following the OverflowError is not visible
   in this listing -- presumably the function returns NULL there; confirm. */
532 PyObject
*PyUnicode_FromString(const char *u
)
534 size_t size
= strlen(u
);
535 if (size
> PY_SSIZE_T_MAX
) {
536 PyErr_SetString(PyExc_OverflowError
, "input too long");
540 return PyUnicode_FromStringAndSize(u
, size
);
545 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
546 # define CONVERT_WCHAR_TO_SURROGATES
549 #ifdef CONVERT_WCHAR_TO_SURROGATES
551 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
552 to convert from UTF32 to UTF16. */
554 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
557 PyUnicodeObject
*unicode
;
558 register Py_ssize_t i
;
560 const wchar_t *orig_w
;
563 PyErr_BadInternalCall();
569 for (i
= size
; i
> 0; i
--) {
575 unicode
= _PyUnicode_New(alloc
);
579 /* Copy the wchar_t data into the new object */
581 register Py_UNICODE
*u
;
582 u
= PyUnicode_AS_UNICODE(unicode
);
583 for (i
= size
; i
> 0; i
--) {
585 wchar_t ordinal
= *w
++;
587 *u
++ = 0xD800 | (ordinal
>> 10);
588 *u
++ = 0xDC00 | (ordinal
& 0x3FF);
594 return (PyObject
*)unicode
;
599 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
602 PyUnicodeObject
*unicode
;
605 PyErr_BadInternalCall();
609 unicode
= _PyUnicode_New(size
);
613 /* Copy the wchar_t data into the new object */
614 #ifdef HAVE_USABLE_WCHAR_T
615 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
618 register Py_UNICODE
*u
;
619 register Py_ssize_t i
;
620 u
= PyUnicode_AS_UNICODE(unicode
);
621 for (i
= size
; i
> 0; i
--)
626 return (PyObject
*)unicode
;
629 #endif /* CONVERT_WCHAR_TO_SURROGATES */
631 #undef CONVERT_WCHAR_TO_SURROGATES
634 makefmt(char *fmt
, int longflag
, int size_tflag
, int zeropad
, int width
, int precision
, char c
)
640 fmt
+= sprintf(fmt
, "%d", width
);
643 fmt
+= sprintf(fmt
, ".%d", precision
);
646 else if (size_tflag
) {
647 char *f
= PY_FORMAT_SIZE_T
;
655 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
658 PyUnicode_FromFormatV(const char *format
, va_list vargs
)
661 Py_ssize_t callcount
= 0;
662 PyObject
**callresults
= NULL
;
663 PyObject
**callresult
= NULL
;
671 /* used by sprintf */
673 /* use abuffer instead of buffer, if we need more space
674 * (which can happen if there's a format specifier with width). */
675 char *abuffer
= NULL
;
677 Py_ssize_t abuffersize
= 0;
678 char fmt
[60]; /* should be enough for %0width.precisionld */
681 #ifdef VA_LIST_IS_ARRAY
682 Py_MEMCPY(count
, vargs
, sizeof(va_list));
685 __va_copy(count
, vargs
);
690 /* step 1: count the number of %S/%R/%s format specifications
691 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
692 * objects once during step 3 and put the result in an array) */
693 for (f
= format
; *f
; f
++) {
697 if (*(f
+1)=='S' || *(f
+1)=='R')
699 while (isdigit((unsigned)*f
))
700 width
= (width
*10) + *f
++ - '0';
701 while (*++f
&& *f
!= '%' && !isalpha((unsigned)*f
))
707 /* step 2: allocate memory for the results of
708 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
710 callresults
= PyObject_Malloc(sizeof(PyObject
*)*callcount
);
715 callresult
= callresults
;
717 /* step 3: figure out how large a buffer we need */
718 for (f
= format
; *f
; f
++) {
722 while (isdigit((unsigned)*f
))
723 width
= (width
*10) + *f
++ - '0';
724 while (*++f
&& *f
!= '%' && !isalpha((unsigned)*f
))
727 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
728 * they don't affect the amount of space we reserve.
730 if ((*f
== 'l' || *f
== 'z') &&
731 (f
[1] == 'd' || f
[1] == 'u'))
736 (void)va_arg(count
, int);
737 /* fall through... */
741 case 'd': case 'u': case 'i': case 'x':
742 (void) va_arg(count
, int);
743 /* 20 bytes is enough to hold a 64-bit
744 integer. Decimal takes the most space.
745 This isn't enough for octal.
746 If a width is specified we need more
747 (which we allocate later). */
751 if (abuffersize
< width
)
757 const char *s
= va_arg(count
, const char*);
758 PyObject
*str
= PyUnicode_DecodeUTF8(s
, strlen(s
), "replace");
761 n
+= PyUnicode_GET_SIZE(str
);
762 /* Remember the str and switch to the next slot */
768 PyObject
*obj
= va_arg(count
, PyObject
*);
769 assert(obj
&& PyUnicode_Check(obj
));
770 n
+= PyUnicode_GET_SIZE(obj
);
775 PyObject
*obj
= va_arg(count
, PyObject
*);
776 const char *str
= va_arg(count
, const char *);
778 assert(!obj
|| PyUnicode_Check(obj
));
780 n
+= PyUnicode_GET_SIZE(obj
);
787 PyObject
*obj
= va_arg(count
, PyObject
*);
790 str
= PyObject_Str(obj
);
793 n
+= PyUnicode_GET_SIZE(str
);
794 /* Remember the str and switch to the next slot */
800 PyObject
*obj
= va_arg(count
, PyObject
*);
803 repr
= PyObject_Repr(obj
);
806 n
+= PyUnicode_GET_SIZE(repr
);
807 /* Remember the repr and switch to the next slot */
808 *callresult
++ = repr
;
812 (void) va_arg(count
, int);
813 /* maximum 64-bit pointer representation:
815 * so 19 characters is enough.
816 * XXX I count 18 -- what's the extra for?
821 /* if we stumble upon an unknown
822 formatting code, copy the rest of
823 the format string to the output
824 string. (we cannot just skip the
825 code, since there's no way to know
826 what's in the argument list) */
834 if (abuffersize
> 20) {
835 abuffer
= PyObject_Malloc(abuffersize
);
840 realbuffer
= abuffer
;
844 /* step 4: fill the buffer */
845 /* Since we've analyzed how much space we need for the worst case,
846 we don't have to resize the string.
847 There can be no errors beyond this point. */
848 string
= PyUnicode_FromUnicode(NULL
, n
);
852 s
= PyUnicode_AS_UNICODE(string
);
853 callresult
= callresults
;
855 for (f
= format
; *f
; f
++) {
860 zeropad
= (*f
== '0');
861 /* parse the width.precision part */
863 while (isdigit((unsigned)*f
))
864 width
= (width
*10) + *f
++ - '0';
868 while (isdigit((unsigned)*f
))
869 precision
= (precision
*10) + *f
++ - '0';
871 /* handle the long flag, but only for %ld and %lu.
872 others can be added when necessary. */
873 if (*f
== 'l' && (f
[1] == 'd' || f
[1] == 'u')) {
877 /* handle the size_t flag. */
878 if (*f
== 'z' && (f
[1] == 'd' || f
[1] == 'u')) {
885 *s
++ = va_arg(vargs
, int);
888 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'd');
890 sprintf(realbuffer
, fmt
, va_arg(vargs
, long));
892 sprintf(realbuffer
, fmt
, va_arg(vargs
, Py_ssize_t
));
894 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
895 appendstring(realbuffer
);
898 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'u');
900 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned long));
902 sprintf(realbuffer
, fmt
, va_arg(vargs
, size_t));
904 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned int));
905 appendstring(realbuffer
);
908 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'i');
909 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
910 appendstring(realbuffer
);
913 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'x');
914 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
915 appendstring(realbuffer
);
919 /* unused, since we already have the result */
920 (void) va_arg(vargs
, char *);
921 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(*callresult
),
922 PyUnicode_GET_SIZE(*callresult
));
923 s
+= PyUnicode_GET_SIZE(*callresult
);
924 /* We're done with the unicode()/repr() => forget it */
925 Py_DECREF(*callresult
);
926 /* switch to next unicode()/repr() result */
932 PyObject
*obj
= va_arg(vargs
, PyObject
*);
933 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
934 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
940 PyObject
*obj
= va_arg(vargs
, PyObject
*);
941 const char *str
= va_arg(vargs
, const char *);
943 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
944 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
957 /* unused, since we already have the result */
958 (void) va_arg(vargs
, PyObject
*);
959 ucopy
= PyUnicode_AS_UNICODE(*callresult
);
960 usize
= PyUnicode_GET_SIZE(*callresult
);
961 for (upos
= 0; upos
<usize
;)
962 *s
++ = ucopy
[upos
++];
963 /* We're done with the unicode()/repr() => forget it */
964 Py_DECREF(*callresult
);
965 /* switch to next unicode()/repr() result */
970 sprintf(buffer
, "%p", va_arg(vargs
, void*));
971 /* %p is ill-defined: ensure leading 0x. */
972 if (buffer
[1] == 'X')
974 else if (buffer
[1] != 'x') {
975 memmove(buffer
+2, buffer
, strlen(buffer
)+1);
979 appendstring(buffer
);
994 PyObject_Free(callresults
);
996 PyObject_Free(abuffer
);
997 PyUnicode_Resize(&string
, s
- PyUnicode_AS_UNICODE(string
));
1001 PyObject
**callresult2
= callresults
;
1002 while (callresult2
< callresult
) {
1003 Py_DECREF(*callresult2
);
1006 PyObject_Free(callresults
);
1009 PyObject_Free(abuffer
);
1016 PyUnicode_FromFormat(const char *format
, ...)
1021 #ifdef HAVE_STDARG_PROTOTYPES
1022 va_start(vargs
, format
);
1026 ret
= PyUnicode_FromFormatV(format
, vargs
);
1031 Py_ssize_t
PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
1035 if (unicode
== NULL
) {
1036 PyErr_BadInternalCall();
1040 /* If possible, try to copy the 0-termination as well */
1041 if (size
> PyUnicode_GET_SIZE(unicode
))
1042 size
= PyUnicode_GET_SIZE(unicode
) + 1;
1044 #ifdef HAVE_USABLE_WCHAR_T
1045 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
1048 register Py_UNICODE
*u
;
1049 register Py_ssize_t i
;
1050 u
= PyUnicode_AS_UNICODE(unicode
);
1051 for (i
= size
; i
> 0; i
--)
1056 if (size
> PyUnicode_GET_SIZE(unicode
))
1057 return PyUnicode_GET_SIZE(unicode
);
1064 PyObject
*PyUnicode_FromOrdinal(int ordinal
)
1068 #ifdef Py_UNICODE_WIDE
1069 if (ordinal
< 0 || ordinal
> 0x10ffff) {
1070 PyErr_SetString(PyExc_ValueError
,
1071 "unichr() arg not in range(0x110000) "
1072 "(wide Python build)");
1076 if (ordinal
< 0 || ordinal
> 0xffff) {
1077 PyErr_SetString(PyExc_ValueError
,
1078 "unichr() arg not in range(0x10000) "
1079 "(narrow Python build)");
1084 s
[0] = (Py_UNICODE
)ordinal
;
1085 return PyUnicode_FromUnicode(s
, 1);
1088 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
1090 /* XXX Perhaps we should make this API an alias of
1091 PyObject_Unicode() instead ?! */
1092 if (PyUnicode_CheckExact(obj
)) {
1096 if (PyUnicode_Check(obj
)) {
1097 /* For a Unicode subtype that's not a Unicode object,
1098 return a true Unicode object with the same data. */
1099 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
1100 PyUnicode_GET_SIZE(obj
));
1102 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
1105 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
1106 const char *encoding
,
1109 const char *s
= NULL
;
1114 PyErr_BadInternalCall();
1119 /* For backward compatibility we also accept Unicode objects provided
1120 that no encoding is given and then redirect to
1121 PyObject_Unicode() which then applies the additional logic for
1124 NOTE: This API should really only be used for objects which
1125 represent *encoded* Unicode!
1128 if (PyUnicode_Check(obj
)) {
1130 PyErr_SetString(PyExc_TypeError
,
1131 "decoding Unicode is not supported");
1134 return PyObject_Unicode(obj
);
1137 if (PyUnicode_Check(obj
)) {
1138 PyErr_SetString(PyExc_TypeError
,
1139 "decoding Unicode is not supported");
1145 if (PyString_Check(obj
)) {
1146 s
= PyString_AS_STRING(obj
);
1147 len
= PyString_GET_SIZE(obj
);
1149 else if (PyByteArray_Check(obj
)) {
1150 /* Python 2.x specific */
1151 PyErr_Format(PyExc_TypeError
,
1152 "decoding bytearray is not supported");
1155 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
1156 /* Overwrite the error message with something more useful in
1157 case of a TypeError. */
1158 if (PyErr_ExceptionMatches(PyExc_TypeError
))
1159 PyErr_Format(PyExc_TypeError
,
1160 "coercing to Unicode: need string or buffer, "
1162 Py_TYPE(obj
)->tp_name
);
1166 /* Convert to Unicode */
1168 Py_INCREF(unicode_empty
);
1169 v
= (PyObject
*)unicode_empty
;
1172 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
1180 PyObject
*PyUnicode_Decode(const char *s
,
1182 const char *encoding
,
1185 PyObject
*buffer
= NULL
, *unicode
;
1187 if (encoding
== NULL
)
1188 encoding
= PyUnicode_GetDefaultEncoding();
1190 /* Shortcuts for common default encodings */
1191 if (strcmp(encoding
, "utf-8") == 0)
1192 return PyUnicode_DecodeUTF8(s
, size
, errors
);
1193 else if (strcmp(encoding
, "latin-1") == 0)
1194 return PyUnicode_DecodeLatin1(s
, size
, errors
);
1195 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1196 else if (strcmp(encoding
, "mbcs") == 0)
1197 return PyUnicode_DecodeMBCS(s
, size
, errors
);
1199 else if (strcmp(encoding
, "ascii") == 0)
1200 return PyUnicode_DecodeASCII(s
, size
, errors
);
1202 /* Decode via the codec registry */
1203 buffer
= PyBuffer_FromMemory((void *)s
, size
);
1206 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
1207 if (unicode
== NULL
)
1209 if (!PyUnicode_Check(unicode
)) {
1210 PyErr_Format(PyExc_TypeError
,
1211 "decoder did not return an unicode object (type=%.400s)",
1212 Py_TYPE(unicode
)->tp_name
);
1224 PyObject
*PyUnicode_AsDecodedObject(PyObject
*unicode
,
1225 const char *encoding
,
1230 if (!PyUnicode_Check(unicode
)) {
1231 PyErr_BadArgument();
1235 if (encoding
== NULL
)
1236 encoding
= PyUnicode_GetDefaultEncoding();
1238 /* Decode via the codec registry */
1239 v
= PyCodec_Decode(unicode
, encoding
, errors
);
1248 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
1250 const char *encoding
,
1253 PyObject
*v
, *unicode
;
1255 unicode
= PyUnicode_FromUnicode(s
, size
);
1256 if (unicode
== NULL
)
1258 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
1263 PyObject
*PyUnicode_AsEncodedObject(PyObject
*unicode
,
1264 const char *encoding
,
1269 if (!PyUnicode_Check(unicode
)) {
1270 PyErr_BadArgument();
1274 if (encoding
== NULL
)
1275 encoding
= PyUnicode_GetDefaultEncoding();
1277 /* Encode via the codec registry */
1278 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1287 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
1288 const char *encoding
,
1293 if (!PyUnicode_Check(unicode
)) {
1294 PyErr_BadArgument();
1298 if (encoding
== NULL
)
1299 encoding
= PyUnicode_GetDefaultEncoding();
1301 /* Shortcuts for common default encodings */
1302 if (errors
== NULL
) {
1303 if (strcmp(encoding
, "utf-8") == 0)
1304 return PyUnicode_AsUTF8String(unicode
);
1305 else if (strcmp(encoding
, "latin-1") == 0)
1306 return PyUnicode_AsLatin1String(unicode
);
1307 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1308 else if (strcmp(encoding
, "mbcs") == 0)
1309 return PyUnicode_AsMBCSString(unicode
);
1311 else if (strcmp(encoding
, "ascii") == 0)
1312 return PyUnicode_AsASCIIString(unicode
);
1315 /* Encode via the codec registry */
1316 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1319 if (!PyString_Check(v
)) {
1320 PyErr_Format(PyExc_TypeError
,
1321 "encoder did not return a string object (type=%.400s)",
1322 Py_TYPE(v
)->tp_name
);
1332 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
1335 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
1339 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
1340 if (v
&& errors
== NULL
)
1341 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
/* Checked accessor for the character buffer: if the argument is not a
   Unicode object (PyUnicode_Check fails) PyErr_BadArgument() is called;
   otherwise the result of PyUnicode_AS_UNICODE(unicode) is returned.
   NOTE(review): the error-path return value is not visible in this
   listing -- presumably NULL; confirm. */
1345 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
1347 if (!PyUnicode_Check(unicode
)) {
1348 PyErr_BadArgument();
1351 return PyUnicode_AS_UNICODE(unicode
);
/* Checked accessor for the length: if the argument is not a Unicode
   object (PyUnicode_Check fails) PyErr_BadArgument() is called; otherwise
   the result of PyUnicode_GET_SIZE(unicode) is returned.
   NOTE(review): the error-path return value is not visible in this
   listing -- presumably -1; confirm. */
1357 Py_ssize_t
PyUnicode_GetSize(PyObject
*unicode
)
1359 if (!PyUnicode_Check(unicode
)) {
1360 PyErr_BadArgument();
1363 return PyUnicode_GET_SIZE(unicode
);
/* Return a pointer to the file-level unicode_default_encoding buffer.
   The pointer refers to static storage, so the caller must not free it. */
1369 const char *PyUnicode_GetDefaultEncoding(void)
1371 return unicode_default_encoding
;
1374 int PyUnicode_SetDefaultEncoding(const char *encoding
)
1378 /* Make sure the encoding is valid. As a side effect, this also
1379 loads the encoding into the codec registry cache. */
1380 v
= _PyCodec_Lookup(encoding
);
1384 strncpy(unicode_default_encoding
,
1386 sizeof(unicode_default_encoding
));
1393 /* error handling callback helper:
1394 build arguments, call the callback and check the arguments,
1395 if no exception occurred, copy the replacement to the output
1396 and adjust various state variables.
1397 return 0 on success, -1 on error
1401 int unicode_decode_call_errorhandler(const char *errors
, PyObject
**errorHandler
,
1402 const char *encoding
, const char *reason
,
1403 const char *input
, Py_ssize_t insize
, Py_ssize_t
*startinpos
,
1404 Py_ssize_t
*endinpos
, PyObject
**exceptionObject
, const char **inptr
,
1405 PyUnicodeObject
**output
, Py_ssize_t
*outpos
, Py_UNICODE
**outptr
)
1407 static char *argparse
= "O!n;decoding error handler must return (unicode, int) tuple";
1409 PyObject
*restuple
= NULL
;
1410 PyObject
*repunicode
= NULL
;
1411 Py_ssize_t outsize
= PyUnicode_GET_SIZE(*output
);
1412 Py_ssize_t requiredsize
;
1418 if (*errorHandler
== NULL
) {
1419 *errorHandler
= PyCodec_LookupError(errors
);
1420 if (*errorHandler
== NULL
)
1424 if (*exceptionObject
== NULL
) {
1425 *exceptionObject
= PyUnicodeDecodeError_Create(
1426 encoding
, input
, insize
, *startinpos
, *endinpos
, reason
);
1427 if (*exceptionObject
== NULL
)
1431 if (PyUnicodeDecodeError_SetStart(*exceptionObject
, *startinpos
))
1433 if (PyUnicodeDecodeError_SetEnd(*exceptionObject
, *endinpos
))
1435 if (PyUnicodeDecodeError_SetReason(*exceptionObject
, reason
))
1439 restuple
= PyObject_CallFunctionObjArgs(*errorHandler
, *exceptionObject
, NULL
);
1440 if (restuple
== NULL
)
1442 if (!PyTuple_Check(restuple
)) {
1443 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
1446 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
, &repunicode
, &newpos
))
1449 newpos
= insize
+newpos
;
1450 if (newpos
<0 || newpos
>insize
) {
1451 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", newpos
);
1455 /* need more space? (at least enough for what we
1456 have+the replacement+the rest of the string (starting
1457 at the new input position), so we won't have to check space
1458 when there are no errors in the rest of the string) */
1459 repptr
= PyUnicode_AS_UNICODE(repunicode
);
1460 repsize
= PyUnicode_GET_SIZE(repunicode
);
1461 requiredsize
= *outpos
+ repsize
+ insize
-newpos
;
1462 if (requiredsize
> outsize
) {
1463 if (requiredsize
<2*outsize
)
1464 requiredsize
= 2*outsize
;
1465 if (_PyUnicode_Resize(output
, requiredsize
) < 0)
1467 *outptr
= PyUnicode_AS_UNICODE(*output
) + *outpos
;
1470 *inptr
= input
+ newpos
;
1471 Py_UNICODE_COPY(*outptr
, repptr
, repsize
);
1478 Py_XDECREF(restuple
);
1482 /* --- UTF-7 Codec -------------------------------------------------------- */
1484 /* See RFC2152 for details. We encode conservatively and decode liberally. */
1486 /* Three simple macros defining base-64. */
1488 /* Is c a base-64 character? */
/* Only the 64 ASCII characters of the base-64 alphabet (A-Z, a-z, 0-9,
   '+' and '/') qualify.  Explicit range checks are used instead of
   isalnum() so that the answer does not depend on the C locale and so
   that a value outside the unsigned char range is never handed to a
   <ctype.h> function (which would be undefined behaviour).
   c is evaluated more than once. */
#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')
1493 /* given that c is a base-64 character, what is its base-64 value? */
/* Mapping: 'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61,
   '+' -> 62.  Every other value (including '/') falls through to 63, so
   the result is only meaningful when c has already passed IS_BASE64().
   c is evaluated several times. */
1495 #define FROM_BASE64(c) \
1496 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
1497 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
1498 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
1499 (c) == '+' ? 62 : 63)
/* Base-64 character that encodes the low 6 bits of n (higher bits of n
 * are masked off and ignored). */
#define TO_BASE64(n) \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
      "abcdefghijklmnopqrstuvwxyz" \
      "0123456789+/")[(n) & 63])
/* DECODE_DIRECT: true when the byte c, met in UTF-7 input, is decoded
 * as itself.  Decoding is permissive: every value up to 127 is taken
 * literally except '+', which opens a base-64 section.
 * NOTE: c is evaluated twice, so it must be free of side effects. */
#define DECODE_DIRECT(c) \
    (!((c) > 127 || (c) == '+'))
1514 /* The UTF-7 encoder treats ASCII characters differently according to
1515 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1516 * the above). See RFC2152. This array identifies these different
1519 * alphanumeric and '(),-./:?
1521 * !"#$%&*;<=>@[]^_`{|}
1524 * 3 : special (must be base64 encoded)
1525 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1529 char utf7_category
[128] = {
1530 /* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
1531 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
1532 /* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
1533 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1534 /* sp ! " # $ % & ' ( ) * + , - . / */
1535 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
1536 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
1537 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
1538 /* @ A B C D E F G H I J K L M N O */
1539 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1540 /* P Q R S T U V W X Y Z [ \ ] ^ _ */
1541 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
1542 /* ` a b c d e f g h i j k l m n o */
1543 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1544 /* p q r s t u v w x y z { | } ~ del */
1545 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
/* ENCODE_DIRECT: true when character c should be written to the UTF-7
 * output as itself instead of being base-64 encoded.
 *
 *   c         the character to test; only 1..127 can ever be direct
 *             (0 and everything >= 128 always yield false)
 *   directO   non-zero if utf7_category 1 characters are emitted directly
 *   directWS  non-zero if utf7_category 2 characters are emitted directly
 *
 * utf7_category 0 characters are always direct; category 3 never is.
 * RFC2152 makes it clear that the answers for set O and whitespace vary
 * between applications, hence the two flags.
 *
 * The flag arguments are parenthesized so that compound expressions
 * (e.g. `a || b`) keep their meaning next to the `&&` in the expansion.
 * NOTE: c is evaluated several times, so it must be free of side effects. */
#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) > 0 && (c) < 128 && \
     (utf7_category[(c)] == 0 || \
      (utf7_category[(c)] == 2 && (directWS)) || \
      (utf7_category[(c)] == 1 && (directO))))
1560 PyObject
*PyUnicode_DecodeUTF7(const char *s
,
1564 return PyUnicode_DecodeUTF7Stateful(s
, size
, errors
, NULL
);
1567 /* The decoder. The only state we preserve is our read position,
1568 * i.e. how many characters we have consumed. So if we end in the
1569 * middle of a shift sequence we have to back off the read position
1570 * and the output to the beginning of the sequence, otherwise we lose
1571 * all the shift state (seen bits, number of bits seen, high
1574 PyObject
*PyUnicode_DecodeUTF7Stateful(const char *s
,
1577 Py_ssize_t
*consumed
)
1579 const char *starts
= s
;
1580 Py_ssize_t startinpos
;
1581 Py_ssize_t endinpos
;
1584 PyUnicodeObject
*unicode
;
1586 const char *errmsg
= "";
1588 Py_UNICODE
*shiftOutStart
;
1589 unsigned int base64bits
= 0;
1590 unsigned long base64buffer
= 0;
1591 Py_UNICODE surrogate
= 0;
1592 PyObject
*errorHandler
= NULL
;
1593 PyObject
*exc
= NULL
;
1595 unicode
= _PyUnicode_New(size
);
1601 return (PyObject
*)unicode
;
1609 Py_UNICODE ch
= (unsigned char) *s
;
1611 if (inShift
) { /* in a base-64 section */
1612 if (IS_BASE64(ch
)) { /* consume a base-64 character */
1613 base64buffer
= (base64buffer
<< 6) | FROM_BASE64(ch
);
1616 if (base64bits
>= 16) {
1617 /* we have enough bits for a UTF-16 value */
1618 Py_UNICODE outCh
= (Py_UNICODE
)
1619 (base64buffer
>> (base64bits
-16));
1621 base64buffer
&= (1 << base64bits
) - 1; /* clear high bits */
1623 /* expecting a second surrogate */
1624 if (outCh
>= 0xDC00 && outCh
<= 0xDFFF) {
1625 #ifdef Py_UNICODE_WIDE
1626 *p
++ = (((surrogate
& 0x3FF)<<10)
1627 | (outCh
& 0x3FF)) + 0x10000;
1636 errmsg
= "second surrogate missing";
1640 else if (outCh
>= 0xD800 && outCh
<= 0xDBFF) {
1641 /* first surrogate */
1644 else if (outCh
>= 0xDC00 && outCh
<= 0xDFFF) {
1645 errmsg
= "unexpected second surrogate";
1653 else { /* now leaving a base-64 section */
1657 errmsg
= "second surrogate missing at end of shift sequence";
1660 if (base64bits
> 0) { /* left-over bits */
1661 if (base64bits
>= 6) {
1662 /* We've seen at least one base-64 character */
1663 errmsg
= "partial character in shift sequence";
1667 /* Some bits remain; they should be zero */
1668 if (base64buffer
!= 0) {
1669 errmsg
= "non-zero padding bits in shift sequence";
1675 /* '-' is absorbed; other terminating
1676 characters are preserved */
1681 else if ( ch
== '+' ) {
1682 startinpos
= s
-starts
;
1683 s
++; /* consume '+' */
1684 if (s
< e
&& *s
== '-') { /* '+-' encodes '+' */
1688 else { /* begin base64-encoded section */
1694 else if (DECODE_DIRECT(ch
)) { /* character decodes as itself */
1699 startinpos
= s
-starts
;
1701 errmsg
= "unexpected special character";
1706 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1707 endinpos
= s
-starts
;
1708 if (unicode_decode_call_errorhandler(
1709 errors
, &errorHandler
,
1711 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1712 &unicode
, &outpos
, &p
))
1718 if (inShift
&& !consumed
) { /* in shift sequence, no more to follow */
1719 /* if we're in an inconsistent state, that's an error */
1721 (base64bits
>= 6) ||
1722 (base64bits
> 0 && base64buffer
!= 0)) {
1723 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1725 if (unicode_decode_call_errorhandler(
1726 errors
, &errorHandler
,
1727 "utf7", "unterminated shift sequence",
1728 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1729 &unicode
, &outpos
, &p
))
1737 p
= shiftOutStart
; /* back off output */
1738 *consumed
= startinpos
;
1741 *consumed
= s
-starts
;
1745 if (_PyUnicode_Resize(&unicode
, p
- PyUnicode_AS_UNICODE(unicode
)) < 0)
1748 Py_XDECREF(errorHandler
);
1750 return (PyObject
*)unicode
;
1753 Py_XDECREF(errorHandler
);
1760 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
1763 int base64WhiteSpace
,
1767 /* It might be possible to tighten this worst case */
1768 Py_ssize_t allocated
= 8 * size
;
1771 unsigned int base64bits
= 0;
1772 unsigned long base64buffer
= 0;
1776 if (allocated
/ 8 != size
)
1777 return PyErr_NoMemory();
1780 return PyString_FromStringAndSize(NULL
, 0);
1782 v
= PyString_FromStringAndSize(NULL
, allocated
);
1786 start
= out
= PyString_AS_STRING(v
);
1787 for (;i
< size
; ++i
) {
1788 Py_UNICODE ch
= s
[i
];
1791 if (ENCODE_DIRECT(ch
, !base64SetO
, !base64WhiteSpace
)) {
1793 if (base64bits
) { /* output remaining bits */
1794 *out
++ = TO_BASE64(base64buffer
<< (6-base64bits
));
1799 /* Characters not in the BASE64 set implicitly unshift the sequence
1800 so no '-' is required, except if the character is itself a '-' */
1801 if (IS_BASE64(ch
) || ch
== '-') {
1810 else { /* not in a shift sequence */
1815 else if (ENCODE_DIRECT(ch
, !base64SetO
, !base64WhiteSpace
)) {
1826 #ifdef Py_UNICODE_WIDE
1827 if (ch
>= 0x10000) {
1828 /* code first surrogate */
1830 base64buffer
= (base64buffer
<< 16) | 0xd800 | ((ch
-0x10000) >> 10);
1831 while (base64bits
>= 6) {
1832 *out
++ = TO_BASE64(base64buffer
>> (base64bits
-6));
1835 /* prepare second surrogate */
1836 ch
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
1840 base64buffer
= (base64buffer
<< 16) | ch
;
1841 while (base64bits
>= 6) {
1842 *out
++ = TO_BASE64(base64buffer
>> (base64bits
-6));
1847 *out
++= TO_BASE64(base64buffer
<< (6-base64bits
) );
1851 if (_PyString_Resize(&v
, out
- start
))
1859 #undef DECODE_DIRECT
1860 #undef ENCODE_DIRECT
1862 /* --- UTF-8 Codec -------------------------------------------------------- */
1865 char utf8_code_length
[256] = {
1866 /* Map UTF-8 encoded prefix byte to sequence length. Zero means
1867 illegal prefix. See RFC 3629 for details */
1868 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
1869 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1870 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1871 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1872 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1873 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1874 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1875 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
1876 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
1877 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1878 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1879 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
1880 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
1881 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
1882 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
1883 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */
1886 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1890 return PyUnicode_DecodeUTF8Stateful(s
, size
, errors
, NULL
);
1893 PyObject
*PyUnicode_DecodeUTF8Stateful(const char *s
,
1896 Py_ssize_t
*consumed
)
1898 const char *starts
= s
;
1901 Py_ssize_t startinpos
;
1902 Py_ssize_t endinpos
;
1905 PyUnicodeObject
*unicode
;
1907 const char *errmsg
= "";
1908 PyObject
*errorHandler
= NULL
;
1909 PyObject
*exc
= NULL
;
1911 /* Note: size will always be longer than the resulting Unicode
1913 unicode
= _PyUnicode_New(size
);
1919 return (PyObject
*)unicode
;
1922 /* Unpack UTF-8 encoded data */
1927 Py_UCS4 ch
= (unsigned char)*s
;
1930 *p
++ = (Py_UNICODE
)ch
;
1935 n
= utf8_code_length
[ch
];
1941 errmsg
= "unexpected end of data";
1942 startinpos
= s
-starts
;
1943 endinpos
= startinpos
+1;
1944 for (k
=1; (k
< size
-startinpos
) && ((s
[k
]&0xC0) == 0x80); k
++)
1953 errmsg
= "invalid start byte";
1954 startinpos
= s
-starts
;
1955 endinpos
= startinpos
+1;
1959 errmsg
= "internal error";
1960 startinpos
= s
-starts
;
1961 endinpos
= startinpos
+1;
1965 if ((s
[1] & 0xc0) != 0x80) {
1966 errmsg
= "invalid continuation byte";
1967 startinpos
= s
-starts
;
1968 endinpos
= startinpos
+ 1;
1971 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
1972 assert ((ch
> 0x007F) && (ch
<= 0x07FF));
1973 *p
++ = (Py_UNICODE
)ch
;
1977 /* XXX: surrogates shouldn't be valid UTF-8!
1978 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1979 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1980 Uncomment the 2 lines below to make them invalid,
1981 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
1982 if ((s
[1] & 0xc0) != 0x80 ||
1983 (s
[2] & 0xc0) != 0x80 ||
1984 ((unsigned char)s
[0] == 0xE0 &&
1985 (unsigned char)s
[1] < 0xA0)/* ||
1986 ((unsigned char)s[0] == 0xED &&
1987 (unsigned char)s[1] > 0x9F)*/) {
1988 errmsg
= "invalid continuation byte";
1989 startinpos
= s
-starts
;
1990 endinpos
= startinpos
+ 1;
1992 /* if s[1] first two bits are 1 and 0, then the invalid
1993 continuation byte is s[2], so increment endinpos by 1,
1994 if not, s[1] is invalid and endinpos doesn't need to
1996 if ((s
[1] & 0xC0) == 0x80)
2000 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
2001 assert ((ch
> 0x07FF) && (ch
<= 0xFFFF));
2002 *p
++ = (Py_UNICODE
)ch
;
2006 if ((s
[1] & 0xc0) != 0x80 ||
2007 (s
[2] & 0xc0) != 0x80 ||
2008 (s
[3] & 0xc0) != 0x80 ||
2009 ((unsigned char)s
[0] == 0xF0 &&
2010 (unsigned char)s
[1] < 0x90) ||
2011 ((unsigned char)s
[0] == 0xF4 &&
2012 (unsigned char)s
[1] > 0x8F)) {
2013 errmsg
= "invalid continuation byte";
2014 startinpos
= s
-starts
;
2015 endinpos
= startinpos
+ 1;
2016 if ((s
[1] & 0xC0) == 0x80) {
2018 if ((s
[2] & 0xC0) == 0x80)
2023 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
2024 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
2025 assert ((ch
> 0xFFFF) && (ch
<= 0x10ffff));
2027 #ifdef Py_UNICODE_WIDE
2028 *p
++ = (Py_UNICODE
)ch
;
2030 /* compute and append the two surrogates: */
2032 /* translate from 10000..10FFFF to 0..FFFF */
2035 /* high surrogate = top 10 bits added to D800 */
2036 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
2038 /* low surrogate = bottom 10 bits added to DC00 */
2039 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
2047 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2048 if (unicode_decode_call_errorhandler(
2049 errors
, &errorHandler
,
2051 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2052 &unicode
, &outpos
, &p
))
2056 *consumed
= s
-starts
;
2059 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2062 Py_XDECREF(errorHandler
);
2064 return (PyObject
*)unicode
;
2067 Py_XDECREF(errorHandler
);
2073 /* Allocation strategy: if the string is short, convert into a stack buffer
2074 and allocate exactly as much space as needed at the end. Else allocate the
2075 maximum possible needed (4 result bytes per Unicode character), and return
2076 the excess memory at the end.
2079 PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
2083 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2085 Py_ssize_t i
; /* index into s of next input byte */
2086 PyObject
*v
; /* result string object */
2087 char *p
; /* next free byte in output buffer */
2088 Py_ssize_t nallocated
; /* number of result bytes allocated */
2089 Py_ssize_t nneeded
; /* number of result bytes needed */
2090 char stackbuf
[MAX_SHORT_UNICHARS
* 4];
2095 if (size
<= MAX_SHORT_UNICHARS
) {
2096 /* Write into the stack buffer; nallocated can't overflow.
2097 * At the end, we'll allocate exactly as much heap space as it
2098 * turns out we need.
2100 nallocated
= Py_SAFE_DOWNCAST(sizeof(stackbuf
), size_t, int);
2101 v
= NULL
; /* will allocate after we're done */
2105 /* Overallocate on the heap, and give the excess back at the end. */
2106 nallocated
= size
* 4;
2107 if (nallocated
/ 4 != size
) /* overflow! */
2108 return PyErr_NoMemory();
2109 v
= PyString_FromStringAndSize(NULL
, nallocated
);
2112 p
= PyString_AS_STRING(v
);
2115 for (i
= 0; i
< size
;) {
2116 Py_UCS4 ch
= s
[i
++];
2122 else if (ch
< 0x0800) {
2123 /* Encode Latin-1 */
2124 *p
++ = (char)(0xc0 | (ch
>> 6));
2125 *p
++ = (char)(0x80 | (ch
& 0x3f));
2128 /* Encode UCS2 Unicode ordinals */
2130 /* Special case: check for high surrogate */
2131 if (0xD800 <= ch
&& ch
<= 0xDBFF && i
!= size
) {
2133 /* Check for low surrogate and combine the two to
2134 form a UCS4 value */
2135 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2136 ch
= ((ch
- 0xD800) << 10 | (ch2
- 0xDC00)) + 0x10000;
2140 /* Fall through: handles isolated high surrogates */
2142 *p
++ = (char)(0xe0 | (ch
>> 12));
2143 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2144 *p
++ = (char)(0x80 | (ch
& 0x3f));
2148 /* Encode UCS4 Unicode ordinals */
2149 *p
++ = (char)(0xf0 | (ch
>> 18));
2150 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
2151 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2152 *p
++ = (char)(0x80 | (ch
& 0x3f));
2157 /* This was stack allocated. */
2158 nneeded
= p
- stackbuf
;
2159 assert(nneeded
<= nallocated
);
2160 v
= PyString_FromStringAndSize(stackbuf
, nneeded
);
2163 /* Cut back to size actually needed. */
2164 nneeded
= p
- PyString_AS_STRING(v
);
2165 assert(nneeded
<= nallocated
);
2166 if (_PyString_Resize(&v
, nneeded
))
2171 #undef MAX_SHORT_UNICHARS
2174 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
2176 if (!PyUnicode_Check(unicode
)) {
2177 PyErr_BadArgument();
2180 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
2181 PyUnicode_GET_SIZE(unicode
),
2185 /* --- UTF-32 Codec ------------------------------------------------------- */
2188 PyUnicode_DecodeUTF32(const char *s
,
2193 return PyUnicode_DecodeUTF32Stateful(s
, size
, errors
, byteorder
, NULL
);
2197 PyUnicode_DecodeUTF32Stateful(const char *s
,
2201 Py_ssize_t
*consumed
)
2203 const char *starts
= s
;
2204 Py_ssize_t startinpos
;
2205 Py_ssize_t endinpos
;
2207 PyUnicodeObject
*unicode
;
2209 #ifndef Py_UNICODE_WIDE
2211 const unsigned char *qq
;
2213 const int pairs
= 0;
2215 const unsigned char *q
, *e
;
2216 int bo
= 0; /* assume native ordering by default */
2217 const char *errmsg
= "";
2218 /* Offsets from q for retrieving bytes in the right order. */
2219 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2220 int iorder
[] = {0, 1, 2, 3};
2222 int iorder
[] = {3, 2, 1, 0};
2224 PyObject
*errorHandler
= NULL
;
2225 PyObject
*exc
= NULL
;
2227 q
= (unsigned char *)s
;
2233 /* Check for BOM marks (U+FEFF) in the input and adjust current
2234 byte order setting accordingly. In native mode, the leading BOM
2235 mark is skipped, in all other modes, it is copied to the output
2236 stream as-is (giving a ZWNBSP character). */
2239 const Py_UCS4 bom
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2240 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2241 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2242 if (bom
== 0x0000FEFF) {
2246 else if (bom
== 0xFFFE0000) {
2251 if (bom
== 0x0000FEFF) {
2255 else if (bom
== 0xFFFE0000) {
2278 /* On narrow builds we split characters outside the BMP into two
2279 codepoints => count how much extra space we need. */
2280 #ifndef Py_UNICODE_WIDE
2281 for (qq
= q
; qq
< e
; qq
+= 4)
2282 if (qq
[iorder
[2]] != 0 || qq
[iorder
[3]] != 0)
2286 /* This might be one too many, because of a BOM */
2287 unicode
= _PyUnicode_New((size
+3)/4+pairs
);
2291 return (PyObject
*)unicode
;
2293 /* Unpack UTF-32 encoded data */
2298 /* remaining bytes at the end? (size should be divisible by 4) */
2302 errmsg
= "truncated data";
2303 startinpos
= ((const char *)q
)-starts
;
2304 endinpos
= ((const char *)e
)-starts
;
2306 /* The remaining input chars are ignored if the callback
2307 chooses to skip the input */
2309 ch
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2310 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2314 errmsg
= "codepoint not in range(0x110000)";
2315 startinpos
= ((const char *)q
)-starts
;
2316 endinpos
= startinpos
+4;
2319 #ifndef Py_UNICODE_WIDE
2322 *p
++ = 0xD800 | ((ch
-0x10000) >> 10);
2323 *p
++ = 0xDC00 | ((ch
-0x10000) & 0x3FF);
2331 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2332 if (unicode_decode_call_errorhandler(
2333 errors
, &errorHandler
,
2335 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
2336 &unicode
, &outpos
, &p
))
2344 *consumed
= (const char *)q
-starts
;
2347 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2350 Py_XDECREF(errorHandler
);
2352 return (PyObject
*)unicode
;
2356 Py_XDECREF(errorHandler
);
2362 PyUnicode_EncodeUTF32(const Py_UNICODE
*s
,
2369 Py_ssize_t nsize
, bytesize
;
2370 #ifndef Py_UNICODE_WIDE
2371 Py_ssize_t i
, pairs
;
2373 const int pairs
= 0;
2375 /* Offsets from p for storing byte pairs in the right order. */
2376 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2377 int iorder
[] = {0, 1, 2, 3};
2379 int iorder
[] = {3, 2, 1, 0};
2382 #define STORECHAR(CH) \
2384 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2385 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2386 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2387 p[iorder[0]] = (CH) & 0xff; \
2391 /* In narrow builds we can output surrogate pairs as one codepoint,
2392 so we need less space. */
2393 #ifndef Py_UNICODE_WIDE
2394 for (i
= pairs
= 0; i
< size
-1; i
++)
2395 if (0xD800 <= s
[i
] && s
[i
] <= 0xDBFF &&
2396 0xDC00 <= s
[i
+1] && s
[i
+1] <= 0xDFFF)
2399 nsize
= (size
- pairs
+ (byteorder
== 0));
2400 bytesize
= nsize
* 4;
2401 if (bytesize
/ 4 != nsize
)
2402 return PyErr_NoMemory();
2403 v
= PyString_FromStringAndSize(NULL
, bytesize
);
2407 p
= (unsigned char *)PyString_AS_STRING(v
);
2413 if (byteorder
== -1) {
2420 else if (byteorder
== 1) {
2428 while (size
-- > 0) {
2430 #ifndef Py_UNICODE_WIDE
2431 if (0xD800 <= ch
&& ch
<= 0xDBFF && size
> 0) {
2433 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2434 ch
= (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
2446 PyObject
*PyUnicode_AsUTF32String(PyObject
*unicode
)
2448 if (!PyUnicode_Check(unicode
)) {
2449 PyErr_BadArgument();
2452 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode
),
2453 PyUnicode_GET_SIZE(unicode
),
2458 /* --- UTF-16 Codec ------------------------------------------------------- */
2461 PyUnicode_DecodeUTF16(const char *s
,
2466 return PyUnicode_DecodeUTF16Stateful(s
, size
, errors
, byteorder
, NULL
);
2470 PyUnicode_DecodeUTF16Stateful(const char *s
,
2474 Py_ssize_t
*consumed
)
2476 const char *starts
= s
;
2477 Py_ssize_t startinpos
;
2478 Py_ssize_t endinpos
;
2480 PyUnicodeObject
*unicode
;
2482 const unsigned char *q
, *e
;
2483 int bo
= 0; /* assume native ordering by default */
2484 const char *errmsg
= "";
2485 /* Offsets from q for retrieving byte pairs in the right order. */
2486 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2487 int ihi
= 1, ilo
= 0;
2489 int ihi
= 0, ilo
= 1;
2491 PyObject
*errorHandler
= NULL
;
2492 PyObject
*exc
= NULL
;
2494 /* Note: size will always be longer than the resulting Unicode
2496 unicode
= _PyUnicode_New(size
);
2500 return (PyObject
*)unicode
;
2502 /* Unpack UTF-16 encoded data */
2504 q
= (unsigned char *)s
;
2510 /* Check for BOM marks (U+FEFF) in the input and adjust current
2511 byte order setting accordingly. In native mode, the leading BOM
2512 mark is skipped, in all other modes, it is copied to the output
2513 stream as-is (giving a ZWNBSP character). */
2516 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
2517 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2518 if (bom
== 0xFEFF) {
2522 else if (bom
== 0xFFFE) {
2527 if (bom
== 0xFEFF) {
2531 else if (bom
== 0xFFFE) {
2552 /* remaining bytes at the end? (size should be even) */
2556 errmsg
= "truncated data";
2557 startinpos
= ((const char *)q
)-starts
;
2558 endinpos
= ((const char *)e
)-starts
;
2560 /* The remaining input chars are ignored if the callback
2561 chooses to skip the input */
2563 ch
= (q
[ihi
] << 8) | q
[ilo
];
2567 if (ch
< 0xD800 || ch
> 0xDFFF) {
2572 /* UTF-16 code pair: */
2574 errmsg
= "unexpected end of data";
2575 startinpos
= (((const char *)q
)-2)-starts
;
2576 endinpos
= ((const char *)e
)-starts
;
2579 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
2580 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
2582 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2583 #ifndef Py_UNICODE_WIDE
2587 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
2592 errmsg
= "illegal UTF-16 surrogate";
2593 startinpos
= (((const char *)q
)-4)-starts
;
2594 endinpos
= startinpos
+2;
2599 errmsg
= "illegal encoding";
2600 startinpos
= (((const char *)q
)-2)-starts
;
2601 endinpos
= startinpos
+2;
2602 /* Fall through to report the error */
2605 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2606 if (unicode_decode_call_errorhandler(
2607 errors
, &errorHandler
,
2609 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
2610 &unicode
, &outpos
, &p
))
2618 *consumed
= (const char *)q
-starts
;
2621 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2624 Py_XDECREF(errorHandler
);
2626 return (PyObject
*)unicode
;
2630 Py_XDECREF(errorHandler
);
2636 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
2643 Py_ssize_t nsize
, bytesize
;
2644 #ifdef Py_UNICODE_WIDE
2645 Py_ssize_t i
, pairs
;
2647 const int pairs
= 0;
2649 /* Offsets from p for storing byte pairs in the right order. */
2650 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2651 int ihi
= 1, ilo
= 0;
2653 int ihi
= 0, ilo
= 1;
2656 #define STORECHAR(CH) \
2658 p[ihi] = ((CH) >> 8) & 0xff; \
2659 p[ilo] = (CH) & 0xff; \
2663 #ifdef Py_UNICODE_WIDE
2664 for (i
= pairs
= 0; i
< size
; i
++)
2665 if (s
[i
] >= 0x10000)
2668 /* 2 * (size + pairs + (byteorder == 0)) */
2669 if (size
> PY_SSIZE_T_MAX
||
2670 size
> PY_SSIZE_T_MAX
- pairs
- (byteorder
== 0))
2671 return PyErr_NoMemory();
2672 nsize
= size
+ pairs
+ (byteorder
== 0);
2673 bytesize
= nsize
* 2;
2674 if (bytesize
/ 2 != nsize
)
2675 return PyErr_NoMemory();
2676 v
= PyString_FromStringAndSize(NULL
, bytesize
);
2680 p
= (unsigned char *)PyString_AS_STRING(v
);
2686 if (byteorder
== -1) {
2691 else if (byteorder
== 1) {
2697 while (size
-- > 0) {
2698 Py_UNICODE ch
= *s
++;
2700 #ifdef Py_UNICODE_WIDE
2701 if (ch
>= 0x10000) {
2702 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
2703 ch
= 0xD800 | ((ch
-0x10000) >> 10);
2714 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
2716 if (!PyUnicode_Check(unicode
)) {
2717 PyErr_BadArgument();
2720 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
2721 PyUnicode_GET_SIZE(unicode
),
2726 /* --- Unicode Escape Codec ----------------------------------------------- */
2728 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
2730 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
2734 const char *starts
= s
;
2735 Py_ssize_t startinpos
;
2736 Py_ssize_t endinpos
;
2743 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
2744 PyObject
*errorHandler
= NULL
;
2745 PyObject
*exc
= NULL
;
2747 /* Escaped strings will always be longer than the resulting
2748 Unicode string, so we start with size here and then reduce the
2749 length after conversion to the true value.
2750 (but if the error callback returns a long replacement string
2751 we'll have to allocate more space) */
2752 v
= _PyUnicode_New(size
);
2756 return (PyObject
*)v
;
2758 p
= PyUnicode_AS_UNICODE(v
);
2766 /* Non-escape characters are interpreted as Unicode ordinals */
2768 *p
++ = (unsigned char) *s
++;
2772 startinpos
= s
-starts
;
2777 c
= '\0'; /* Invalid after \ */
2782 case '\\': *p
++ = '\\'; break;
2783 case '\'': *p
++ = '\''; break;
2784 case '\"': *p
++ = '\"'; break;
2785 case 'b': *p
++ = '\b'; break;
2786 case 'f': *p
++ = '\014'; break; /* FF */
2787 case 't': *p
++ = '\t'; break;
2788 case 'n': *p
++ = '\n'; break;
2789 case 'r': *p
++ = '\r'; break;
2790 case 'v': *p
++ = '\013'; break; /* VT */
2791 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
2793 /* \OOO (octal) escapes */
2794 case '0': case '1': case '2': case '3':
2795 case '4': case '5': case '6': case '7':
2797 if (s
< end
&& '0' <= *s
&& *s
<= '7') {
2798 x
= (x
<<3) + *s
++ - '0';
2799 if (s
< end
&& '0' <= *s
&& *s
<= '7')
2800 x
= (x
<<3) + *s
++ - '0';
2809 message
= "truncated \\xXX escape";
2815 message
= "truncated \\uXXXX escape";
2821 message
= "truncated \\UXXXXXXXX escape";
2824 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2827 if (unicode_decode_call_errorhandler(
2828 errors
, &errorHandler
,
2829 "unicodeescape", "end of string in escape sequence",
2830 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2835 for (i
= 0; i
< digits
; ++i
) {
2836 c
= (unsigned char) s
[i
];
2838 endinpos
= (s
+i
+1)-starts
;
2839 if (unicode_decode_call_errorhandler(
2840 errors
, &errorHandler
,
2841 "unicodeescape", message
,
2842 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2847 chr
= (chr
<<4) & ~0xF;
2848 if (c
>= '0' && c
<= '9')
2850 else if (c
>= 'a' && c
<= 'f')
2851 chr
+= 10 + c
- 'a';
2853 chr
+= 10 + c
- 'A';
2856 if (chr
== 0xffffffff && PyErr_Occurred())
2857 /* _decoding_error will have already written into the
2861 /* when we get here, chr is a 32-bit unicode character */
2863 /* UCS-2 character */
2864 *p
++ = (Py_UNICODE
) chr
;
2865 else if (chr
<= 0x10ffff) {
2866 /* UCS-4 character. Either store directly, or as
2868 #ifdef Py_UNICODE_WIDE
2872 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
2873 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
2876 endinpos
= s
-starts
;
2877 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2878 if (unicode_decode_call_errorhandler(
2879 errors
, &errorHandler
,
2880 "unicodeescape", "illegal Unicode character",
2881 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2889 message
= "malformed \\N character escape";
2890 if (ucnhash_CAPI
== NULL
) {
2891 /* load the unicode data module */
2892 ucnhash_CAPI
= (_PyUnicode_Name_CAPI
*)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME
, 1);
2893 if (ucnhash_CAPI
== NULL
)
2897 const char *start
= s
+1;
2898 /* look for the closing brace */
2899 while (*s
!= '}' && s
< end
)
2901 if (s
> start
&& s
< end
&& *s
== '}') {
2902 /* found a name. look it up in the unicode database */
2903 message
= "unknown Unicode character name";
2905 if (ucnhash_CAPI
->getcode(NULL
, start
, (int)(s
-start
-1), &chr
))
2909 endinpos
= s
-starts
;
2910 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2911 if (unicode_decode_call_errorhandler(
2912 errors
, &errorHandler
,
2913 "unicodeescape", message
,
2914 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2921 message
= "\\ at end of string";
2923 endinpos
= s
-starts
;
2924 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2925 if (unicode_decode_call_errorhandler(
2926 errors
, &errorHandler
,
2927 "unicodeescape", message
,
2928 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2934 *p
++ = (unsigned char)s
[-1];
2941 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2943 Py_XDECREF(errorHandler
);
2945 return (PyObject
*)v
;
2950 "\\N escapes not supported (can't load unicodedata module)"
2953 Py_XDECREF(errorHandler
);
2959 Py_XDECREF(errorHandler
);
2964 /* Return a Unicode-Escape string version of the Unicode object.
2966 If quotes is true, the string is enclosed in u"" or u'' quotes as
2971 Py_LOCAL_INLINE(const Py_UNICODE
*) findchar(const Py_UNICODE
*s
,
2975 /* like wcschr, but doesn't stop at NULL characters */
2977 while (size
-- > 0) {
2987 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
2994 static const char *hexdigit
= "0123456789abcdef";
2995 #ifdef Py_UNICODE_WIDE
2996 const Py_ssize_t expandsize
= 10;
2998 const Py_ssize_t expandsize
= 6;
3001 /* XXX(nnorwitz): rather than over-allocating, it would be
3002 better to choose a different scheme. Perhaps scan the
3003 first N-chars of the string and allocate based on that size.
3005 /* Initial allocation is based on the longest-possible unichr
3008 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3009 unichr, so in this case it's the longest unichr escape. In
3010 narrow (UTF-16) builds this is five chars per source unichr
3011 since there are two unichrs in the surrogate pair, so in narrow
3012 (UTF-16) builds it's not the longest unichr escape.
3014 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3015 so in the narrow (UTF-16) build case it's the longest unichr
3019 if (size
> (PY_SSIZE_T_MAX
- 2 - 1) / expandsize
)
3020 return PyErr_NoMemory();
3022 repr
= PyString_FromStringAndSize(NULL
,
3029 p
= PyString_AS_STRING(repr
);
3033 *p
++ = (findchar(s
, size
, '\'') &&
3034 !findchar(s
, size
, '"')) ? '"' : '\'';
3036 while (size
-- > 0) {
3037 Py_UNICODE ch
= *s
++;
3039 /* Escape quotes and backslashes */
3041 ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1]) || ch
== '\\') {
3047 #ifdef Py_UNICODE_WIDE
3048 /* Map 21-bit characters to '\U00xxxxxx' */
3049 else if (ch
>= 0x10000) {
3052 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
3053 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
3054 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
3055 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
3056 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
3057 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
3058 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
3059 *p
++ = hexdigit
[ch
& 0x0000000F];
3063 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3064 else if (ch
>= 0xD800 && ch
< 0xDC00) {
3070 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
3071 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
3074 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
3075 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
3076 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
3077 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
3078 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
3079 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
3080 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
3081 *p
++ = hexdigit
[ucs
& 0x0000000F];
3084 /* Fall through: isolated surrogates are copied as-is */
3090 /* Map 16-bit characters to '\uxxxx' */
3094 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
3095 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
3096 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
3097 *p
++ = hexdigit
[ch
& 0x000F];
3100 /* Map special whitespace to '\t', '\n', '\r' */
3101 else if (ch
== '\t') {
3105 else if (ch
== '\n') {
3109 else if (ch
== '\r') {
3114 /* Map non-printable US ASCII to '\xhh' */
3115 else if (ch
< ' ' || ch
>= 0x7F) {
3118 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
3119 *p
++ = hexdigit
[ch
& 0x000F];
3122 /* Copy everything else as-is */
3127 *p
++ = PyString_AS_STRING(repr
)[1];
3130 if (_PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
)))
/* Encode the Py_UNICODE buffer s with the Unicode-Escape codec.
   Thin wrapper: forwards s and size to unicodeescape_string() with 0 as
   the third argument (presumably "do not add surrounding quotes" --
   confirm against unicodeescape_string). */
3135 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
3138 return unicodeescape_string(s
, size
, 0);
/* Object-level entry point for the Unicode-Escape encoder.
   A non-unicode argument is reported through PyErr_BadArgument();
   otherwise the object's buffer and length are passed on to
   PyUnicode_EncodeUnicodeEscape(). */
3141 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
3143 if (!PyUnicode_Check(unicode
)) {
3144 PyErr_BadArgument();
3147 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3148 PyUnicode_GET_SIZE(unicode
));
3151 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3153 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
3157 const char *starts
= s
;
3158 Py_ssize_t startinpos
;
3159 Py_ssize_t endinpos
;
3165 PyObject
*errorHandler
= NULL
;
3166 PyObject
*exc
= NULL
;
3168 /* Escaped strings are always at least as long as the resulting
3169 Unicode string, so we start with size here and then reduce the
3170 length after conversion to the true value. (But decoding error
3171 handler might have to resize the string) */
3172 v
= _PyUnicode_New(size
);
3176 return (PyObject
*)v
;
3177 p
= PyUnicode_AS_UNICODE(v
);
3185 /* Non-escape characters are interpreted as Unicode ordinals */
3187 *p
++ = (unsigned char)*s
++;
3190 startinpos
= s
-starts
;
3192 /* \u-escapes are only interpreted iff the number of leading
3193 backslashes is odd */
3198 *p
++ = (unsigned char)*s
++;
3200 if (((s
- bs
) & 1) == 0 ||
3202 (*s
!= 'u' && *s
!= 'U')) {
3206 count
= *s
=='u' ? 4 : 8;
3209 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3210 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3211 for (x
= 0, i
= 0; i
< count
; ++i
, ++s
) {
3212 c
= (unsigned char)*s
;
3214 endinpos
= s
-starts
;
3215 if (unicode_decode_call_errorhandler(
3216 errors
, &errorHandler
,
3217 "rawunicodeescape", "truncated \\uXXXX",
3218 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3224 if (c
>= '0' && c
<= '9')
3226 else if (c
>= 'a' && c
<= 'f')
3232 /* UCS-2 character */
3233 *p
++ = (Py_UNICODE
) x
;
3234 else if (x
<= 0x10ffff) {
3235 /* UCS-4 character. Either store directly, or as
3237 #ifdef Py_UNICODE_WIDE
3238 *p
++ = (Py_UNICODE
) x
;
3241 *p
++ = 0xD800 + (Py_UNICODE
) (x
>> 10);
3242 *p
++ = 0xDC00 + (Py_UNICODE
) (x
& 0x03FF);
3245 endinpos
= s
-starts
;
3246 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3247 if (unicode_decode_call_errorhandler(
3248 errors
, &errorHandler
,
3249 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3250 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3257 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3259 Py_XDECREF(errorHandler
);
3261 return (PyObject
*)v
;
3265 Py_XDECREF(errorHandler
);
3270 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
3277 static const char *hexdigit
= "0123456789abcdef";
3278 #ifdef Py_UNICODE_WIDE
3279 const Py_ssize_t expandsize
= 10;
3281 const Py_ssize_t expandsize
= 6;
3284 if (size
> PY_SSIZE_T_MAX
/ expandsize
)
3285 return PyErr_NoMemory();
3287 repr
= PyString_FromStringAndSize(NULL
, expandsize
* size
);
3293 p
= q
= PyString_AS_STRING(repr
);
3294 while (size
-- > 0) {
3295 Py_UNICODE ch
= *s
++;
3296 #ifdef Py_UNICODE_WIDE
3297 /* Map 32-bit characters to '\Uxxxxxxxx' */
3298 if (ch
>= 0x10000) {
3301 *p
++ = hexdigit
[(ch
>> 28) & 0xf];
3302 *p
++ = hexdigit
[(ch
>> 24) & 0xf];
3303 *p
++ = hexdigit
[(ch
>> 20) & 0xf];
3304 *p
++ = hexdigit
[(ch
>> 16) & 0xf];
3305 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3306 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3307 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3308 *p
++ = hexdigit
[ch
& 15];
3312 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3313 if (ch
>= 0xD800 && ch
< 0xDC00) {
3319 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
3320 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
3323 *p
++ = hexdigit
[(ucs
>> 28) & 0xf];
3324 *p
++ = hexdigit
[(ucs
>> 24) & 0xf];
3325 *p
++ = hexdigit
[(ucs
>> 20) & 0xf];
3326 *p
++ = hexdigit
[(ucs
>> 16) & 0xf];
3327 *p
++ = hexdigit
[(ucs
>> 12) & 0xf];
3328 *p
++ = hexdigit
[(ucs
>> 8) & 0xf];
3329 *p
++ = hexdigit
[(ucs
>> 4) & 0xf];
3330 *p
++ = hexdigit
[ucs
& 0xf];
3333 /* Fall through: isolated surrogates are copied as-is */
3338 /* Map 16-bit characters to '\uxxxx' */
3342 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3343 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3344 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3345 *p
++ = hexdigit
[ch
& 15];
3347 /* Copy everything else as-is */
3352 if (_PyString_Resize(&repr
, p
- q
))
/* Object-level entry point for the Raw-Unicode-Escape encoder.
   A non-unicode argument is reported through PyErr_BadArgument();
   otherwise the object's buffer and length are passed on to
   PyUnicode_EncodeRawUnicodeEscape(). */
3357 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
3359 if (!PyUnicode_Check(unicode
)) {
3360 PyErr_BadArgument();
3363 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3364 PyUnicode_GET_SIZE(unicode
));
3367 /* --- Unicode Internal Codec ------------------------------------------- */
3369 PyObject
*_PyUnicode_DecodeUnicodeInternal(const char *s
,
3373 const char *starts
= s
;
3374 Py_ssize_t startinpos
;
3375 Py_ssize_t endinpos
;
3381 PyObject
*errorHandler
= NULL
;
3382 PyObject
*exc
= NULL
;
3384 #ifdef Py_UNICODE_WIDE
3385 Py_UNICODE unimax
= PyUnicode_GetMax();
3388 /* XXX overflow detection missing */
3389 v
= _PyUnicode_New((size
+Py_UNICODE_SIZE
-1)/ Py_UNICODE_SIZE
);
3392 if (PyUnicode_GetSize((PyObject
*)v
) == 0)
3393 return (PyObject
*)v
;
3394 p
= PyUnicode_AS_UNICODE(v
);
3398 memcpy(p
, s
, sizeof(Py_UNICODE
));
3399 /* We have to sanity check the raw data, otherwise doom looms for
3400 some malformed UCS-4 data. */
3402 #ifdef Py_UNICODE_WIDE
3403 *p
> unimax
|| *p
< 0 ||
3405 end
-s
< Py_UNICODE_SIZE
3408 startinpos
= s
- starts
;
3409 if (end
-s
< Py_UNICODE_SIZE
) {
3410 endinpos
= end
-starts
;
3411 reason
= "truncated input";
3414 endinpos
= s
- starts
+ Py_UNICODE_SIZE
;
3415 reason
= "illegal code point (> 0x10FFFF)";
3417 outpos
= p
- PyUnicode_AS_UNICODE(v
);
3418 if (unicode_decode_call_errorhandler(
3419 errors
, &errorHandler
,
3420 "unicode_internal", reason
,
3421 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3428 s
+= Py_UNICODE_SIZE
;
3432 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3434 Py_XDECREF(errorHandler
);
3436 return (PyObject
*)v
;
3440 Py_XDECREF(errorHandler
);
3445 /* --- Latin-1 Codec ------------------------------------------------------ */
3447 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
3454 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3456 Py_UNICODE r
= *(unsigned char*)s
;
3457 return PyUnicode_FromUnicode(&r
, 1);
3460 v
= _PyUnicode_New(size
);
3464 return (PyObject
*)v
;
3465 p
= PyUnicode_AS_UNICODE(v
);
3467 *p
++ = (unsigned char)*s
++;
3468 return (PyObject
*)v
;
3475 /* create or adjust a UnicodeEncodeError.
   If *exceptionObject is NULL a new exception is built with
   PyUnicodeEncodeError_Create(); the SetStart/SetEnd/SetReason calls
   below update start, end and reason of an exception object that is
   being reused. The trailing Py_DECREF + NULL assignment drops the
   exception again (presumably the failure path of those setters --
   the jump targets are not visible here). */
3476 static void make_encode_exception(PyObject
**exceptionObject
,
3477 const char *encoding
,
3478 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3479 Py_ssize_t startpos
, Py_ssize_t endpos
,
3482 if (*exceptionObject
== NULL
) {
3483 *exceptionObject
= PyUnicodeEncodeError_Create(
3484 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3487 if (PyUnicodeEncodeError_SetStart(*exceptionObject
, startpos
))
3489 if (PyUnicodeEncodeError_SetEnd(*exceptionObject
, endpos
))
3491 if (PyUnicodeEncodeError_SetReason(*exceptionObject
, reason
))
3495 Py_DECREF(*exceptionObject
);
3496 *exceptionObject
= NULL
;
3500 /* raises a UnicodeEncodeError: builds or refreshes the exception via
   make_encode_exception() and, if that left a non-NULL object in
   *exceptionObject, hands it to PyCodec_StrictErrors() */
3501 static void raise_encode_exception(PyObject
**exceptionObject
,
3502 const char *encoding
,
3503 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3504 Py_ssize_t startpos
, Py_ssize_t endpos
,
3507 make_encode_exception(exceptionObject
,
3508 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3509 if (*exceptionObject
!= NULL
)
3510 PyCodec_StrictErrors(*exceptionObject
);
3513 /* error handling callback helper:
3514 build arguments, call the callback and check the arguments,
3515 put the result into newpos and return the replacement string, which
3516 has to be freed by the caller */
3517 static PyObject
*unicode_encode_call_errorhandler(const char *errors
,
3518 PyObject
**errorHandler
,
3519 const char *encoding
, const char *reason
,
3520 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
3521 Py_ssize_t startpos
, Py_ssize_t endpos
,
3524 static char *argparse
= "O!n;encoding error handler must return (unicode, int) tuple";
3527 PyObject
*resunicode
;
3529 if (*errorHandler
== NULL
) {
3530 *errorHandler
= PyCodec_LookupError(errors
);
3531 if (*errorHandler
== NULL
)
3535 make_encode_exception(exceptionObject
,
3536 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3537 if (*exceptionObject
== NULL
)
3540 restuple
= PyObject_CallFunctionObjArgs(
3541 *errorHandler
, *exceptionObject
, NULL
);
3542 if (restuple
== NULL
)
3544 if (!PyTuple_Check(restuple
)) {
3545 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
3546 Py_DECREF(restuple
);
3549 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
3550 &resunicode
, newpos
)) {
3551 Py_DECREF(restuple
);
3555 *newpos
= size
+*newpos
;
3556 if (*newpos
<0 || *newpos
>size
) {
3557 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
3558 Py_DECREF(restuple
);
3561 Py_INCREF(resunicode
);
3562 Py_DECREF(restuple
);
3566 static PyObject
*unicode_encode_ucs1(const Py_UNICODE
*p
,
3573 /* pointers to the beginning and end+1 of input */
3574 const Py_UNICODE
*startp
= p
;
3575 const Py_UNICODE
*endp
= p
+ size
;
3576 /* pointer to the beginning of the unencodable characters */
3577 /* const Py_UNICODE *badp = NULL; */
3578 /* pointer into the output */
3580 /* current output position */
3581 Py_ssize_t respos
= 0;
3583 const char *encoding
= (limit
== 256) ? "latin-1" : "ascii";
3584 const char *reason
= (limit
== 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3585 PyObject
*errorHandler
= NULL
;
3586 PyObject
*exc
= NULL
;
3587 /* the following variable is used for caching string comparisons
3588 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3589 int known_errorHandler
= -1;
3591 /* allocate enough for a simple encoding without
3592 replacements, if we need more, we'll resize */
3593 res
= PyString_FromStringAndSize(NULL
, size
);
3598 str
= PyString_AS_STRING(res
);
3604 /* can we encode this? */
3606 /* no overflow check, because we know that the space is enough */
3611 Py_ssize_t unicodepos
= p
-startp
;
3612 Py_ssize_t requiredsize
;
3613 PyObject
*repunicode
;
3618 /* startpos for collecting unencodable chars */
3619 const Py_UNICODE
*collstart
= p
;
3620 const Py_UNICODE
*collend
= p
;
3621 /* find all unencodable characters */
3622 while ((collend
< endp
) && ((*collend
)>=limit
))
3624 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3625 if (known_errorHandler
==-1) {
3626 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3627 known_errorHandler
= 1;
3628 else if (!strcmp(errors
, "replace"))
3629 known_errorHandler
= 2;
3630 else if (!strcmp(errors
, "ignore"))
3631 known_errorHandler
= 3;
3632 else if (!strcmp(errors
, "xmlcharrefreplace"))
3633 known_errorHandler
= 4;
3635 known_errorHandler
= 0;
3637 switch (known_errorHandler
) {
3638 case 1: /* strict */
3639 raise_encode_exception(&exc
, encoding
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
3641 case 2: /* replace */
3642 while (collstart
++<collend
)
3643 *str
++ = '?'; /* fall through */
3644 case 3: /* ignore */
3647 case 4: /* xmlcharrefreplace */
3648 respos
= str
-PyString_AS_STRING(res
);
3649 /* determine replacement size (temporarily (mis)uses p) */
3650 for (p
= collstart
, repsize
= 0; p
< collend
; ++p
) {
3659 #ifndef Py_UNICODE_WIDE
3665 else if (*p
<1000000)
3671 requiredsize
= respos
+repsize
+(endp
-collend
);
3672 if (requiredsize
> ressize
) {
3673 if (requiredsize
<2*ressize
)
3674 requiredsize
= 2*ressize
;
3675 if (_PyString_Resize(&res
, requiredsize
))
3677 str
= PyString_AS_STRING(res
) + respos
;
3678 ressize
= requiredsize
;
3680 /* generate replacement (temporarily (mis)uses p) */
3681 for (p
= collstart
; p
< collend
; ++p
) {
3682 str
+= sprintf(str
, "&#%d;", (int)*p
);
3687 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
3688 encoding
, reason
, startp
, size
, &exc
,
3689 collstart
-startp
, collend
-startp
, &newpos
);
3690 if (repunicode
== NULL
)
3692 /* need more space? (at least enough for what we have+the
3693 replacement+the rest of the string, so we won't have to
3694 check space for encodable characters) */
3695 respos
= str
-PyString_AS_STRING(res
);
3696 repsize
= PyUnicode_GET_SIZE(repunicode
);
3697 requiredsize
= respos
+repsize
+(endp
-collend
);
3698 if (requiredsize
> ressize
) {
3699 if (requiredsize
<2*ressize
)
3700 requiredsize
= 2*ressize
;
3701 if (_PyString_Resize(&res
, requiredsize
)) {
3702 Py_DECREF(repunicode
);
3705 str
= PyString_AS_STRING(res
) + respos
;
3706 ressize
= requiredsize
;
3708 /* check if there is anything unencodable in the replacement
3709 and copy it to the output */
3710 for (uni2
= PyUnicode_AS_UNICODE(repunicode
);repsize
-->0; ++uni2
, ++str
) {
3713 raise_encode_exception(&exc
, encoding
, startp
, size
,
3714 unicodepos
, unicodepos
+1, reason
);
3715 Py_DECREF(repunicode
);
3720 p
= startp
+ newpos
;
3721 Py_DECREF(repunicode
);
3725 /* Resize if we allocated too much */
3726 respos
= str
-PyString_AS_STRING(res
);
3728 /* If this fails, res will be NULL */
3729 _PyString_Resize(&res
, respos
);
3730 Py_XDECREF(errorHandler
);
3736 Py_XDECREF(errorHandler
);
/* Latin-1 encoder: forwards p, size and errors to unicode_encode_ucs1()
   with a limit of 256, the value that function maps to the "latin-1"
   encoding name and the "ordinal not in range(256)" reason. */
3741 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
3745 return unicode_encode_ucs1(p
, size
, errors
, 256);
/* Object-level entry point for the Latin-1 encoder.
   A non-unicode argument is reported through PyErr_BadArgument();
   otherwise the object's buffer and length are passed on to
   PyUnicode_EncodeLatin1(). */
3748 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
3750 if (!PyUnicode_Check(unicode
)) {
3751 PyErr_BadArgument();
3754 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
3755 PyUnicode_GET_SIZE(unicode
),
3759 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3761 PyObject
*PyUnicode_DecodeASCII(const char *s
,
3765 const char *starts
= s
;
3768 Py_ssize_t startinpos
;
3769 Py_ssize_t endinpos
;
3772 PyObject
*errorHandler
= NULL
;
3773 PyObject
*exc
= NULL
;
3775 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3776 if (size
== 1 && *(unsigned char*)s
< 128) {
3777 Py_UNICODE r
= *(unsigned char*)s
;
3778 return PyUnicode_FromUnicode(&r
, 1);
3781 v
= _PyUnicode_New(size
);
3785 return (PyObject
*)v
;
3786 p
= PyUnicode_AS_UNICODE(v
);
3789 register unsigned char c
= (unsigned char)*s
;
3795 startinpos
= s
-starts
;
3796 endinpos
= startinpos
+ 1;
3797 outpos
= p
- (Py_UNICODE
*)PyUnicode_AS_UNICODE(v
);
3798 if (unicode_decode_call_errorhandler(
3799 errors
, &errorHandler
,
3800 "ascii", "ordinal not in range(128)",
3801 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3806 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
3807 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3809 Py_XDECREF(errorHandler
);
3811 return (PyObject
*)v
;
3815 Py_XDECREF(errorHandler
);
/* 7-bit ASCII encoder: forwards p, size and errors to
   unicode_encode_ucs1() with a limit of 128, the value that function
   maps to the "ascii" encoding name and the
   "ordinal not in range(128)" reason. */
3820 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
3824 return unicode_encode_ucs1(p
, size
, errors
, 128);
/* Object-level entry point for the ASCII encoder.
   A non-unicode argument is reported through PyErr_BadArgument();
   otherwise the object's buffer and length are passed on to
   PyUnicode_EncodeASCII(). */
3827 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
3829 if (!PyUnicode_Check(unicode
)) {
3830 PyErr_BadArgument();
3833 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
3834 PyUnicode_GET_SIZE(unicode
),
3838 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3840 /* --- MBCS codecs for Windows -------------------------------------------- */
3842 #if SIZEOF_INT < SIZEOF_SIZE_T
3846 /* XXX This code is limited to "true" double-byte encodings, as
3847 a) it assumes an incomplete character consists of a single byte, and
3848 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3849 encodings, see IsDBCSLeadByteEx documentation. */
3851 static int is_dbcs_lead_byte(const char *s
, int offset
)
3853 const char *curr
= s
+ offset
;
3855 if (IsDBCSLeadByte(*curr
)) {
3856 const char *prev
= CharPrev(s
, curr
);
3857 return (prev
== curr
) || !IsDBCSLeadByte(*prev
) || (curr
- prev
== 2);
3863 * Decode MBCS string into unicode object. If 'final' is set, converts
3864 * trailing lead-byte too. Returns consumed size on success, -1 otherwise.
3866 static int decode_mbcs(PyUnicodeObject
**v
,
3867 const char *s
, /* MBCS string */
3868 int size
, /* sizeof MBCS string */
3877 /* Skip trailing lead-byte unless 'final' is set */
3878 if (!final
&& size
>= 1 && is_dbcs_lead_byte(s
, size
- 1))
3881 /* First get the size of the result */
3883 usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
3885 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3891 /* Create unicode object */
3892 *v
= _PyUnicode_New(usize
);
3897 /* Extend unicode object */
3898 n
= PyUnicode_GET_SIZE(*v
);
3899 if (_PyUnicode_Resize(v
, n
+ usize
) < 0)
3903 /* Do the conversion */
3905 p
= PyUnicode_AS_UNICODE(*v
) + n
;
3906 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
3907 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3915 PyObject
*PyUnicode_DecodeMBCSStateful(const char *s
,
3918 Py_ssize_t
*consumed
)
3920 PyUnicodeObject
*v
= NULL
;
3929 done
= decode_mbcs(&v
, s
, INT_MAX
, 0);
3932 done
= decode_mbcs(&v
, s
, (int)size
, !consumed
);
3943 if (size
> INT_MAX
) {
3950 return (PyObject
*)v
;
/* Stateless MBCS decoder: delegates to PyUnicode_DecodeMBCSStateful()
   with a NULL `consumed` pointer. In that function a NULL `consumed`
   makes the `final` flag passed to decode_mbcs() true (`!consumed`). */
3953 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
3957 return PyUnicode_DecodeMBCSStateful(s
, size
, errors
, NULL
);
3961 * Convert unicode into string object (MBCS).
3962 * Returns 0 on success, -1 otherwise.
3964 static int encode_mbcs(PyObject
**repr
,
3965 const Py_UNICODE
*p
, /* unicode */
3966 int size
) /* size of unicode */
3973 /* First get the size of the result */
3975 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
3976 if (mbcssize
== 0) {
3977 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3982 if (*repr
== NULL
) {
3983 /* Create string object */
3984 *repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
3989 /* Extend string object */
3990 n
= PyString_Size(*repr
);
3991 if (_PyString_Resize(repr
, n
+ mbcssize
) < 0)
3995 /* Do the conversion */
3997 char *s
= PyString_AS_STRING(*repr
) + n
;
3998 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
3999 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
4007 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
4011 PyObject
*repr
= NULL
;
4017 ret
= encode_mbcs(&repr
, p
, INT_MAX
);
4020 ret
= encode_mbcs(&repr
, p
, (int)size
);
4028 if (size
> INT_MAX
) {
4038 PyObject
*PyUnicode_AsMBCSString(PyObject
*unicode
)
4040 if (!PyUnicode_Check(unicode
)) {
4041 PyErr_BadArgument();
4044 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode
),
4045 PyUnicode_GET_SIZE(unicode
),
4051 #endif /* MS_WINDOWS */
4053 /* --- Character Mapping Codec -------------------------------------------- */
4055 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
4060 const char *starts
= s
;
4061 Py_ssize_t startinpos
;
4062 Py_ssize_t endinpos
;
4067 Py_ssize_t extrachars
= 0;
4068 PyObject
*errorHandler
= NULL
;
4069 PyObject
*exc
= NULL
;
4070 Py_UNICODE
*mapstring
= NULL
;
4071 Py_ssize_t maplen
= 0;
4073 /* Default to Latin-1 */
4074 if (mapping
== NULL
)
4075 return PyUnicode_DecodeLatin1(s
, size
, errors
);
4077 v
= _PyUnicode_New(size
);
4081 return (PyObject
*)v
;
4082 p
= PyUnicode_AS_UNICODE(v
);
4084 if (PyUnicode_CheckExact(mapping
)) {
4085 mapstring
= PyUnicode_AS_UNICODE(mapping
);
4086 maplen
= PyUnicode_GET_SIZE(mapping
);
4088 unsigned char ch
= *s
;
4089 Py_UNICODE x
= 0xfffe; /* illegal value */
4095 /* undefined mapping */
4096 outpos
= p
-PyUnicode_AS_UNICODE(v
);
4097 startinpos
= s
-starts
;
4098 endinpos
= startinpos
+1;
4099 if (unicode_decode_call_errorhandler(
4100 errors
, &errorHandler
,
4101 "charmap", "character maps to <undefined>",
4102 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
4114 unsigned char ch
= *s
;
4117 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4118 w
= PyInt_FromLong((long)ch
);
4121 x
= PyObject_GetItem(mapping
, w
);
4124 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4125 /* No mapping found means: mapping is undefined. */
4134 if (PyInt_Check(x
)) {
4135 long value
= PyInt_AS_LONG(x
);
4136 if (value
< 0 || value
> 65535) {
4137 PyErr_SetString(PyExc_TypeError
,
4138 "character mapping must be in range(65536)");
4142 *p
++ = (Py_UNICODE
)value
;
4144 else if (x
== Py_None
) {
4145 /* undefined mapping */
4146 outpos
= p
-PyUnicode_AS_UNICODE(v
);
4147 startinpos
= s
-starts
;
4148 endinpos
= startinpos
+1;
4149 if (unicode_decode_call_errorhandler(
4150 errors
, &errorHandler
,
4151 "charmap", "character maps to <undefined>",
4152 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
4160 else if (PyUnicode_Check(x
)) {
4161 Py_ssize_t targetsize
= PyUnicode_GET_SIZE(x
);
4163 if (targetsize
== 1)
4165 *p
++ = *PyUnicode_AS_UNICODE(x
);
4167 else if (targetsize
> 1) {
4169 if (targetsize
> extrachars
) {
4171 Py_ssize_t oldpos
= p
- PyUnicode_AS_UNICODE(v
);
4172 Py_ssize_t needed
= (targetsize
- extrachars
) + \
4174 extrachars
+= needed
;
4175 /* XXX overflow detection missing */
4176 if (_PyUnicode_Resize(&v
,
4177 PyUnicode_GET_SIZE(v
) + needed
) < 0) {
4181 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
4184 PyUnicode_AS_UNICODE(x
),
4187 extrachars
-= targetsize
;
4189 /* 1-0 mapping: skip the character */
4192 /* wrong return value */
4193 PyErr_SetString(PyExc_TypeError
,
4194 "character mapping must return integer, None or unicode");
4202 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
4203 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
4205 Py_XDECREF(errorHandler
);
4207 return (PyObject
*)v
;
4210 Py_XDECREF(errorHandler
);
4216 /* Charmap encoding: the lookup table */
/* Three-level lookup trie used for charmap encoding.
   level1 is a fixed 32-entry table. level23 is declared with a single
   element but is really variable-length trailing storage:
   PyUnicode_BuildEncodingMap() allocates
   sizeof(struct encoding_map) + 16*count2 + 128*count3 - 1 bytes and
   places the second level at level23[0] and the third level at
   level23[16*count2]. */
4218 struct encoding_map
{
4220 unsigned char level1
[32];
4222 unsigned char level23
[1];
4226 encoding_map_size(PyObject
*obj
, PyObject
* args
)
4228 struct encoding_map
*map
= (struct encoding_map
*)obj
;
4229 return PyInt_FromLong(sizeof(*map
) - 1 + 16*map
->count2
+
4233 static PyMethodDef encoding_map_methods
[] = {
4234 {"size", encoding_map_size
, METH_NOARGS
,
4235 PyDoc_STR("Return the size (in bytes) of this object") },
4240 encoding_map_dealloc(PyObject
* o
)
4245 static PyTypeObject EncodingMapType
= {
4246 PyVarObject_HEAD_INIT(NULL
, 0)
4247 "EncodingMap", /*tp_name*/
4248 sizeof(struct encoding_map
), /*tp_basicsize*/
4251 encoding_map_dealloc
, /*tp_dealloc*/
4258 0, /*tp_as_sequence*/
4259 0, /*tp_as_mapping*/
4266 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
4270 0, /*tp_richcompare*/
4271 0, /*tp_weaklistoffset*/
4274 encoding_map_methods
, /*tp_methods*/
4281 0, /*tp_dictoffset*/
4290 PyUnicode_BuildEncodingMap(PyObject
* string
)
4294 struct encoding_map
*mresult
;
4297 unsigned char level1
[32];
4298 unsigned char level2
[512];
4299 unsigned char *mlevel1
, *mlevel2
, *mlevel3
;
4300 int count2
= 0, count3
= 0;
4302 if (!PyUnicode_Check(string
) || PyUnicode_GetSize(string
) != 256) {
4303 PyErr_BadArgument();
4306 decode
= PyUnicode_AS_UNICODE(string
);
4307 memset(level1
, 0xFF, sizeof level1
);
4308 memset(level2
, 0xFF, sizeof level2
);
4310 /* If there isn't a one-to-one mapping of NULL to \0,
4311 or if there are non-BMP characters, we need to use
4312 a mapping dictionary. */
4315 for (i
= 1; i
< 256; i
++) {
4318 #ifdef Py_UNICODE_WIDE
4319 || decode
[i
] > 0xFFFF
4325 if (decode
[i
] == 0xFFFE)
4326 /* unmapped character */
4328 l1
= decode
[i
] >> 11;
4329 l2
= decode
[i
] >> 7;
4330 if (level1
[l1
] == 0xFF)
4331 level1
[l1
] = count2
++;
4332 if (level2
[l2
] == 0xFF)
4333 level2
[l2
] = count3
++;
4336 if (count2
>= 0xFF || count3
>= 0xFF)
4340 PyObject
*result
= PyDict_New();
4341 PyObject
*key
, *value
;
4344 for (i
= 0; i
< 256; i
++) {
4346 key
= PyInt_FromLong(decode
[i
]);
4347 value
= PyInt_FromLong(i
);
4350 if (PyDict_SetItem(result
, key
, value
) == -1)
4363 /* Create a three-level trie */
4364 result
= PyObject_MALLOC(sizeof(struct encoding_map
) +
4365 16*count2
+ 128*count3
- 1);
4367 return PyErr_NoMemory();
4368 PyObject_Init(result
, &EncodingMapType
);
4369 mresult
= (struct encoding_map
*)result
;
4370 mresult
->count2
= count2
;
4371 mresult
->count3
= count3
;
4372 mlevel1
= mresult
->level1
;
4373 mlevel2
= mresult
->level23
;
4374 mlevel3
= mresult
->level23
+ 16*count2
;
4375 memcpy(mlevel1
, level1
, 32);
4376 memset(mlevel2
, 0xFF, 16*count2
);
4377 memset(mlevel3
, 0, 128*count3
);
4379 for (i
= 1; i
< 256; i
++) {
4380 int o1
, o2
, o3
, i2
, i3
;
4381 if (decode
[i
] == 0xFFFE)
4382 /* unmapped character */
4385 o2
= (decode
[i
]>>7) & 0xF;
4386 i2
= 16*mlevel1
[o1
] + o2
;
4387 if (mlevel2
[i2
] == 0xFF)
4388 mlevel2
[i2
] = count3
++;
4389 o3
= decode
[i
] & 0x7F;
4390 i3
= 128*mlevel2
[i2
] + o3
;
4397 encoding_map_lookup(Py_UNICODE c
, PyObject
*mapping
)
4399 struct encoding_map
*map
= (struct encoding_map
*)mapping
;
4401 int l2
= (c
>>7) & 0xF;
4405 #ifdef Py_UNICODE_WIDE
4413 i
= map
->level1
[l1
];
4418 i
= map
->level23
[16*i
+l2
];
4423 i
= map
->level23
[16*map
->count2
+ 128*i
+ l3
];
4430 /* Lookup the character c in the mapping. If the character
4431 can't be found, Py_None is returned (or NULL, if another
4433 static PyObject
*charmapencode_lookup(Py_UNICODE c
, PyObject
*mapping
)
4435 PyObject
*w
= PyInt_FromLong((long)c
);
4440 x
= PyObject_GetItem(mapping
, w
);
4443 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4444 /* No mapping found means: mapping is undefined. */
4452 else if (x
== Py_None
)
4454 else if (PyInt_Check(x
)) {
4455 long value
= PyInt_AS_LONG(x
);
4456 if (value
< 0 || value
> 255) {
4457 PyErr_SetString(PyExc_TypeError
,
4458 "character mapping must be in range(256)");
4464 else if (PyString_Check(x
))
4467 /* wrong return value */
4468 PyErr_SetString(PyExc_TypeError
,
4469 "character mapping must return integer, None or str");
/* Grow the output string *outobj so that it can hold requiredsize
   bytes. The size actually requested is never less than twice the
   current size. Callers in this file treat a zero return value as
   failure (they return enc_EXCEPTION when !charmapencode_resize(...)). */
4476 charmapencode_resize(PyObject
**outobj
, Py_ssize_t
*outpos
, Py_ssize_t requiredsize
)
4478 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4479 /* exponentially overallocate to minimize reallocations */
4480 if (requiredsize
< 2*outsize
)
4481 requiredsize
= 2*outsize
;
4482 if (_PyString_Resize(outobj
, requiredsize
)) {
/* Outcome of charmapencode_output():
   enc_SUCCESS   - the character was encoded into the output string
   enc_FAILED    - the character is unencodable (callers then invoke
                   the error handling path)
   enc_EXCEPTION - an error occurred, e.g. resizing the output failed */
4488 typedef enum charmapencode_result
{
4489 enc_SUCCESS
, enc_FAILED
, enc_EXCEPTION
4490 }charmapencode_result
;
4491 /* lookup the character, put the result in the output string and adjust
4492 various state variables. Reallocate the output string if not enough
4493 space is available. Return enc_SUCCESS if the character was written,
4494 enc_FAILED if the mapping was undefined (in which case no character
4495 was written) or enc_EXCEPTION if an error occurred (e.g. the
4496 reallocation failed). */
4498 charmapencode_result
charmapencode_output(Py_UNICODE c
, PyObject
*mapping
,
4499 PyObject
**outobj
, Py_ssize_t
*outpos
)
4503 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4505 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4506 int res
= encoding_map_lookup(c
, mapping
);
4507 Py_ssize_t requiredsize
= *outpos
+1;
4510 if (outsize
<requiredsize
)
4511 if (!charmapencode_resize(outobj
, outpos
, requiredsize
))
4512 return enc_EXCEPTION
;
4513 outstart
= PyString_AS_STRING(*outobj
);
4514 outstart
[(*outpos
)++] = (char)res
;
4518 rep
= charmapencode_lookup(c
, mapping
);
4520 return enc_EXCEPTION
;
4521 else if (rep
==Py_None
) {
4525 if (PyInt_Check(rep
)) {
4526 Py_ssize_t requiredsize
= *outpos
+1;
4527 if (outsize
<requiredsize
)
4528 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4530 return enc_EXCEPTION
;
4532 outstart
= PyString_AS_STRING(*outobj
);
4533 outstart
[(*outpos
)++] = (char)PyInt_AS_LONG(rep
);
4536 const char *repchars
= PyString_AS_STRING(rep
);
4537 Py_ssize_t repsize
= PyString_GET_SIZE(rep
);
4538 Py_ssize_t requiredsize
= *outpos
+repsize
;
4539 if (outsize
<requiredsize
)
4540 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4542 return enc_EXCEPTION
;
4544 outstart
= PyString_AS_STRING(*outobj
);
4545 memcpy(outstart
+ *outpos
, repchars
, repsize
);
4553 /* handle an error in PyUnicode_EncodeCharmap
4554 Return 0 on success, -1 on error */
4556 int charmap_encoding_error(
4557 const Py_UNICODE
*p
, Py_ssize_t size
, Py_ssize_t
*inpos
, PyObject
*mapping
,
4558 PyObject
**exceptionObject
,
4559 int *known_errorHandler
, PyObject
**errorHandler
, const char *errors
,
4560 PyObject
**res
, Py_ssize_t
*respos
)
4562 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
4566 /* startpos for collecting unencodable chars */
4567 Py_ssize_t collstartpos
= *inpos
;
4568 Py_ssize_t collendpos
= *inpos
+1;
4570 char *encoding
= "charmap";
4571 char *reason
= "character maps to <undefined>";
4572 charmapencode_result x
;
4574 /* find all unencodable characters */
4575 while (collendpos
< size
) {
4577 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4578 int res
= encoding_map_lookup(p
[collendpos
], mapping
);
4585 rep
= charmapencode_lookup(p
[collendpos
], mapping
);
4588 else if (rep
!=Py_None
) {
4595 /* cache callback name lookup
4596 * (if not done yet, i.e. it's the first error) */
4597 if (*known_errorHandler
==-1) {
4598 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4599 *known_errorHandler
= 1;
4600 else if (!strcmp(errors
, "replace"))
4601 *known_errorHandler
= 2;
4602 else if (!strcmp(errors
, "ignore"))
4603 *known_errorHandler
= 3;
4604 else if (!strcmp(errors
, "xmlcharrefreplace"))
4605 *known_errorHandler
= 4;
4607 *known_errorHandler
= 0;
4609 switch (*known_errorHandler
) {
4610 case 1: /* strict */
4611 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4613 case 2: /* replace */
4614 for (collpos
= collstartpos
; collpos
<collendpos
; ++collpos
) {
4615 x
= charmapencode_output('?', mapping
, res
, respos
);
4616 if (x
==enc_EXCEPTION
) {
4619 else if (x
==enc_FAILED
) {
4620 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4625 case 3: /* ignore */
4626 *inpos
= collendpos
;
4628 case 4: /* xmlcharrefreplace */
4629 /* generate replacement */
4630 for (collpos
= collstartpos
; collpos
< collendpos
; ++collpos
) {
4631 char buffer
[2+29+1+1];
4633 sprintf(buffer
, "&#%d;", (int)p
[collpos
]);
4634 for (cp
= buffer
; *cp
; ++cp
) {
4635 x
= charmapencode_output(*cp
, mapping
, res
, respos
);
4636 if (x
==enc_EXCEPTION
)
4638 else if (x
==enc_FAILED
) {
4639 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4644 *inpos
= collendpos
;
4647 repunicode
= unicode_encode_call_errorhandler(errors
, errorHandler
,
4648 encoding
, reason
, p
, size
, exceptionObject
,
4649 collstartpos
, collendpos
, &newpos
);
4650 if (repunicode
== NULL
)
4652 /* generate replacement */
4653 repsize
= PyUnicode_GET_SIZE(repunicode
);
4654 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
4655 x
= charmapencode_output(*uni2
, mapping
, res
, respos
);
4656 if (x
==enc_EXCEPTION
) {
4659 else if (x
==enc_FAILED
) {
4660 Py_DECREF(repunicode
);
4661 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4666 Py_DECREF(repunicode
);
4671 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
4677 PyObject
*res
= NULL
;
4678 /* current input position */
4679 Py_ssize_t inpos
= 0;
4680 /* current output position */
4681 Py_ssize_t respos
= 0;
4682 PyObject
*errorHandler
= NULL
;
4683 PyObject
*exc
= NULL
;
4684 /* the following variable is used for caching string comparisons
4685 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4686 * 3=ignore, 4=xmlcharrefreplace */
4687 int known_errorHandler
= -1;
4689 /* Default to Latin-1 */
4690 if (mapping
== NULL
)
4691 return PyUnicode_EncodeLatin1(p
, size
, errors
);
4693 /* allocate enough for a simple encoding without
4694 replacements, if we need more, we'll resize */
4695 res
= PyString_FromStringAndSize(NULL
, size
);
4701 while (inpos
<size
) {
4702 /* try to encode it */
4703 charmapencode_result x
= charmapencode_output(p
[inpos
], mapping
, &res
, &respos
);
4704 if (x
==enc_EXCEPTION
) /* error */
4706 if (x
==enc_FAILED
) { /* unencodable character */
4707 if (charmap_encoding_error(p
, size
, &inpos
, mapping
,
4709 &known_errorHandler
, &errorHandler
, errors
,
4715 /* done with this character => adjust input position */
4719 /* Resize if we allocated to much */
4720 if (respos
<PyString_GET_SIZE(res
)) {
4721 if (_PyString_Resize(&res
, respos
))
4725 Py_XDECREF(errorHandler
);
4731 Py_XDECREF(errorHandler
);
4735 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
4738 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
4739 PyErr_BadArgument();
4742 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
4743 PyUnicode_GET_SIZE(unicode
),
4748 /* create or adjust a UnicodeTranslateError */
4749 static void make_translate_exception(PyObject
**exceptionObject
,
4750 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4751 Py_ssize_t startpos
, Py_ssize_t endpos
,
4754 if (*exceptionObject
== NULL
) {
4755 *exceptionObject
= PyUnicodeTranslateError_Create(
4756 unicode
, size
, startpos
, endpos
, reason
);
4759 if (PyUnicodeTranslateError_SetStart(*exceptionObject
, startpos
))
4761 if (PyUnicodeTranslateError_SetEnd(*exceptionObject
, endpos
))
4763 if (PyUnicodeTranslateError_SetReason(*exceptionObject
, reason
))
4767 Py_DECREF(*exceptionObject
);
4768 *exceptionObject
= NULL
;
4772 /* raises a UnicodeTranslateError */
4773 static void raise_translate_exception(PyObject
**exceptionObject
,
4774 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4775 Py_ssize_t startpos
, Py_ssize_t endpos
,
4778 make_translate_exception(exceptionObject
,
4779 unicode
, size
, startpos
, endpos
, reason
);
4780 if (*exceptionObject
!= NULL
)
4781 PyCodec_StrictErrors(*exceptionObject
);
4784 /* error handling callback helper:
4785 build arguments, call the callback and check the arguments,
4786 put the result into newpos and return the replacement string, which
4787 has to be freed by the caller */
4788 static PyObject
*unicode_translate_call_errorhandler(const char *errors
,
4789 PyObject
**errorHandler
,
4791 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
4792 Py_ssize_t startpos
, Py_ssize_t endpos
,
4795 static char *argparse
= "O!n;translating error handler must return (unicode, int) tuple";
4797 Py_ssize_t i_newpos
;
4799 PyObject
*resunicode
;
4801 if (*errorHandler
== NULL
) {
4802 *errorHandler
= PyCodec_LookupError(errors
);
4803 if (*errorHandler
== NULL
)
4807 make_translate_exception(exceptionObject
,
4808 unicode
, size
, startpos
, endpos
, reason
);
4809 if (*exceptionObject
== NULL
)
4812 restuple
= PyObject_CallFunctionObjArgs(
4813 *errorHandler
, *exceptionObject
, NULL
);
4814 if (restuple
== NULL
)
4816 if (!PyTuple_Check(restuple
)) {
4817 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
4818 Py_DECREF(restuple
);
4821 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
4822 &resunicode
, &i_newpos
)) {
4823 Py_DECREF(restuple
);
4827 *newpos
= size
+i_newpos
;
4830 if (*newpos
<0 || *newpos
>size
) {
4831 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
4832 Py_DECREF(restuple
);
4835 Py_INCREF(resunicode
);
4836 Py_DECREF(restuple
);
4840 /* Lookup the character ch in the mapping and put the result in result,
4841 which must be decrefed by the caller.
4842 Return 0 on success, -1 on error */
4844 int charmaptranslate_lookup(Py_UNICODE c
, PyObject
*mapping
, PyObject
**result
)
4846 PyObject
*w
= PyInt_FromLong((long)c
);
4851 x
= PyObject_GetItem(mapping
, w
);
4854 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4855 /* No mapping found means: use 1:1 mapping. */
4862 else if (x
== Py_None
) {
4866 else if (PyInt_Check(x
)) {
4867 long value
= PyInt_AS_LONG(x
);
4868 long max
= PyUnicode_GetMax();
4869 if (value
< 0 || value
> max
) {
4870 PyErr_Format(PyExc_TypeError
,
4871 "character mapping must be in range(0x%lx)", max
+1);
4878 else if (PyUnicode_Check(x
)) {
4883 /* wrong return value */
4884 PyErr_SetString(PyExc_TypeError
,
4885 "character mapping must return integer, None or unicode");
4890 /* ensure that *outobj is at least requiredsize characters long,
4891 if not reallocate and adjust various state variables.
4892 Return 0 on success, -1 on error */
4894 int charmaptranslate_makespace(PyObject
**outobj
, Py_UNICODE
**outp
,
4895 Py_ssize_t requiredsize
)
4897 Py_ssize_t oldsize
= PyUnicode_GET_SIZE(*outobj
);
4898 if (requiredsize
> oldsize
) {
4899 /* remember old output position */
4900 Py_ssize_t outpos
= *outp
-PyUnicode_AS_UNICODE(*outobj
);
4901 /* exponentially overallocate to minimize reallocations */
4902 if (requiredsize
< 2 * oldsize
)
4903 requiredsize
= 2 * oldsize
;
4904 if (PyUnicode_Resize(outobj
, requiredsize
) < 0)
4906 *outp
= PyUnicode_AS_UNICODE(*outobj
) + outpos
;
4910 /* lookup the character, put the result in the output string and adjust
4911 various state variables. Return a new reference to the object that
4912 was put in the output buffer in *result, or Py_None, if the mapping was
4913 undefined (in which case no character was written).
4914 The called must decref result.
4915 Return 0 on success, -1 on error. */
4917 int charmaptranslate_output(const Py_UNICODE
*startinp
, const Py_UNICODE
*curinp
,
4918 Py_ssize_t insize
, PyObject
*mapping
, PyObject
**outobj
, Py_UNICODE
**outp
,
4921 if (charmaptranslate_lookup(*curinp
, mapping
, res
))
4924 /* not found => default to 1:1 mapping */
4925 *(*outp
)++ = *curinp
;
4927 else if (*res
==Py_None
)
4929 else if (PyInt_Check(*res
)) {
4930 /* no overflow check, because we know that the space is enough */
4931 *(*outp
)++ = (Py_UNICODE
)PyInt_AS_LONG(*res
);
4933 else if (PyUnicode_Check(*res
)) {
4934 Py_ssize_t repsize
= PyUnicode_GET_SIZE(*res
);
4936 /* no overflow check, because we know that the space is enough */
4937 *(*outp
)++ = *PyUnicode_AS_UNICODE(*res
);
4939 else if (repsize
!=0) {
4940 /* more than one character */
4941 Py_ssize_t requiredsize
= (*outp
-PyUnicode_AS_UNICODE(*outobj
)) +
4942 (insize
- (curinp
-startinp
)) +
4944 if (charmaptranslate_makespace(outobj
, outp
, requiredsize
))
4946 memcpy(*outp
, PyUnicode_AS_UNICODE(*res
), sizeof(Py_UNICODE
)*repsize
);
4955 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*p
,
4961 PyObject
*res
= NULL
;
4962 /* pointers to the beginning and end+1 of input */
4963 const Py_UNICODE
*startp
= p
;
4964 const Py_UNICODE
*endp
= p
+ size
;
4965 /* pointer into the output */
4967 /* current output position */
4968 Py_ssize_t respos
= 0;
4969 char *reason
= "character maps to <undefined>";
4970 PyObject
*errorHandler
= NULL
;
4971 PyObject
*exc
= NULL
;
4972 /* the following variable is used for caching string comparisons
4973 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4974 * 3=ignore, 4=xmlcharrefreplace */
4975 int known_errorHandler
= -1;
4977 if (mapping
== NULL
) {
4978 PyErr_BadArgument();
4982 /* allocate enough for a simple 1:1 translation without
4983 replacements, if we need more, we'll resize */
4984 res
= PyUnicode_FromUnicode(NULL
, size
);
4989 str
= PyUnicode_AS_UNICODE(res
);
4992 /* try to encode it */
4994 if (charmaptranslate_output(startp
, p
, size
, mapping
, &res
, &str
, &x
)) {
4999 if (x
!=Py_None
) /* it worked => adjust input pointer */
5001 else { /* untranslatable character */
5002 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
5006 /* startpos for collecting untranslatable chars */
5007 const Py_UNICODE
*collstart
= p
;
5008 const Py_UNICODE
*collend
= p
+1;
5009 const Py_UNICODE
*coll
;
5011 /* find all untranslatable characters */
5012 while (collend
< endp
) {
5013 if (charmaptranslate_lookup(*collend
, mapping
, &x
))
5020 /* cache callback name lookup
5021 * (if not done yet, i.e. it's the first error) */
5022 if (known_errorHandler
==-1) {
5023 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
5024 known_errorHandler
= 1;
5025 else if (!strcmp(errors
, "replace"))
5026 known_errorHandler
= 2;
5027 else if (!strcmp(errors
, "ignore"))
5028 known_errorHandler
= 3;
5029 else if (!strcmp(errors
, "xmlcharrefreplace"))
5030 known_errorHandler
= 4;
5032 known_errorHandler
= 0;
5034 switch (known_errorHandler
) {
5035 case 1: /* strict */
5036 raise_translate_exception(&exc
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
5038 case 2: /* replace */
5039 /* No need to check for space, this is a 1:1 replacement */
5040 for (coll
= collstart
; coll
<collend
; ++coll
)
5043 case 3: /* ignore */
5046 case 4: /* xmlcharrefreplace */
5047 /* generate replacement (temporarily (mis)uses p) */
5048 for (p
= collstart
; p
< collend
; ++p
) {
5049 char buffer
[2+29+1+1];
5051 sprintf(buffer
, "&#%d;", (int)*p
);
5052 if (charmaptranslate_makespace(&res
, &str
,
5053 (str
-PyUnicode_AS_UNICODE(res
))+strlen(buffer
)+(endp
-collend
)))
5055 for (cp
= buffer
; *cp
; ++cp
)
5061 repunicode
= unicode_translate_call_errorhandler(errors
, &errorHandler
,
5062 reason
, startp
, size
, &exc
,
5063 collstart
-startp
, collend
-startp
, &newpos
);
5064 if (repunicode
== NULL
)
5066 /* generate replacement */
5067 repsize
= PyUnicode_GET_SIZE(repunicode
);
5068 if (charmaptranslate_makespace(&res
, &str
,
5069 (str
-PyUnicode_AS_UNICODE(res
))+repsize
+(endp
-collend
))) {
5070 Py_DECREF(repunicode
);
5073 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
)
5075 p
= startp
+ newpos
;
5076 Py_DECREF(repunicode
);
5080 /* Resize if we allocated to much */
5081 respos
= str
-PyUnicode_AS_UNICODE(res
);
5082 if (respos
<PyUnicode_GET_SIZE(res
)) {
5083 if (PyUnicode_Resize(&res
, respos
) < 0)
5087 Py_XDECREF(errorHandler
);
5093 Py_XDECREF(errorHandler
);
5097 PyObject
*PyUnicode_Translate(PyObject
*str
,
5103 str
= PyUnicode_FromObject(str
);
5106 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
5107 PyUnicode_GET_SIZE(str
),
5118 /* --- Decimal Encoder ---------------------------------------------------- */
5120 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
5125 Py_UNICODE
*p
, *end
;
5126 PyObject
*errorHandler
= NULL
;
5127 PyObject
*exc
= NULL
;
5128 const char *encoding
= "decimal";
5129 const char *reason
= "invalid decimal Unicode string";
5130 /* the following variable is used for caching string comparisons
5131 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5132 int known_errorHandler
= -1;
5134 if (output
== NULL
) {
5135 PyErr_BadArgument();
5142 register Py_UNICODE ch
= *p
;
5144 PyObject
*repunicode
;
5148 Py_UNICODE
*collstart
;
5149 Py_UNICODE
*collend
;
5151 if (Py_UNICODE_ISSPACE(ch
)) {
5156 decimal
= Py_UNICODE_TODECIMAL(ch
);
5158 *output
++ = '0' + decimal
;
5162 if (0 < ch
&& ch
< 256) {
5163 *output
++ = (char)ch
;
5167 /* All other characters are considered unencodable */
5170 while (collend
< end
) {
5171 if ((0 < *collend
&& *collend
< 256) ||
5172 !Py_UNICODE_ISSPACE(*collend
) ||
5173 Py_UNICODE_TODECIMAL(*collend
))
5176 /* cache callback name lookup
5177 * (if not done yet, i.e. it's the first error) */
5178 if (known_errorHandler
==-1) {
5179 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
5180 known_errorHandler
= 1;
5181 else if (!strcmp(errors
, "replace"))
5182 known_errorHandler
= 2;
5183 else if (!strcmp(errors
, "ignore"))
5184 known_errorHandler
= 3;
5185 else if (!strcmp(errors
, "xmlcharrefreplace"))
5186 known_errorHandler
= 4;
5188 known_errorHandler
= 0;
5190 switch (known_errorHandler
) {
5191 case 1: /* strict */
5192 raise_encode_exception(&exc
, encoding
, s
, length
, collstart
-s
, collend
-s
, reason
);
5194 case 2: /* replace */
5195 for (p
= collstart
; p
< collend
; ++p
)
5198 case 3: /* ignore */
5201 case 4: /* xmlcharrefreplace */
5202 /* generate replacement (temporarily (mis)uses p) */
5203 for (p
= collstart
; p
< collend
; ++p
)
5204 output
+= sprintf(output
, "&#%d;", (int)*p
);
5208 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
5209 encoding
, reason
, s
, length
, &exc
,
5210 collstart
-s
, collend
-s
, &newpos
);
5211 if (repunicode
== NULL
)
5213 /* generate replacement */
5214 repsize
= PyUnicode_GET_SIZE(repunicode
);
5215 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
5216 Py_UNICODE ch
= *uni2
;
5217 if (Py_UNICODE_ISSPACE(ch
))
5220 decimal
= Py_UNICODE_TODECIMAL(ch
);
5222 *output
++ = '0' + decimal
;
5223 else if (0 < ch
&& ch
< 256)
5224 *output
++ = (char)ch
;
5226 Py_DECREF(repunicode
);
5227 raise_encode_exception(&exc
, encoding
,
5228 s
, length
, collstart
-s
, collend
-s
, reason
);
5234 Py_DECREF(repunicode
);
5237 /* 0-terminate the output string */
5240 Py_XDECREF(errorHandler
);
5245 Py_XDECREF(errorHandler
);
5249 /* --- Helpers ------------------------------------------------------------ */
5251 #include "stringlib/unicodedefs.h"
5252 #include "stringlib/fastsearch.h"
5254 #include "stringlib/count.h"
5255 #include "stringlib/find.h"
5256 #include "stringlib/partition.h"
5257 #include "stringlib/split.h"
/* Helper macro to fix up start/end slice values in place against a sequence
   of length len: end is clipped to len, negative values count from the end
   and are clamped at 0.  start is NOT clipped to len, so end - start may be
   negative afterwards.  Expands to bare if-statements: use it only as a
   statement of its own, and only with side-effect-free lvalues. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
5274 Py_ssize_t
PyUnicode_Count(PyObject
*str
,
5280 PyUnicodeObject
* str_obj
;
5281 PyUnicodeObject
* sub_obj
;
5283 str_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(str
);
5286 sub_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(substr
);
5292 ADJUST_INDICES(start
, end
, str_obj
->length
);
5293 result
= stringlib_count(
5294 str_obj
->str
+ start
, end
- start
, sub_obj
->str
, sub_obj
->length
,
5304 Py_ssize_t
PyUnicode_Find(PyObject
*str
,
5312 str
= PyUnicode_FromObject(str
);
5315 sub
= PyUnicode_FromObject(sub
);
5322 result
= stringlib_find_slice(
5323 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5324 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5328 result
= stringlib_rfind_slice(
5329 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5330 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5341 int tailmatch(PyUnicodeObject
*self
,
5342 PyUnicodeObject
*substring
,
5347 if (substring
->length
== 0)
5350 ADJUST_INDICES(start
, end
, self
->length
);
5351 end
-= substring
->length
;
5355 if (direction
> 0) {
5356 if (Py_UNICODE_MATCH(self
, end
, substring
))
5359 if (Py_UNICODE_MATCH(self
, start
, substring
))
5366 Py_ssize_t
PyUnicode_Tailmatch(PyObject
*str
,
5374 str
= PyUnicode_FromObject(str
);
5377 substr
= PyUnicode_FromObject(substr
);
5378 if (substr
== NULL
) {
5383 result
= tailmatch((PyUnicodeObject
*)str
,
5384 (PyUnicodeObject
*)substr
,
5385 start
, end
, direction
);
5391 /* Apply fixfct filter to the Unicode object self and return a
5392 reference to the modified object */
5395 PyObject
*fixup(PyUnicodeObject
*self
,
5396 int (*fixfct
)(PyUnicodeObject
*s
))
5401 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5405 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5407 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
5408 /* fixfct should return TRUE if it modified the buffer. If
5409 FALSE, return a reference to the original buffer instead
5410 (to save space, not time) */
5413 return (PyObject
*) self
;
5415 return (PyObject
*) u
;
5419 int fixupper(PyUnicodeObject
*self
)
5421 Py_ssize_t len
= self
->length
;
5422 Py_UNICODE
*s
= self
->str
;
5426 register Py_UNICODE ch
;
5428 ch
= Py_UNICODE_TOUPPER(*s
);
5440 int fixlower(PyUnicodeObject
*self
)
5442 Py_ssize_t len
= self
->length
;
5443 Py_UNICODE
*s
= self
->str
;
5447 register Py_UNICODE ch
;
5449 ch
= Py_UNICODE_TOLOWER(*s
);
5461 int fixswapcase(PyUnicodeObject
*self
)
5463 Py_ssize_t len
= self
->length
;
5464 Py_UNICODE
*s
= self
->str
;
5468 if (Py_UNICODE_ISUPPER(*s
)) {
5469 *s
= Py_UNICODE_TOLOWER(*s
);
5471 } else if (Py_UNICODE_ISLOWER(*s
)) {
5472 *s
= Py_UNICODE_TOUPPER(*s
);
5482 int fixcapitalize(PyUnicodeObject
*self
)
5484 Py_ssize_t len
= self
->length
;
5485 Py_UNICODE
*s
= self
->str
;
5490 if (Py_UNICODE_ISLOWER(*s
)) {
5491 *s
= Py_UNICODE_TOUPPER(*s
);
5496 if (Py_UNICODE_ISUPPER(*s
)) {
5497 *s
= Py_UNICODE_TOLOWER(*s
);
5506 int fixtitle(PyUnicodeObject
*self
)
5508 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5509 register Py_UNICODE
*e
;
5510 int previous_is_cased
;
5512 /* Shortcut for single character strings */
5513 if (PyUnicode_GET_SIZE(self
) == 1) {
5514 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
5523 e
= p
+ PyUnicode_GET_SIZE(self
);
5524 previous_is_cased
= 0;
5525 for (; p
< e
; p
++) {
5526 register const Py_UNICODE ch
= *p
;
5528 if (previous_is_cased
)
5529 *p
= Py_UNICODE_TOLOWER(ch
);
5531 *p
= Py_UNICODE_TOTITLE(ch
);
5533 if (Py_UNICODE_ISLOWER(ch
) ||
5534 Py_UNICODE_ISUPPER(ch
) ||
5535 Py_UNICODE_ISTITLE(ch
))
5536 previous_is_cased
= 1;
5538 previous_is_cased
= 0;
5544 PyUnicode_Join(PyObject
*separator
, PyObject
*seq
)
5546 PyObject
*internal_separator
= NULL
;
5547 const Py_UNICODE blank
= ' ';
5548 const Py_UNICODE
*sep
= &blank
;
5549 Py_ssize_t seplen
= 1;
5550 PyUnicodeObject
*res
= NULL
; /* the result */
5551 Py_ssize_t res_alloc
= 100; /* # allocated bytes for string in res */
5552 Py_ssize_t res_used
; /* # used bytes */
5553 Py_UNICODE
*res_p
; /* pointer to free byte in res's string area */
5554 PyObject
*fseq
; /* PySequence_Fast(seq) */
5555 Py_ssize_t seqlen
; /* len(fseq) -- number of items in sequence */
5559 fseq
= PySequence_Fast(seq
, "");
5564 /* Grrrr. A codec may be invoked to convert str objects to
5565 * Unicode, and so it's possible to call back into Python code
5566 * during PyUnicode_FromObject(), and so it's possible for a sick
5567 * codec to change the size of fseq (if seq is a list). Therefore
5568 * we have to keep refetching the size -- can't assume seqlen
5571 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5572 /* If empty sequence, return u"". */
5574 res
= _PyUnicode_New(0); /* empty sequence; return u"" */
5577 /* If singleton sequence with an exact Unicode, return that. */
5579 item
= PySequence_Fast_GET_ITEM(fseq
, 0);
5580 if (PyUnicode_CheckExact(item
)) {
5582 res
= (PyUnicodeObject
*)item
;
5587 /* At least two items to join, or one that isn't exact Unicode. */
5589 /* Set up sep and seplen -- they're needed. */
5590 if (separator
== NULL
) {
5595 internal_separator
= PyUnicode_FromObject(separator
);
5596 if (internal_separator
== NULL
)
5598 sep
= PyUnicode_AS_UNICODE(internal_separator
);
5599 seplen
= PyUnicode_GET_SIZE(internal_separator
);
5600 /* In case PyUnicode_FromObject() mutated seq. */
5601 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5606 res
= _PyUnicode_New(res_alloc
);
5609 res_p
= PyUnicode_AS_UNICODE(res
);
5612 for (i
= 0; i
< seqlen
; ++i
) {
5614 Py_ssize_t new_res_used
;
5616 item
= PySequence_Fast_GET_ITEM(fseq
, i
);
5617 /* Convert item to Unicode. */
5618 if (! PyUnicode_Check(item
) && ! PyString_Check(item
)) {
5619 PyErr_Format(PyExc_TypeError
,
5620 "sequence item %zd: expected string or Unicode,"
5622 i
, Py_TYPE(item
)->tp_name
);
5625 item
= PyUnicode_FromObject(item
);
5628 /* We own a reference to item from here on. */
5630 /* In case PyUnicode_FromObject() mutated seq. */
5631 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5633 /* Make sure we have enough space for the separator and the item. */
5634 itemlen
= PyUnicode_GET_SIZE(item
);
5635 new_res_used
= res_used
+ itemlen
;
5636 if (new_res_used
< 0)
5638 if (i
< seqlen
- 1) {
5639 new_res_used
+= seplen
;
5640 if (new_res_used
< 0)
5643 if (new_res_used
> res_alloc
) {
5644 /* double allocated size until it's big enough */
5646 res_alloc
+= res_alloc
;
5649 } while (new_res_used
> res_alloc
);
5650 if (_PyUnicode_Resize(&res
, res_alloc
) < 0) {
5654 res_p
= PyUnicode_AS_UNICODE(res
) + res_used
;
5657 /* Copy item, and maybe the separator. */
5658 Py_UNICODE_COPY(res_p
, PyUnicode_AS_UNICODE(item
), itemlen
);
5660 if (i
< seqlen
- 1) {
5661 Py_UNICODE_COPY(res_p
, sep
, seplen
);
5665 res_used
= new_res_used
;
5668 /* Shrink res to match the used area; this probably can't fail,
5669 * but it's cheap to check.
5671 if (_PyUnicode_Resize(&res
, res_used
) < 0)
5675 Py_XDECREF(internal_separator
);
5677 return (PyObject
*)res
;
5680 PyErr_SetString(PyExc_OverflowError
,
5681 "join() result is too long for a Python string");
5686 Py_XDECREF(internal_separator
);
5693 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
5705 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
5710 if (left
> PY_SSIZE_T_MAX
- self
->length
||
5711 right
> PY_SSIZE_T_MAX
- (left
+ self
->length
)) {
5712 PyErr_SetString(PyExc_OverflowError
, "padded string is too long");
5715 u
= _PyUnicode_New(left
+ self
->length
+ right
);
5718 Py_UNICODE_FILL(u
->str
, fill
, left
);
5719 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
5721 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
5727 PyObject
*PyUnicode_Splitlines(PyObject
*string
, int keepends
)
5731 string
= PyUnicode_FromObject(string
);
5735 list
= stringlib_splitlines(
5736 (PyObject
*) string
, PyUnicode_AS_UNICODE(string
),
5737 PyUnicode_GET_SIZE(string
), keepends
);
5744 PyObject
*split(PyUnicodeObject
*self
,
5745 PyUnicodeObject
*substring
,
5746 Py_ssize_t maxcount
)
5749 maxcount
= PY_SSIZE_T_MAX
;
5751 if (substring
== NULL
)
5752 return stringlib_split_whitespace(
5753 (PyObject
*) self
, self
->str
, self
->length
, maxcount
5756 return stringlib_split(
5757 (PyObject
*) self
, self
->str
, self
->length
,
5758 substring
->str
, substring
->length
,
5764 PyObject
*rsplit(PyUnicodeObject
*self
,
5765 PyUnicodeObject
*substring
,
5766 Py_ssize_t maxcount
)
5769 maxcount
= PY_SSIZE_T_MAX
;
5771 if (substring
== NULL
)
5772 return stringlib_rsplit_whitespace(
5773 (PyObject
*) self
, self
->str
, self
->length
, maxcount
5776 return stringlib_rsplit(
5777 (PyObject
*) self
, self
->str
, self
->length
,
5778 substring
->str
, substring
->length
,
5784 PyObject
*replace(PyUnicodeObject
*self
,
5785 PyUnicodeObject
*str1
,
5786 PyUnicodeObject
*str2
,
5787 Py_ssize_t maxcount
)
5792 maxcount
= PY_SSIZE_T_MAX
;
5793 else if (maxcount
== 0 || self
->length
== 0)
5796 if (str1
->length
== str2
->length
) {
5799 if (str1
->length
== 0)
5801 if (str1
->length
== 1) {
5802 /* replace characters */
5804 if (!findchar(self
->str
, self
->length
, str1
->str
[0]))
5806 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5809 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5812 for (i
= 0; i
< u
->length
; i
++)
5813 if (u
->str
[i
] == u1
) {
5820 self
->str
, self
->length
, str1
->str
, str1
->length
, 0
5824 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5827 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5829 /* change everything in-place, starting with this one */
5830 Py_UNICODE_COPY(u
->str
+i
, str2
->str
, str2
->length
);
5833 while ( --maxcount
> 0) {
5834 i
= stringlib_find(self
->str
+i
, self
->length
-i
,
5835 str1
->str
, str1
->length
,
5839 Py_UNICODE_COPY(u
->str
+i
, str2
->str
, str2
->length
);
5846 Py_ssize_t product
, new_size
, delta
;
5849 /* replace strings */
5850 n
= stringlib_count(self
->str
, self
->length
, str1
->str
, str1
->length
,
5854 /* new_size = self->length + n * (str2->length - str1->length)); */
5855 delta
= (str2
->length
- str1
->length
);
5857 new_size
= self
->length
;
5859 product
= n
* (str2
->length
- str1
->length
);
5860 if ((product
/ (str2
->length
- str1
->length
)) != n
) {
5861 PyErr_SetString(PyExc_OverflowError
,
5862 "replace string is too long");
5865 new_size
= self
->length
+ product
;
5867 PyErr_SetString(PyExc_OverflowError
,
5868 "replace string is too long");
5872 u
= _PyUnicode_New(new_size
);
5877 if (str1
->length
> 0) {
5879 /* look for next match */
5880 j
= stringlib_find(self
->str
+i
, self
->length
-i
,
5881 str1
->str
, str1
->length
,
5886 /* copy unchanged part [i:j] */
5887 Py_UNICODE_COPY(p
, self
->str
+i
, j
-i
);
5890 /* copy substitution string */
5891 if (str2
->length
> 0) {
5892 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
5895 i
= j
+ str1
->length
;
5897 if (i
< self
->length
)
5898 /* copy tail [i:] */
5899 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
5903 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
5907 *p
++ = self
->str
[i
++];
5909 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
5912 return (PyObject
*) u
;
5915 /* nothing to replace; return original string (when possible) */
5916 if (PyUnicode_CheckExact(self
)) {
5918 return (PyObject
*) self
;
5920 return PyUnicode_FromUnicode(self
->str
, self
->length
);
5923 /* --- Unicode Object Methods --------------------------------------------- */
5925 PyDoc_STRVAR(title__doc__
,
5926 "S.title() -> unicode\n\
5928 Return a titlecased version of S, i.e. words start with title case\n\
5929 characters, all remaining cased characters have lower case.");
5932 unicode_title(PyUnicodeObject
*self
)
5934 return fixup(self
, fixtitle
);
/* unicode.capitalize(): the docstring is declared with PyDoc_STRVAR and the
   method itself returns fixup(self, fixcapitalize), i.e. the fixcapitalize
   filter applied to a copy of self.
   NOTE(review): this listing is missing source lines here -- the closing
   part of the docstring and the function's return-type line and braces are
   not visible; restore them from the pristine file before editing. */
5937 PyDoc_STRVAR(capitalize__doc__
,
5938 "S.capitalize() -> unicode\n\
5940 Return a capitalized version of S, i.e. make the first character\n\
5944 unicode_capitalize(PyUnicodeObject
*self
)
5946 return fixup(self
, fixcapitalize
);
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* unicode.capwords(): split self on whitespace, capitalize every word with
   fixup(..., fixcapitalize) and join the words with single blanks.
   Returns NULL if splitting, capitalizing or joining fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

  onError:
    /* on the error path item is NULL, which is what gets returned */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
5987 /* Argument converter. Coerces to a single unicode character */
5990 convert_uc(PyObject
*obj
, void *addr
)
5992 Py_UNICODE
*fillcharloc
= (Py_UNICODE
*)addr
;
5996 uniobj
= PyUnicode_FromObject(obj
);
5997 if (uniobj
== NULL
) {
5998 PyErr_SetString(PyExc_TypeError
,
5999 "The fill character cannot be converted to Unicode");
6002 if (PyUnicode_GET_SIZE(uniobj
) != 1) {
6003 PyErr_SetString(PyExc_TypeError
,
6004 "The fill character must be exactly one character long");
6008 unistr
= PyUnicode_AS_UNICODE(uniobj
);
6009 *fillcharloc
= unistr
[0];
6014 PyDoc_STRVAR(center__doc__
,
6015 "S.center(width[, fillchar]) -> unicode\n\
6017 Return S centered in a Unicode string of length width. Padding is\n\
6018 done using the specified fill character (default is a space)");
6021 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
6023 Py_ssize_t marg
, left
;
6025 Py_UNICODE fillchar
= ' ';
6027 if (!PyArg_ParseTuple(args
, "n|O&:center", &width
, convert_uc
, &fillchar
))
6030 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6032 return (PyObject
*) self
;
6035 marg
= width
- self
->length
;
6036 left
= marg
/ 2 + (marg
& width
& 1);
6038 return (PyObject
*) pad(self
, left
, marg
- left
, fillchar
);
6043 /* This code should go into some future Unicode collation support
6044 module. The basic comparison should compare ordinals on a naive
6045 basis (this is what Java does and thus Jython too). */
6047 /* speedy UTF-16 code point order comparison */
6049 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6051 static short utf16Fixup
[32] =
6053 0, 0, 0, 0, 0, 0, 0, 0,
6054 0, 0, 0, 0, 0, 0, 0, 0,
6055 0, 0, 0, 0, 0, 0, 0, 0,
6056 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6060 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6062 Py_ssize_t len1
, len2
;
6064 Py_UNICODE
*s1
= str1
->str
;
6065 Py_UNICODE
*s2
= str2
->str
;
6067 len1
= str1
->length
;
6068 len2
= str2
->length
;
6070 while (len1
> 0 && len2
> 0) {
6076 if (c1
> (1<<11) * 26)
6077 c1
+= utf16Fixup
[c1
>>11];
6078 if (c2
> (1<<11) * 26)
6079 c2
+= utf16Fixup
[c2
>>11];
6080 /* now c1 and c2 are in UTF-32-compatible order */
6083 return (c1
< c2
) ? -1 : 1;
6088 return (len1
< len2
) ? -1 : (len1
!= len2
);
6094 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6096 register Py_ssize_t len1
, len2
;
6098 Py_UNICODE
*s1
= str1
->str
;
6099 Py_UNICODE
*s2
= str2
->str
;
6101 len1
= str1
->length
;
6102 len2
= str2
->length
;
6104 while (len1
> 0 && len2
> 0) {
6111 return (c1
< c2
) ? -1 : 1;
6116 return (len1
< len2
) ? -1 : (len1
!= len2
);
6121 int PyUnicode_Compare(PyObject
*left
,
6124 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
6127 /* Coerce the two arguments */
6128 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6131 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6135 /* Shortcut for empty or interned objects */
6142 result
= unicode_compare(u
, v
);
6154 PyObject
*PyUnicode_RichCompare(PyObject
*left
,
6160 result
= PyUnicode_Compare(left
, right
);
6161 if (result
== -1 && PyErr_Occurred())
6164 /* Convert the return value to a Boolean */
6167 result
= (result
== 0);
6170 result
= (result
!= 0);
6173 result
= (result
<= 0);
6176 result
= (result
>= 0);
6179 result
= (result
== -1);
6182 result
= (result
== 1);
6185 return PyBool_FromLong(result
);
6191 Type errors mean that PyUnicode_FromObject() could not convert
6192 one of the arguments (usually the right hand side) to Unicode,
6193 ie. we can't handle the comparison request. However, it is
6194 possible that the other object knows a comparison method, which
6195 is why we return Py_NotImplemented to give the other object a
6199 if (PyErr_ExceptionMatches(PyExc_TypeError
)) {
6201 Py_INCREF(Py_NotImplemented
);
6202 return Py_NotImplemented
;
6204 if (op
!= Py_EQ
&& op
!= Py_NE
)
6207 /* Equality comparison.
6209 This is a special case: we silence any PyExc_UnicodeDecodeError
6210 and instead turn it into a PyErr_UnicodeWarning.
6213 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError
))
6216 if (PyErr_Warn(PyExc_UnicodeWarning
,
6218 "Unicode equal comparison "
6219 "failed to convert both arguments to Unicode - "
6220 "interpreting them as being unequal" :
6221 "Unicode unequal comparison "
6222 "failed to convert both arguments to Unicode - "
6223 "interpreting them as being unequal"
6226 result
= (op
== Py_NE
);
6227 return PyBool_FromLong(result
);
6230 int PyUnicode_Contains(PyObject
*container
,
6233 PyObject
*str
, *sub
;
6236 /* Coerce the two arguments */
6237 sub
= PyUnicode_FromObject(element
);
6242 str
= PyUnicode_FromObject(container
);
6248 result
= stringlib_contains_obj(str
, sub
);
6256 /* Concat to string or Unicode object giving a new Unicode object. */
6258 PyObject
*PyUnicode_Concat(PyObject
*left
,
6261 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
6263 /* Coerce the two arguments */
6264 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6267 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6272 if (v
== unicode_empty
) {
6274 return (PyObject
*)u
;
6276 if (u
== unicode_empty
) {
6278 return (PyObject
*)v
;
6281 /* Concat the two Unicode strings */
6282 w
= _PyUnicode_New(u
->length
+ v
->length
);
6285 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
6286 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
6290 return (PyObject
*)w
;
6298 PyDoc_STRVAR(count__doc__
,
6299 "S.count(sub[, start[, end]]) -> int\n\
6301 Return the number of non-overlapping occurrences of substring sub in\n\
6302 Unicode string S[start:end]. Optional arguments start and end are\n\
6303 interpreted as in slice notation.");
6306 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
6308 PyUnicodeObject
*substring
;
6309 Py_ssize_t start
= 0;
6310 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6313 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
6314 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6317 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
6318 (PyObject
*)substring
);
6319 if (substring
== NULL
)
6322 ADJUST_INDICES(start
, end
, self
->length
);
6323 result
= PyInt_FromSsize_t(
6324 stringlib_count(self
->str
+ start
, end
- start
,
6325 substring
->str
, substring
->length
,
6329 Py_DECREF(substring
);
6334 PyDoc_STRVAR(encode__doc__
,
6335 "S.encode([encoding[,errors]]) -> string or unicode\n\
6337 Encodes S using the codec registered for encoding. encoding defaults\n\
6338 to the default encoding. errors may be given to set a different error\n\
6339 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6340 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6341 'xmlcharrefreplace' as well as any other name registered with\n\
6342 codecs.register_error that can handle UnicodeEncodeErrors.");
6345 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
, PyObject
*kwargs
)
6347 static char *kwlist
[] = {"encoding", "errors", 0};
6348 char *encoding
= NULL
;
6349 char *errors
= NULL
;
6352 if (!PyArg_ParseTupleAndKeywords(args
, kwargs
, "|ss:encode",
6353 kwlist
, &encoding
, &errors
))
6355 v
= PyUnicode_AsEncodedObject((PyObject
*)self
, encoding
, errors
);
6358 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6359 PyErr_Format(PyExc_TypeError
,
6360 "encoder did not return a string/unicode object "
6362 Py_TYPE(v
)->tp_name
);
6372 PyDoc_STRVAR(decode__doc__
,
6373 "S.decode([encoding[,errors]]) -> string or unicode\n\
6375 Decodes S using the codec registered for encoding. encoding defaults\n\
6376 to the default encoding. errors may be given to set a different error\n\
6377 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6378 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6379 as well as any other name registerd with codecs.register_error that is\n\
6380 able to handle UnicodeDecodeErrors.");
6383 unicode_decode(PyUnicodeObject
*self
, PyObject
*args
, PyObject
*kwargs
)
6385 static char *kwlist
[] = {"encoding", "errors", 0};
6386 char *encoding
= NULL
;
6387 char *errors
= NULL
;
6390 if (!PyArg_ParseTupleAndKeywords(args
, kwargs
, "|ss:decode",
6391 kwlist
, &encoding
, &errors
))
6393 v
= PyUnicode_AsDecodedObject((PyObject
*)self
, encoding
, errors
);
6396 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6397 PyErr_Format(PyExc_TypeError
,
6398 "decoder did not return a string/unicode object "
6400 Py_TYPE(v
)->tp_name
);
6410 PyDoc_STRVAR(expandtabs__doc__
,
6411 "S.expandtabs([tabsize]) -> unicode\n\
6413 Return a copy of S where all tab characters are expanded using spaces.\n\
6414 If tabsize is not given, a tab size of 8 characters is assumed.");
6417 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
6423 Py_ssize_t i
, j
, incr
;
6427 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
6430 /* First pass: determine size of output string */
6431 i
= 0; /* chars up to and including most recent \n or \r */
6432 j
= 0; /* chars since most recent \n or \r (use in tab calculations) */
6433 e
= self
->str
+ self
->length
; /* end of input */
6434 for (p
= self
->str
; p
< e
; p
++)
6437 incr
= tabsize
- (j
% tabsize
); /* cannot overflow */
6438 if (j
> PY_SSIZE_T_MAX
- incr
)
6444 if (j
> PY_SSIZE_T_MAX
- 1)
6447 if (*p
== '\n' || *p
== '\r') {
6448 if (i
> PY_SSIZE_T_MAX
- j
)
6455 if (i
> PY_SSIZE_T_MAX
- j
)
6458 /* Second pass: create output string and fill it */
6459 u
= _PyUnicode_New(i
+ j
);
6463 j
= 0; /* same as in first pass */
6464 q
= u
->str
; /* next output char */
6465 qe
= u
->str
+ u
->length
; /* end of output */
6467 for (p
= self
->str
; p
< e
; p
++)
6470 i
= tabsize
- (j
% tabsize
);
6484 if (*p
== '\n' || *p
== '\r')
6488 return (PyObject
*) u
;
6493 PyErr_SetString(PyExc_OverflowError
, "new string is too long");
6497 PyDoc_STRVAR(find__doc__
,
6498 "S.find(sub [,start [,end]]) -> int\n\
6500 Return the lowest index in S where substring sub is found,\n\
6501 such that sub is contained within s[start:end]. Optional\n\
6502 arguments start and end are interpreted as in slice notation.\n\
6504 Return -1 on failure.");
6507 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
6509 PyObject
*substring
;
6514 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
6517 result
= stringlib_find_slice(
6518 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6519 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6523 Py_DECREF(substring
);
6525 return PyInt_FromSsize_t(result
);
6529 unicode_getitem(PyUnicodeObject
*self
, Py_ssize_t index
)
6531 if (index
< 0 || index
>= self
->length
) {
6532 PyErr_SetString(PyExc_IndexError
, "string index out of range");
6536 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
6540 unicode_hash(PyUnicodeObject
*self
)
6542 /* Since Unicode objects compare equal to their ASCII string
6543 counterparts, they should use the individual character values
6544 as basis for their hash value. This is needed to assure that
6545 strings and Unicode objects behave in the same way as
6548 register Py_ssize_t len
;
6549 register Py_UNICODE
*p
;
6552 if (self
->hash
!= -1)
6554 len
= PyUnicode_GET_SIZE(self
);
6555 p
= PyUnicode_AS_UNICODE(self
);
6558 x
= (1000003*x
) ^ *p
++;
6559 x
^= PyUnicode_GET_SIZE(self
);
6566 PyDoc_STRVAR(index__doc__
,
6567 "S.index(sub [,start [,end]]) -> int\n\
6569 Like S.find() but raise ValueError when the substring is not found.");
6572 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
6575 PyObject
*substring
;
6579 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
6582 result
= stringlib_find_slice(
6583 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6584 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6588 Py_DECREF(substring
);
6591 PyErr_SetString(PyExc_ValueError
, "substring not found");
6595 return PyInt_FromSsize_t(result
);
6598 PyDoc_STRVAR(islower__doc__
,
6599 "S.islower() -> bool\n\
6601 Return True if all cased characters in S are lowercase and there is\n\
6602 at least one cased character in S, False otherwise.");
6605 unicode_islower(PyUnicodeObject
*self
)
6607 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6608 register const Py_UNICODE
*e
;
6611 /* Shortcut for single character strings */
6612 if (PyUnicode_GET_SIZE(self
) == 1)
6613 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p
));
6615 /* Special case for empty strings */
6616 if (PyUnicode_GET_SIZE(self
) == 0)
6617 return PyBool_FromLong(0);
6619 e
= p
+ PyUnicode_GET_SIZE(self
);
6621 for (; p
< e
; p
++) {
6622 register const Py_UNICODE ch
= *p
;
6624 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
6625 return PyBool_FromLong(0);
6626 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
6629 return PyBool_FromLong(cased
);
6632 PyDoc_STRVAR(isupper__doc__
,
6633 "S.isupper() -> bool\n\
6635 Return True if all cased characters in S are uppercase and there is\n\
6636 at least one cased character in S, False otherwise.");
6639 unicode_isupper(PyUnicodeObject
*self
)
6641 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6642 register const Py_UNICODE
*e
;
6645 /* Shortcut for single character strings */
6646 if (PyUnicode_GET_SIZE(self
) == 1)
6647 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
6649 /* Special case for empty strings */
6650 if (PyUnicode_GET_SIZE(self
) == 0)
6651 return PyBool_FromLong(0);
6653 e
= p
+ PyUnicode_GET_SIZE(self
);
6655 for (; p
< e
; p
++) {
6656 register const Py_UNICODE ch
= *p
;
6658 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
6659 return PyBool_FromLong(0);
6660 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
6663 return PyBool_FromLong(cased
);
6666 PyDoc_STRVAR(istitle__doc__
,
6667 "S.istitle() -> bool\n\
6669 Return True if S is a titlecased string and there is at least one\n\
6670 character in S, i.e. upper- and titlecase characters may only\n\
6671 follow uncased characters and lowercase characters only cased ones.\n\
6672 Return False otherwise.");
6675 unicode_istitle(PyUnicodeObject
*self
)
6677 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6678 register const Py_UNICODE
*e
;
6679 int cased
, previous_is_cased
;
6681 /* Shortcut for single character strings */
6682 if (PyUnicode_GET_SIZE(self
) == 1)
6683 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
6684 (Py_UNICODE_ISUPPER(*p
) != 0));
6686 /* Special case for empty strings */
6687 if (PyUnicode_GET_SIZE(self
) == 0)
6688 return PyBool_FromLong(0);
6690 e
= p
+ PyUnicode_GET_SIZE(self
);
6692 previous_is_cased
= 0;
6693 for (; p
< e
; p
++) {
6694 register const Py_UNICODE ch
= *p
;
6696 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
6697 if (previous_is_cased
)
6698 return PyBool_FromLong(0);
6699 previous_is_cased
= 1;
6702 else if (Py_UNICODE_ISLOWER(ch
)) {
6703 if (!previous_is_cased
)
6704 return PyBool_FromLong(0);
6705 previous_is_cased
= 1;
6709 previous_is_cased
= 0;
6711 return PyBool_FromLong(cased
);
6714 PyDoc_STRVAR(isspace__doc__
,
6715 "S.isspace() -> bool\n\
6717 Return True if all characters in S are whitespace\n\
6718 and there is at least one character in S, False otherwise.");
6721 unicode_isspace(PyUnicodeObject
*self
)
6723 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6724 register const Py_UNICODE
*e
;
6726 /* Shortcut for single character strings */
6727 if (PyUnicode_GET_SIZE(self
) == 1 &&
6728 Py_UNICODE_ISSPACE(*p
))
6729 return PyBool_FromLong(1);
6731 /* Special case for empty strings */
6732 if (PyUnicode_GET_SIZE(self
) == 0)
6733 return PyBool_FromLong(0);
6735 e
= p
+ PyUnicode_GET_SIZE(self
);
6736 for (; p
< e
; p
++) {
6737 if (!Py_UNICODE_ISSPACE(*p
))
6738 return PyBool_FromLong(0);
6740 return PyBool_FromLong(1);
6743 PyDoc_STRVAR(isalpha__doc__
,
6744 "S.isalpha() -> bool\n\
6746 Return True if all characters in S are alphabetic\n\
6747 and there is at least one character in S, False otherwise.");
6750 unicode_isalpha(PyUnicodeObject
*self
)
6752 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6753 register const Py_UNICODE
*e
;
6755 /* Shortcut for single character strings */
6756 if (PyUnicode_GET_SIZE(self
) == 1 &&
6757 Py_UNICODE_ISALPHA(*p
))
6758 return PyBool_FromLong(1);
6760 /* Special case for empty strings */
6761 if (PyUnicode_GET_SIZE(self
) == 0)
6762 return PyBool_FromLong(0);
6764 e
= p
+ PyUnicode_GET_SIZE(self
);
6765 for (; p
< e
; p
++) {
6766 if (!Py_UNICODE_ISALPHA(*p
))
6767 return PyBool_FromLong(0);
6769 return PyBool_FromLong(1);
6772 PyDoc_STRVAR(isalnum__doc__
,
6773 "S.isalnum() -> bool\n\
6775 Return True if all characters in S are alphanumeric\n\
6776 and there is at least one character in S, False otherwise.");
6779 unicode_isalnum(PyUnicodeObject
*self
)
6781 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6782 register const Py_UNICODE
*e
;
6784 /* Shortcut for single character strings */
6785 if (PyUnicode_GET_SIZE(self
) == 1 &&
6786 Py_UNICODE_ISALNUM(*p
))
6787 return PyBool_FromLong(1);
6789 /* Special case for empty strings */
6790 if (PyUnicode_GET_SIZE(self
) == 0)
6791 return PyBool_FromLong(0);
6793 e
= p
+ PyUnicode_GET_SIZE(self
);
6794 for (; p
< e
; p
++) {
6795 if (!Py_UNICODE_ISALNUM(*p
))
6796 return PyBool_FromLong(0);
6798 return PyBool_FromLong(1);
6801 PyDoc_STRVAR(isdecimal__doc__
,
6802 "S.isdecimal() -> bool\n\
6804 Return True if there are only decimal characters in S,\n\
6808 unicode_isdecimal(PyUnicodeObject
*self
)
6810 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6811 register const Py_UNICODE
*e
;
6813 /* Shortcut for single character strings */
6814 if (PyUnicode_GET_SIZE(self
) == 1 &&
6815 Py_UNICODE_ISDECIMAL(*p
))
6816 return PyBool_FromLong(1);
6818 /* Special case for empty strings */
6819 if (PyUnicode_GET_SIZE(self
) == 0)
6820 return PyBool_FromLong(0);
6822 e
= p
+ PyUnicode_GET_SIZE(self
);
6823 for (; p
< e
; p
++) {
6824 if (!Py_UNICODE_ISDECIMAL(*p
))
6825 return PyBool_FromLong(0);
6827 return PyBool_FromLong(1);
6830 PyDoc_STRVAR(isdigit__doc__
,
6831 "S.isdigit() -> bool\n\
6833 Return True if all characters in S are digits\n\
6834 and there is at least one character in S, False otherwise.");
6837 unicode_isdigit(PyUnicodeObject
*self
)
6839 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6840 register const Py_UNICODE
*e
;
6842 /* Shortcut for single character strings */
6843 if (PyUnicode_GET_SIZE(self
) == 1 &&
6844 Py_UNICODE_ISDIGIT(*p
))
6845 return PyBool_FromLong(1);
6847 /* Special case for empty strings */
6848 if (PyUnicode_GET_SIZE(self
) == 0)
6849 return PyBool_FromLong(0);
6851 e
= p
+ PyUnicode_GET_SIZE(self
);
6852 for (; p
< e
; p
++) {
6853 if (!Py_UNICODE_ISDIGIT(*p
))
6854 return PyBool_FromLong(0);
6856 return PyBool_FromLong(1);
6859 PyDoc_STRVAR(isnumeric__doc__
,
6860 "S.isnumeric() -> bool\n\
6862 Return True if there are only numeric characters in S,\n\
6866 unicode_isnumeric(PyUnicodeObject
*self
)
6868 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6869 register const Py_UNICODE
*e
;
6871 /* Shortcut for single character strings */
6872 if (PyUnicode_GET_SIZE(self
) == 1 &&
6873 Py_UNICODE_ISNUMERIC(*p
))
6874 return PyBool_FromLong(1);
6876 /* Special case for empty strings */
6877 if (PyUnicode_GET_SIZE(self
) == 0)
6878 return PyBool_FromLong(0);
6880 e
= p
+ PyUnicode_GET_SIZE(self
);
6881 for (; p
< e
; p
++) {
6882 if (!Py_UNICODE_ISNUMERIC(*p
))
6883 return PyBool_FromLong(0);
6885 return PyBool_FromLong(1);
6888 PyDoc_STRVAR(join__doc__
,
6889 "S.join(iterable) -> unicode\n\
6891 Return a string which is the concatenation of the strings in the\n\
6892 iterable. The separator between elements is S.");
6895 unicode_join(PyObject
*self
, PyObject
*data
)
6897 return PyUnicode_Join(self
, data
);
6901 unicode_length(PyUnicodeObject
*self
)
6903 return self
->length
;
6906 PyDoc_STRVAR(ljust__doc__
,
6907 "S.ljust(width[, fillchar]) -> int\n\
6909 Return S left-justified in a Unicode string of length width. Padding is\n\
6910 done using the specified fill character (default is a space).");
6913 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
6916 Py_UNICODE fillchar
= ' ';
6918 if (!PyArg_ParseTuple(args
, "n|O&:ljust", &width
, convert_uc
, &fillchar
))
6921 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6923 return (PyObject
*) self
;
6926 return (PyObject
*) pad(self
, 0, width
- self
->length
, fillchar
);
6929 PyDoc_STRVAR(lower__doc__
,
6930 "S.lower() -> unicode\n\
6932 Return a copy of the string S converted to lowercase.");
6935 unicode_lower(PyUnicodeObject
*self
)
6937 return fixup(self
, fixlower
);
6941 #define RIGHTSTRIP 1
6944 /* Arrays indexed by above */
6945 static const char *stripformat
[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
6947 #define STRIPNAME(i) (stripformat[i]+3)
6949 /* externally visible for str.strip(unicode) */
6951 _PyUnicode_XStrip(PyUnicodeObject
*self
, int striptype
, PyObject
*sepobj
)
6953 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
6954 Py_ssize_t len
= PyUnicode_GET_SIZE(self
);
6955 Py_UNICODE
*sep
= PyUnicode_AS_UNICODE(sepobj
);
6956 Py_ssize_t seplen
= PyUnicode_GET_SIZE(sepobj
);
6959 BLOOM_MASK sepmask
= make_bloom_mask(sep
, seplen
);
6962 if (striptype
!= RIGHTSTRIP
) {
6963 while (i
< len
&& BLOOM_MEMBER(sepmask
, s
[i
], sep
, seplen
)) {
6969 if (striptype
!= LEFTSTRIP
) {
6972 } while (j
>= i
&& BLOOM_MEMBER(sepmask
, s
[j
], sep
, seplen
));
6976 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
6978 return (PyObject
*)self
;
6981 return PyUnicode_FromUnicode(s
+i
, j
-i
);
6986 do_strip(PyUnicodeObject
*self
, int striptype
)
6988 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
6989 Py_ssize_t len
= PyUnicode_GET_SIZE(self
), i
, j
;
6992 if (striptype
!= RIGHTSTRIP
) {
6993 while (i
< len
&& Py_UNICODE_ISSPACE(s
[i
])) {
6999 if (striptype
!= LEFTSTRIP
) {
7002 } while (j
>= i
&& Py_UNICODE_ISSPACE(s
[j
]));
7006 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
7008 return (PyObject
*)self
;
7011 return PyUnicode_FromUnicode(s
+i
, j
-i
);
7016 do_argstrip(PyUnicodeObject
*self
, int striptype
, PyObject
*args
)
7018 PyObject
*sep
= NULL
;
7020 if (!PyArg_ParseTuple(args
, (char *)stripformat
[striptype
], &sep
))
7023 if (sep
!= NULL
&& sep
!= Py_None
) {
7024 if (PyUnicode_Check(sep
))
7025 return _PyUnicode_XStrip(self
, striptype
, sep
);
7026 else if (PyString_Check(sep
)) {
7028 sep
= PyUnicode_FromObject(sep
);
7031 res
= _PyUnicode_XStrip(self
, striptype
, sep
);
7036 PyErr_Format(PyExc_TypeError
,
7037 "%s arg must be None, unicode or str",
7038 STRIPNAME(striptype
));
7043 return do_strip(self
, striptype
);
7047 PyDoc_STRVAR(strip__doc__
,
7048 "S.strip([chars]) -> unicode\n\
7050 Return a copy of the string S with leading and trailing\n\
7051 whitespace removed.\n\
7052 If chars is given and not None, remove characters in chars instead.\n\
7053 If chars is a str, it will be converted to unicode before stripping");
7056 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
7058 if (PyTuple_GET_SIZE(args
) == 0)
7059 return do_strip(self
, BOTHSTRIP
); /* Common case */
7061 return do_argstrip(self
, BOTHSTRIP
, args
);
7065 PyDoc_STRVAR(lstrip__doc__
,
7066 "S.lstrip([chars]) -> unicode\n\
7068 Return a copy of the string S with leading whitespace removed.\n\
7069 If chars is given and not None, remove characters in chars instead.\n\
7070 If chars is a str, it will be converted to unicode before stripping");
7073 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
7075 if (PyTuple_GET_SIZE(args
) == 0)
7076 return do_strip(self
, LEFTSTRIP
); /* Common case */
7078 return do_argstrip(self
, LEFTSTRIP
, args
);
7082 PyDoc_STRVAR(rstrip__doc__
,
7083 "S.rstrip([chars]) -> unicode\n\
7085 Return a copy of the string S with trailing whitespace removed.\n\
7086 If chars is given and not None, remove characters in chars instead.\n\
7087 If chars is a str, it will be converted to unicode before stripping");
7090 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
7092 if (PyTuple_GET_SIZE(args
) == 0)
7093 return do_strip(self
, RIGHTSTRIP
); /* Common case */
7095 return do_argstrip(self
, RIGHTSTRIP
, args
);
7100 unicode_repeat(PyUnicodeObject
*str
, Py_ssize_t len
)
7110 if (len
== 1 && PyUnicode_CheckExact(str
)) {
7111 /* no repeat, return original string */
7113 return (PyObject
*) str
;
7116 /* ensure # of chars needed doesn't overflow int and # of bytes
7117 * needed doesn't overflow size_t
7119 nchars
= len
* str
->length
;
7120 if (len
&& nchars
/ len
!= str
->length
) {
7121 PyErr_SetString(PyExc_OverflowError
,
7122 "repeated string is too long");
7125 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
7126 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
7127 PyErr_SetString(PyExc_OverflowError
,
7128 "repeated string is too long");
7131 u
= _PyUnicode_New(nchars
);
7137 if (str
->length
== 1 && len
> 0) {
7138 Py_UNICODE_FILL(p
, str
->str
[0], len
);
7140 Py_ssize_t done
= 0; /* number of characters copied this far */
7141 if (done
< nchars
) {
7142 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
7145 while (done
< nchars
) {
7146 Py_ssize_t n
= (done
<= nchars
-done
) ? done
: nchars
-done
;
7147 Py_UNICODE_COPY(p
+done
, p
, n
);
7152 return (PyObject
*) u
;
7155 PyObject
*PyUnicode_Replace(PyObject
*obj
,
7158 Py_ssize_t maxcount
)
7165 self
= PyUnicode_FromObject(obj
);
7168 str1
= PyUnicode_FromObject(subobj
);
7173 str2
= PyUnicode_FromObject(replobj
);
7179 result
= replace((PyUnicodeObject
*)self
,
7180 (PyUnicodeObject
*)str1
,
7181 (PyUnicodeObject
*)str2
,
7189 PyDoc_STRVAR(replace__doc__
,
7190 "S.replace(old, new[, count]) -> unicode\n\
7192 Return a copy of S with all occurrences of substring\n\
7193 old replaced by new. If the optional argument count is\n\
7194 given, only the first count occurrences are replaced.");
7197 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
7199 PyUnicodeObject
*str1
;
7200 PyUnicodeObject
*str2
;
7201 Py_ssize_t maxcount
= -1;
7204 if (!PyArg_ParseTuple(args
, "OO|n:replace", &str1
, &str2
, &maxcount
))
7206 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
7209 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
7215 result
= replace(self
, str1
, str2
, maxcount
);
7223 PyObject
*unicode_repr(PyObject
*unicode
)
7225 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
7226 PyUnicode_GET_SIZE(unicode
),
7230 PyDoc_STRVAR(rfind__doc__
,
7231 "S.rfind(sub [,start [,end]]) -> int\n\
7233 Return the highest index in S where substring sub is found,\n\
7234 such that sub is contained within s[start:end]. Optional\n\
7235 arguments start and end are interpreted as in slice notation.\n\
7237 Return -1 on failure.");
7240 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
7242 PyObject
*substring
;
7247 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
7250 result
= stringlib_rfind_slice(
7251 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7252 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7256 Py_DECREF(substring
);
7258 return PyInt_FromSsize_t(result
);
7261 PyDoc_STRVAR(rindex__doc__
,
7262 "S.rindex(sub [,start [,end]]) -> int\n\
7264 Like S.rfind() but raise ValueError when the substring is not found.");
7267 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
7269 PyObject
*substring
;
7274 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
7277 result
= stringlib_rfind_slice(
7278 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7279 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7283 Py_DECREF(substring
);
7286 PyErr_SetString(PyExc_ValueError
, "substring not found");
7289 return PyInt_FromSsize_t(result
);
7292 PyDoc_STRVAR(rjust__doc__
,
7293 "S.rjust(width[, fillchar]) -> unicode\n\
7295 Return S right-justified in a Unicode string of length width. Padding is\n\
7296 done using the specified fill character (default is a space).");
7299 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
7302 Py_UNICODE fillchar
= ' ';
7304 if (!PyArg_ParseTuple(args
, "n|O&:rjust", &width
, convert_uc
, &fillchar
))
7307 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
7309 return (PyObject
*) self
;
7312 return (PyObject
*) pad(self
, width
- self
->length
, 0, fillchar
);
7316 unicode_slice(PyUnicodeObject
*self
, Py_ssize_t start
, Py_ssize_t end
)
7318 /* standard clamping */
7323 if (end
> self
->length
)
7325 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
7326 /* full slice, return original string */
7328 return (PyObject
*) self
;
7333 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
7337 PyObject
*PyUnicode_Split(PyObject
*s
,
7339 Py_ssize_t maxsplit
)
7343 s
= PyUnicode_FromObject(s
);
7347 sep
= PyUnicode_FromObject(sep
);
7354 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
7361 PyDoc_STRVAR(split__doc__
,
7362 "S.split([sep [,maxsplit]]) -> list of strings\n\
7364 Return a list of the words in S, using sep as the\n\
7365 delimiter string. If maxsplit is given, at most maxsplit\n\
7366 splits are done. If sep is not specified or is None, any\n\
7367 whitespace string is a separator and empty strings are\n\
7368 removed from the result.");
7371 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
7373 PyObject
*substring
= Py_None
;
7374 Py_ssize_t maxcount
= -1;
7376 if (!PyArg_ParseTuple(args
, "|On:split", &substring
, &maxcount
))
7379 if (substring
== Py_None
)
7380 return split(self
, NULL
, maxcount
);
7381 else if (PyUnicode_Check(substring
))
7382 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
7384 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
7388 PyUnicode_Partition(PyObject
*str_in
, PyObject
*sep_in
)
7394 str_obj
= PyUnicode_FromObject(str_in
);
7397 sep_obj
= PyUnicode_FromObject(sep_in
);
7403 out
= stringlib_partition(
7404 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7405 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
7416 PyUnicode_RPartition(PyObject
*str_in
, PyObject
*sep_in
)
7422 str_obj
= PyUnicode_FromObject(str_in
);
7425 sep_obj
= PyUnicode_FromObject(sep_in
);
7431 out
= stringlib_rpartition(
7432 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7433 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
7442 PyDoc_STRVAR(partition__doc__
,
7443 "S.partition(sep) -> (head, sep, tail)\n\
7445 Search for the separator sep in S, and return the part before it,\n\
7446 the separator itself, and the part after it. If the separator is not\n\
7447 found, return S and two empty strings.");
7450 unicode_partition(PyUnicodeObject
*self
, PyObject
*separator
)
7452 return PyUnicode_Partition((PyObject
*)self
, separator
);
7455 PyDoc_STRVAR(rpartition__doc__
,
7456 "S.rpartition(sep) -> (head, sep, tail)\n\
7458 Search for the separator sep in S, starting at the end of S, and return\n\
7459 the part before it, the separator itself, and the part after it. If the\n\
7460 separator is not found, return two empty strings and S.");
7463 unicode_rpartition(PyUnicodeObject
*self
, PyObject
*separator
)
7465 return PyUnicode_RPartition((PyObject
*)self
, separator
);
7468 PyObject
*PyUnicode_RSplit(PyObject
*s
,
7470 Py_ssize_t maxsplit
)
7474 s
= PyUnicode_FromObject(s
);
7478 sep
= PyUnicode_FromObject(sep
);
7485 result
= rsplit((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
7492 PyDoc_STRVAR(rsplit__doc__
,
7493 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7495 Return a list of the words in S, using sep as the\n\
7496 delimiter string, starting at the end of the string and\n\
7497 working to the front. If maxsplit is given, at most maxsplit\n\
7498 splits are done. If sep is not specified, any whitespace string\n\
7502 unicode_rsplit(PyUnicodeObject
*self
, PyObject
*args
)
7504 PyObject
*substring
= Py_None
;
7505 Py_ssize_t maxcount
= -1;
7507 if (!PyArg_ParseTuple(args
, "|On:rsplit", &substring
, &maxcount
))
7510 if (substring
== Py_None
)
7511 return rsplit(self
, NULL
, maxcount
);
7512 else if (PyUnicode_Check(substring
))
7513 return rsplit(self
, (PyUnicodeObject
*)substring
, maxcount
);
7515 return PyUnicode_RSplit((PyObject
*)self
, substring
, maxcount
);
7518 PyDoc_STRVAR(splitlines__doc__
,
7519 "S.splitlines([keepends]) -> list of strings\n\
7521 Return a list of the lines in S, breaking at line boundaries.\n\
7522 Line breaks are not included in the resulting list unless keepends\n\
7523 is given and true.");
7526 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
7530 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
7533 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
7537 PyObject
*unicode_str(PyUnicodeObject
*self
)
7539 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
7542 PyDoc_STRVAR(swapcase__doc__
,
7543 "S.swapcase() -> unicode\n\
7545 Return a copy of S with uppercase characters converted to lowercase\n\
7549 unicode_swapcase(PyUnicodeObject
*self
)
7551 return fixup(self
, fixswapcase
);
7554 PyDoc_STRVAR(translate__doc__
,
7555 "S.translate(table) -> unicode\n\
7557 Return a copy of the string S, where all characters have been mapped\n\
7558 through the given translation table, which must be a mapping of\n\
7559 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7560 Unmapped characters are left untouched. Characters mapped to None\n\
7564 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
7566 return PyUnicode_TranslateCharmap(self
->str
,
7572 PyDoc_STRVAR(upper__doc__
,
7573 "S.upper() -> unicode\n\
7575 Return a copy of S converted to uppercase.");
7578 unicode_upper(PyUnicodeObject
*self
)
7580 return fixup(self
, fixupper
);
7583 PyDoc_STRVAR(zfill__doc__
,
7584 "S.zfill(width) -> unicode\n\
7586 Pad a numeric string S with zeros on the left, to fill a field\n\
7587 of the specified width. The string S is never truncated.");
7590 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
7596 if (!PyArg_ParseTuple(args
, "n:zfill", &width
))
7599 if (self
->length
>= width
) {
7600 if (PyUnicode_CheckExact(self
)) {
7602 return (PyObject
*) self
;
7605 return PyUnicode_FromUnicode(
7606 PyUnicode_AS_UNICODE(self
),
7607 PyUnicode_GET_SIZE(self
)
7611 fill
= width
- self
->length
;
7613 u
= pad(self
, fill
, 0, '0');
7618 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
7619 /* move sign to beginning of string */
7620 u
->str
[0] = u
->str
[fill
];
7624 return (PyObject
*) u
;
7629 free_listsize(PyUnicodeObject
*self
)
7631 return PyInt_FromLong(numfree
);
7635 PyDoc_STRVAR(startswith__doc__
,
7636 "S.startswith(prefix[, start[, end]]) -> bool\n\
7638 Return True if S starts with the specified prefix, False otherwise.\n\
7639 With optional start, test S beginning at that position.\n\
7640 With optional end, stop comparing S at that position.\n\
7641 prefix can also be a tuple of strings to try.");
7644 unicode_startswith(PyUnicodeObject
*self
,
7648 PyUnicodeObject
*substring
;
7649 Py_ssize_t start
= 0;
7650 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7653 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &subobj
,
7654 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
7656 if (PyTuple_Check(subobj
)) {
7658 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7659 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7660 PyTuple_GET_ITEM(subobj
, i
));
7661 if (substring
== NULL
)
7663 result
= tailmatch(self
, substring
, start
, end
, -1);
7664 Py_DECREF(substring
);
7669 /* nothing matched */
7672 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7673 if (substring
== NULL
)
7675 result
= tailmatch(self
, substring
, start
, end
, -1);
7676 Py_DECREF(substring
);
7677 return PyBool_FromLong(result
);
7681 PyDoc_STRVAR(endswith__doc__
,
7682 "S.endswith(suffix[, start[, end]]) -> bool\n\
7684 Return True if S ends with the specified suffix, False otherwise.\n\
7685 With optional start, test S beginning at that position.\n\
7686 With optional end, stop comparing S at that position.\n\
7687 suffix can also be a tuple of strings to try.");
7690 unicode_endswith(PyUnicodeObject
*self
,
7694 PyUnicodeObject
*substring
;
7695 Py_ssize_t start
= 0;
7696 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7699 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &subobj
,
7700 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
7702 if (PyTuple_Check(subobj
)) {
7704 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7705 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7706 PyTuple_GET_ITEM(subobj
, i
));
7707 if (substring
== NULL
)
7709 result
= tailmatch(self
, substring
, start
, end
, +1);
7710 Py_DECREF(substring
);
7717 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7718 if (substring
== NULL
)
7721 result
= tailmatch(self
, substring
, start
, end
, +1);
7722 Py_DECREF(substring
);
7723 return PyBool_FromLong(result
);
7727 /* Implements do_string_format, which is unicode because of stringlib */
7728 #include "stringlib/string_format.h"
7730 PyDoc_STRVAR(format__doc__
,
7731 "S.format(*args, **kwargs) -> unicode\n\
7736 unicode__format__(PyObject
*self
, PyObject
*args
)
7738 PyObject
*format_spec
;
7739 PyObject
*result
= NULL
;
7740 PyObject
*tmp
= NULL
;
7742 /* If 2.x, convert format_spec to the same type as value */
7743 /* This is to allow things like u''.format('') */
7744 if (!PyArg_ParseTuple(args
, "O:__format__", &format_spec
))
7746 if (!(PyBytes_Check(format_spec
) || PyUnicode_Check(format_spec
))) {
7747 PyErr_Format(PyExc_TypeError
, "__format__ arg must be str "
7748 "or unicode, not %s", Py_TYPE(format_spec
)->tp_name
);
7751 tmp
= PyObject_Unicode(format_spec
);
7756 result
= _PyUnicode_FormatAdvanced(self
,
7757 PyUnicode_AS_UNICODE(format_spec
),
7758 PyUnicode_GET_SIZE(format_spec
));
7764 PyDoc_STRVAR(p_format__doc__
,
7765 "S.__format__(format_spec) -> unicode\n\
7770 unicode__sizeof__(PyUnicodeObject
*v
)
7772 return PyInt_FromSsize_t(sizeof(PyUnicodeObject
) +
7773 sizeof(Py_UNICODE
) * (v
->length
+ 1));
7776 PyDoc_STRVAR(sizeof__doc__
,
7777 "S.__sizeof__() -> size of S in memory, in bytes\n\
7782 unicode_getnewargs(PyUnicodeObject
*v
)
7784 return Py_BuildValue("(u#)", v
->str
, v
->length
);
7788 static PyMethodDef unicode_methods
[] = {
7790 /* Order is according to common usage: often used methods should
7791 appear first, since lookup is done sequentially. */
7793 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
| METH_KEYWORDS
, encode__doc__
},
7794 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
7795 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
7796 {"rsplit", (PyCFunction
) unicode_rsplit
, METH_VARARGS
, rsplit__doc__
},
7797 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
7798 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
7799 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
7800 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
7801 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
7802 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
7803 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
7804 {"partition", (PyCFunction
) unicode_partition
, METH_O
, partition__doc__
},
7805 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
7806 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
7807 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
7808 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_VARARGS
, lstrip__doc__
},
7809 {"decode", (PyCFunction
) unicode_decode
, METH_VARARGS
| METH_KEYWORDS
, decode__doc__
},
7810 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7811 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
7812 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
7813 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
7814 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_VARARGS
, rstrip__doc__
},
7815 {"rpartition", (PyCFunction
) unicode_rpartition
, METH_O
, rpartition__doc__
},
7816 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
7817 {"strip", (PyCFunction
) unicode_strip
, METH_VARARGS
, strip__doc__
},
7818 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
7819 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
7820 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
7821 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
7822 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
7823 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
7824 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
7825 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
7826 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
7827 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
7828 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
7829 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
7830 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
7831 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
7832 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
7833 {"format", (PyCFunction
) do_string_format
, METH_VARARGS
| METH_KEYWORDS
, format__doc__
},
7834 {"__format__", (PyCFunction
) unicode__format__
, METH_VARARGS
, p_format__doc__
},
7835 {"_formatter_field_name_split", (PyCFunction
) formatter_field_name_split
, METH_NOARGS
},
7836 {"_formatter_parser", (PyCFunction
) formatter_parser
, METH_NOARGS
},
7837 {"__sizeof__", (PyCFunction
) unicode__sizeof__
, METH_NOARGS
, sizeof__doc__
},
7839 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
7843 /* This one is just used for debugging the implementation. */
7844 {"freelistsize", (PyCFunction
) free_listsize
, METH_NOARGS
},
7847 {"__getnewargs__", (PyCFunction
)unicode_getnewargs
, METH_NOARGS
},
7852 unicode_mod(PyObject
*v
, PyObject
*w
)
7854 if (!PyUnicode_Check(v
)) {
7855 Py_INCREF(Py_NotImplemented
);
7856 return Py_NotImplemented
;
7858 return PyUnicode_Format(v
, w
);
7861 static PyNumberMethods unicode_as_number
= {
7866 unicode_mod
, /*nb_remainder*/
7869 static PySequenceMethods unicode_as_sequence
= {
7870 (lenfunc
) unicode_length
, /* sq_length */
7871 PyUnicode_Concat
, /* sq_concat */
7872 (ssizeargfunc
) unicode_repeat
, /* sq_repeat */
7873 (ssizeargfunc
) unicode_getitem
, /* sq_item */
7874 (ssizessizeargfunc
) unicode_slice
, /* sq_slice */
7875 0, /* sq_ass_item */
7876 0, /* sq_ass_slice */
7877 PyUnicode_Contains
, /* sq_contains */
7881 unicode_subscript(PyUnicodeObject
* self
, PyObject
* item
)
7883 if (PyIndex_Check(item
)) {
7884 Py_ssize_t i
= PyNumber_AsSsize_t(item
, PyExc_IndexError
);
7885 if (i
== -1 && PyErr_Occurred())
7888 i
+= PyUnicode_GET_SIZE(self
);
7889 return unicode_getitem(self
, i
);
7890 } else if (PySlice_Check(item
)) {
7891 Py_ssize_t start
, stop
, step
, slicelength
, cur
, i
;
7892 Py_UNICODE
* source_buf
;
7893 Py_UNICODE
* result_buf
;
7896 if (PySlice_GetIndicesEx((PySliceObject
*)item
, PyUnicode_GET_SIZE(self
),
7897 &start
, &stop
, &step
, &slicelength
) < 0) {
7901 if (slicelength
<= 0) {
7902 return PyUnicode_FromUnicode(NULL
, 0);
7903 } else if (start
== 0 && step
== 1 && slicelength
== self
->length
&&
7904 PyUnicode_CheckExact(self
)) {
7906 return (PyObject
*)self
;
7907 } else if (step
== 1) {
7908 return PyUnicode_FromUnicode(self
->str
+ start
, slicelength
);
7910 source_buf
= PyUnicode_AS_UNICODE((PyObject
*)self
);
7911 result_buf
= (Py_UNICODE
*)PyObject_MALLOC(slicelength
*
7912 sizeof(Py_UNICODE
));
7914 if (result_buf
== NULL
)
7915 return PyErr_NoMemory();
7917 for (cur
= start
, i
= 0; i
< slicelength
; cur
+= step
, i
++) {
7918 result_buf
[i
] = source_buf
[cur
];
7921 result
= PyUnicode_FromUnicode(result_buf
, slicelength
);
7922 PyObject_FREE(result_buf
);
7926 PyErr_SetString(PyExc_TypeError
, "string indices must be integers");
7931 static PyMappingMethods unicode_as_mapping
= {
7932 (lenfunc
)unicode_length
, /* mp_length */
7933 (binaryfunc
)unicode_subscript
, /* mp_subscript */
7934 (objobjargproc
)0, /* mp_ass_subscript */
7938 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
7943 PyErr_SetString(PyExc_SystemError
,
7944 "accessing non-existent unicode segment");
7947 *ptr
= (void *) self
->str
;
7948 return PyUnicode_GET_DATA_SIZE(self
);
7952 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, Py_ssize_t index
,
7955 PyErr_SetString(PyExc_TypeError
,
7956 "cannot use unicode as modifiable buffer");
7961 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
7965 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
7970 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
7977 PyErr_SetString(PyExc_SystemError
,
7978 "accessing non-existent unicode segment");
7981 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
7984 *ptr
= (void *) PyString_AS_STRING(str
);
7985 return PyString_GET_SIZE(str
);
7988 /* Helpers for PyUnicode_Format() */
7991 getnextarg(PyObject
*args
, Py_ssize_t arglen
, Py_ssize_t
*p_argidx
)
7993 Py_ssize_t argidx
= *p_argidx
;
7994 if (argidx
< arglen
) {
7999 return PyTuple_GetItem(args
, argidx
);
8001 PyErr_SetString(PyExc_TypeError
,
8002 "not enough arguments for format string");
/* Conversion flags collected by PyUnicode_Format(); one bit each. */
#define F_LJUST (1<<0)          /* '-' */
#define F_SIGN  (1<<1)          /* '+' */
#define F_BLANK (1<<2)          /* ' ' */
#define F_ALT   (1<<3)          /* '#' */
#define F_ZERO  (1<<4)          /* '0' */
8013 strtounicode(Py_UNICODE
*buffer
, const char *charbuffer
)
8015 register Py_ssize_t i
;
8016 Py_ssize_t len
= strlen(charbuffer
);
8017 for (i
= len
- 1; i
>= 0; i
--)
8018 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
8024 longtounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, long x
)
8028 PyOS_snprintf((char *)buffer
, len
, format
, x
);
8029 result
= strtounicode(buffer
, (char *)buffer
);
8030 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
8033 /* XXX To save some code duplication, formatfloat/long/int could have been
8034 shared with stringobject.c, converting from 8-bit to Unicode after the
8035 formatting is done. */
8037 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8040 formatfloat(PyObject
*v
, int flags
, int prec
, int type
)
8046 x
= PyFloat_AsDouble(v
);
8047 if (x
== -1.0 && PyErr_Occurred())
8053 p
= PyOS_double_to_string(x
, type
, prec
,
8054 (flags
& F_ALT
) ? Py_DTSF_ALT
: 0, NULL
);
8057 result
= PyUnicode_FromStringAndSize(p
, strlen(p
));
8063 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
8067 PyObject
*str
; /* temporary string object. */
8068 PyUnicodeObject
*result
;
8070 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
8073 result
= _PyUnicode_New(len
);
8078 for (i
= 0; i
< len
; i
++)
8079 result
->str
[i
] = buf
[i
];
8080 result
->str
[len
] = 0;
8082 return (PyObject
*)result
;
8086 formatint(Py_UNICODE
*buf
,
8093 /* fmt = '%#.' + `prec` + 'l' + `type`
8094 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8098 char fmt
[64]; /* plenty big enough! */
8102 x
= PyInt_AsLong(v
);
8103 if (x
== -1 && PyErr_Occurred())
8105 if (x
< 0 && type
== 'u') {
8108 if (x
< 0 && (type
== 'x' || type
== 'X' || type
== 'o'))
8115 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8116 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8118 if (buflen
<= 14 || buflen
<= (size_t)3 + (size_t)prec
) {
8119 PyErr_SetString(PyExc_OverflowError
,
8120 "formatted integer is too long (precision too large?)");
8124 if ((flags
& F_ALT
) &&
8125 (type
== 'x' || type
== 'X')) {
8126 /* When converting under %#x or %#X, there are a number
8127 * of issues that cause pain:
8128 * - when 0 is being converted, the C standard leaves off
8129 * the '0x' or '0X', which is inconsistent with other
8130 * %#x/%#X conversions and inconsistent with Python's
8132 * - there are platforms that violate the standard and
8133 * convert 0 with the '0x' or '0X'
8134 * (Metrowerks, Compaq Tru64)
8135 * - there are platforms that give '0x' when converting
8136 * under %#X, but convert 0 in accordance with the
8137 * standard (OS/2 EMX)
8139 * We can achieve the desired consistency by inserting our
8140 * own '0x' or '0X' prefix, and substituting %x/%X in place
8143 * Note that this is the same approach as used in
8144 * formatint() in stringobject.c
8146 PyOS_snprintf(fmt
, sizeof(fmt
), "%s0%c%%.%dl%c",
8147 sign
, type
, prec
, type
);
8150 PyOS_snprintf(fmt
, sizeof(fmt
), "%s%%%s.%dl%c",
8151 sign
, (flags
&F_ALT
) ? "#" : "",
8155 return longtounicode(buf
, buflen
, fmt
, -x
);
8157 return longtounicode(buf
, buflen
, fmt
, x
);
8161 formatchar(Py_UNICODE
*buf
,
8167 /* presume that the buffer is at least 2 characters long */
8168 if (PyUnicode_Check(v
)) {
8169 if (PyUnicode_GET_SIZE(v
) != 1)
8171 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
8174 else if (PyString_Check(v
)) {
8175 if (PyString_GET_SIZE(v
) != 1)
8177 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8178 with a UnicodeDecodeError if 'char' is not decodable with the
8179 default encoding (usually ASCII, but it might be something else) */
8180 str
= PyString_AS_STRING(v
);
8181 if ((unsigned char)str
[0] > 0x7F) {
8182 /* the char is not ASCII; try to decode the string using the
8183 default encoding and return -1 to let the UnicodeDecodeError
8184 be raised if the string can't be decoded */
8185 unistr
= PyUnicode_Decode(str
, 1, NULL
, "strict");
8188 buf
[0] = PyUnicode_AS_UNICODE(unistr
)[0];
8192 buf
[0] = (Py_UNICODE
)str
[0];
8196 /* Integer input truncated to a character */
8198 x
= PyInt_AsLong(v
);
8199 if (x
== -1 && PyErr_Occurred())
8201 #ifdef Py_UNICODE_WIDE
8202 if (x
< 0 || x
> 0x10ffff) {
8203 PyErr_SetString(PyExc_OverflowError
,
8204 "%c arg not in range(0x110000) "
8205 "(wide Python build)");
8209 if (x
< 0 || x
> 0xffff) {
8210 PyErr_SetString(PyExc_OverflowError
,
8211 "%c arg not in range(0x10000) "
8212 "(narrow Python build)");
8216 buf
[0] = (Py_UNICODE
) x
;
8222 PyErr_SetString(PyExc_TypeError
,
8223 "%c requires int or char");
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
8237 PyObject
*PyUnicode_Format(PyObject
*format
,
8240 Py_UNICODE
*fmt
, *res
;
8241 Py_ssize_t fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
8243 PyUnicodeObject
*result
= NULL
;
8244 PyObject
*dict
= NULL
;
8247 if (format
== NULL
|| args
== NULL
) {
8248 PyErr_BadInternalCall();
8251 uformat
= PyUnicode_FromObject(format
);
8252 if (uformat
== NULL
)
8254 fmt
= PyUnicode_AS_UNICODE(uformat
);
8255 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
8257 reslen
= rescnt
= fmtcnt
+ 100;
8258 result
= _PyUnicode_New(reslen
);
8261 res
= PyUnicode_AS_UNICODE(result
);
8263 if (PyTuple_Check(args
)) {
8264 arglen
= PyTuple_Size(args
);
8271 if (Py_TYPE(args
)->tp_as_mapping
&& !PyTuple_Check(args
) &&
8272 !PyObject_TypeCheck(args
, &PyBaseString_Type
))
8275 while (--fmtcnt
>= 0) {
8278 rescnt
= fmtcnt
+ 100;
8280 if (_PyUnicode_Resize(&result
, reslen
) < 0)
8282 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
8288 /* Got a format specifier */
8290 Py_ssize_t width
= -1;
8292 Py_UNICODE c
= '\0';
8296 PyObject
*temp
= NULL
;
8300 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{int,char}() */
8304 Py_UNICODE
*keystart
;
8310 PyErr_SetString(PyExc_TypeError
,
8311 "format requires a mapping");
8317 /* Skip over balanced parentheses */
8318 while (pcount
> 0 && --fmtcnt
>= 0) {
8321 else if (*fmt
== '(')
8325 keylen
= fmt
- keystart
- 1;
8326 if (fmtcnt
< 0 || pcount
> 0) {
8327 PyErr_SetString(PyExc_ValueError
,
8328 "incomplete format key");
8332 /* keys are converted to strings using UTF-8 and
8333 then looked up since Python uses strings to hold
8334 variables names etc. in its namespaces and we
8335 wouldn't want to break common idioms. */
8336 key
= PyUnicode_EncodeUTF8(keystart
,
8340 key
= PyUnicode_FromUnicode(keystart
, keylen
);
8348 args
= PyObject_GetItem(dict
, key
);
8357 while (--fmtcnt
>= 0) {
8358 switch (c
= *fmt
++) {
8359 case '-': flags
|= F_LJUST
; continue;
8360 case '+': flags
|= F_SIGN
; continue;
8361 case ' ': flags
|= F_BLANK
; continue;
8362 case '#': flags
|= F_ALT
; continue;
8363 case '0': flags
|= F_ZERO
; continue;
8368 v
= getnextarg(args
, arglen
, &argidx
);
8371 if (!PyInt_Check(v
)) {
8372 PyErr_SetString(PyExc_TypeError
,
8376 width
= PyInt_AsLong(v
);
8384 else if (c
>= '0' && c
<= '9') {
8386 while (--fmtcnt
>= 0) {
8388 if (c
< '0' || c
> '9')
8390 if ((width
*10) / 10 != width
) {
8391 PyErr_SetString(PyExc_ValueError
,
8395 width
= width
*10 + (c
- '0');
8403 v
= getnextarg(args
, arglen
, &argidx
);
8406 if (!PyInt_Check(v
)) {
8407 PyErr_SetString(PyExc_TypeError
,
8411 prec
= PyInt_AsLong(v
);
8417 else if (c
>= '0' && c
<= '9') {
8419 while (--fmtcnt
>= 0) {
8420 c
= Py_CHARMASK(*fmt
++);
8421 if (c
< '0' || c
> '9')
8423 if ((prec
*10) / 10 != prec
) {
8424 PyErr_SetString(PyExc_ValueError
,
8428 prec
= prec
*10 + (c
- '0');
8433 if (c
== 'h' || c
== 'l' || c
== 'L') {
8439 PyErr_SetString(PyExc_ValueError
,
8440 "incomplete format");
8444 v
= getnextarg(args
, arglen
, &argidx
);
8454 /* presume that buffer length is at least 1 */
8461 if (PyUnicode_CheckExact(v
) && c
== 's') {
8468 temp
= PyObject_Unicode(v
);
8470 temp
= PyObject_Repr(v
);
8473 if (PyUnicode_Check(temp
))
8474 /* nothing to do */;
8475 else if (PyString_Check(temp
)) {
8476 /* convert to string to Unicode */
8477 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
8478 PyString_GET_SIZE(temp
),
8488 PyErr_SetString(PyExc_TypeError
,
8489 "%s argument has non-string str()");
8493 pbuf
= PyUnicode_AS_UNICODE(temp
);
8494 len
= PyUnicode_GET_SIZE(temp
);
8495 if (prec
>= 0 && len
> prec
)
8508 if (PyNumber_Check(v
)) {
8509 PyObject
*iobj
=NULL
;
8511 if (PyInt_Check(v
) || (PyLong_Check(v
))) {
8516 iobj
= PyNumber_Int(v
);
8517 if (iobj
==NULL
) iobj
= PyNumber_Long(v
);
8520 if (PyInt_Check(iobj
)) {
8523 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
8524 flags
, prec
, c
, iobj
);
8530 else if (PyLong_Check(iobj
)) {
8532 temp
= formatlong(iobj
, flags
, prec
, c
);
8536 pbuf
= PyUnicode_AS_UNICODE(temp
);
8537 len
= PyUnicode_GET_SIZE(temp
);
8546 PyErr_Format(PyExc_TypeError
,
8547 "%%%c format: a number is required, "
8548 "not %.200s", (char)c
, Py_TYPE(v
)->tp_name
);
8561 temp
= formatfloat(v
, flags
, prec
, c
);
8564 pbuf
= PyUnicode_AS_UNICODE(temp
);
8565 len
= PyUnicode_GET_SIZE(temp
);
8573 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
8579 PyErr_Format(PyExc_ValueError
,
8580 "unsupported format character '%c' (0x%x) "
8582 (31<=c
&& c
<=126) ? (char)c
: '?',
8584 (Py_ssize_t
)(fmt
- 1 -
8585 PyUnicode_AS_UNICODE(uformat
)));
8589 if (*pbuf
== '-' || *pbuf
== '+') {
8593 else if (flags
& F_SIGN
)
8595 else if (flags
& F_BLANK
)
8602 if (rescnt
- (sign
!= 0) < width
) {
8604 rescnt
= width
+ fmtcnt
+ 100;
8611 if (_PyUnicode_Resize(&result
, reslen
) < 0) {
8615 res
= PyUnicode_AS_UNICODE(result
)
8625 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8626 assert(pbuf
[0] == '0');
8627 assert(pbuf
[1] == c
);
8638 if (width
> len
&& !(flags
& F_LJUST
)) {
8642 } while (--width
> len
);
8647 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8648 assert(pbuf
[0] == '0');
8649 assert(pbuf
[1] == c
);
8654 Py_UNICODE_COPY(res
, pbuf
, len
);
8657 while (--width
>= len
) {
8661 if (dict
&& (argidx
< arglen
) && c
!= '%') {
8662 PyErr_SetString(PyExc_TypeError
,
8663 "not all arguments converted during string formatting");
8670 if (argidx
< arglen
&& !dict
) {
8671 PyErr_SetString(PyExc_TypeError
,
8672 "not all arguments converted during string formatting");
8676 if (_PyUnicode_Resize(&result
, reslen
- rescnt
) < 0)
8682 return (PyObject
*)result
;
8693 static PyBufferProcs unicode_as_buffer
= {
8694 (readbufferproc
) unicode_buffer_getreadbuf
,
8695 (writebufferproc
) unicode_buffer_getwritebuf
,
8696 (segcountproc
) unicode_buffer_getsegcount
,
8697 (charbufferproc
) unicode_buffer_getcharbuf
,
8701 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
8704 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8707 static char *kwlist
[] = {"string", "encoding", "errors", 0};
8708 char *encoding
= NULL
;
8709 char *errors
= NULL
;
8711 if (type
!= &PyUnicode_Type
)
8712 return unicode_subtype_new(type
, args
, kwds
);
8713 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
8714 kwlist
, &x
, &encoding
, &errors
))
8717 return (PyObject
*)_PyUnicode_New(0);
8718 if (encoding
== NULL
&& errors
== NULL
)
8719 return PyObject_Unicode(x
);
8721 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
8725 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8727 PyUnicodeObject
*tmp
, *pnew
;
8730 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
8731 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
8734 assert(PyUnicode_Check(tmp
));
8735 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
8740 pnew
->str
= (Py_UNICODE
*) PyObject_MALLOC(sizeof(Py_UNICODE
) * (n
+1));
8741 if (pnew
->str
== NULL
) {
8742 _Py_ForgetReference((PyObject
*)pnew
);
8745 return PyErr_NoMemory();
8747 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
8749 pnew
->hash
= tmp
->hash
;
8751 return (PyObject
*)pnew
;
8754 PyDoc_STRVAR(unicode_doc
,
8755 "unicode(string [, encoding[, errors]]) -> object\n\
8757 Create a new Unicode object from the given encoded string.\n\
8758 encoding defaults to the current default string encoding.\n\
8759 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8761 PyTypeObject PyUnicode_Type
= {
8762 PyVarObject_HEAD_INIT(&PyType_Type
, 0)
8763 "unicode", /* tp_name */
8764 sizeof(PyUnicodeObject
), /* tp_size */
8765 0, /* tp_itemsize */
8767 (destructor
)unicode_dealloc
, /* tp_dealloc */
8772 unicode_repr
, /* tp_repr */
8773 &unicode_as_number
, /* tp_as_number */
8774 &unicode_as_sequence
, /* tp_as_sequence */
8775 &unicode_as_mapping
, /* tp_as_mapping */
8776 (hashfunc
) unicode_hash
, /* tp_hash*/
8778 (reprfunc
) unicode_str
, /* tp_str */
8779 PyObject_GenericGetAttr
, /* tp_getattro */
8780 0, /* tp_setattro */
8781 &unicode_as_buffer
, /* tp_as_buffer */
8782 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_CHECKTYPES
|
8783 Py_TPFLAGS_BASETYPE
| Py_TPFLAGS_UNICODE_SUBCLASS
, /* tp_flags */
8784 unicode_doc
, /* tp_doc */
8785 0, /* tp_traverse */
8787 PyUnicode_RichCompare
, /* tp_richcompare */
8788 0, /* tp_weaklistoffset */
8790 0, /* tp_iternext */
8791 unicode_methods
, /* tp_methods */
8794 &PyBaseString_Type
, /* tp_base */
8796 0, /* tp_descr_get */
8797 0, /* tp_descr_set */
8798 0, /* tp_dictoffset */
8801 unicode_new
, /* tp_new */
8802 PyObject_Del
, /* tp_free */
8805 /* Initialize the Unicode implementation */
8807 void _PyUnicode_Init(void)
8811 /* XXX - move this array to unicodectype.c ? */
8812 Py_UNICODE linebreak
[] = {
8813 0x000A, /* LINE FEED */
8814 0x000D, /* CARRIAGE RETURN */
8815 0x001C, /* FILE SEPARATOR */
8816 0x001D, /* GROUP SEPARATOR */
8817 0x001E, /* RECORD SEPARATOR */
8818 0x0085, /* NEXT LINE */
8819 0x2028, /* LINE SEPARATOR */
8820 0x2029, /* PARAGRAPH SEPARATOR */
8823 /* Init the implementation */
8826 unicode_empty
= _PyUnicode_New(0);
8830 strcpy(unicode_default_encoding
, "ascii");
8831 for (i
= 0; i
< 256; i
++)
8832 unicode_latin1
[i
] = NULL
;
8833 if (PyType_Ready(&PyUnicode_Type
) < 0)
8834 Py_FatalError("Can't initialize 'unicode'");
8836 /* initialize the linebreak bloom filter */
8837 bloom_linebreak
= make_bloom_mask(
8838 linebreak
, sizeof(linebreak
) / sizeof(linebreak
[0])
8841 PyType_Ready(&EncodingMapType
);
8844 /* Finalize the Unicode implementation */
8847 PyUnicode_ClearFreeList(void)
8849 int freelist_size
= numfree
;
8852 for (u
= free_list
; u
!= NULL
;) {
8853 PyUnicodeObject
*v
= u
;
8854 u
= *(PyUnicodeObject
**)u
;
8856 PyObject_DEL(v
->str
);
8857 Py_XDECREF(v
->defenc
);
8862 assert(numfree
== 0);
8863 return freelist_size
;
8867 _PyUnicode_Fini(void)
8871 Py_XDECREF(unicode_empty
);
8872 unicode_empty
= NULL
;
8874 for (i
= 0; i
< 256; i
++) {
8875 if (unicode_latin1
[i
]) {
8876 Py_DECREF(unicode_latin1
[i
]);
8877 unicode_latin1
[i
] = NULL
;
8880 (void)PyUnicode_ClearFreeList();