3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
45 #include "unicodeobject.h"
52 /* Limit for the Unicode object free list */
54 #define PyUnicode_MAXFREELIST 1024
56 /* Limit for the Unicode object free list stay alive optimization.
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
62 At worst this will result in PyUnicode_MAXFREELIST *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
66 Setting the limit to 0 effectively turns the feature off.
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
80 # define BYTEORDER_IS_LITTLE_ENDIAN
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
95 /* Free list for Unicode objects */
96 static PyUnicodeObject
*free_list
;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject
*unicode_empty
;
102 /* Single character Unicode strings in the Latin-1 range are being
104 static PyUnicodeObject
*unicode_latin1
[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding
[100];
115 /* Fast detection of the most frequent whitespace characters */
116 const unsigned char _Py_ascii_whitespace
[] = {
117 0, 0, 0, 0, 0, 0, 0, 0,
118 // case 0x0009: /* HORIZONTAL TABULATION */
119 // case 0x000A: /* LINE FEED */
120 // case 0x000B: /* VERTICAL TABULATION */
121 // case 0x000C: /* FORM FEED */
122 // case 0x000D: /* CARRIAGE RETURN */
123 0, 1, 1, 1, 1, 1, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0,
125 // case 0x001C: /* FILE SEPARATOR */
126 // case 0x001D: /* GROUP SEPARATOR */
127 // case 0x001E: /* RECORD SEPARATOR */
128 // case 0x001F: /* UNIT SEPARATOR */
129 0, 0, 0, 0, 1, 1, 1, 1,
130 // case 0x0020: /* SPACE */
131 1, 0, 0, 0, 0, 0, 0, 0,
132 0, 0, 0, 0, 0, 0, 0, 0,
133 0, 0, 0, 0, 0, 0, 0, 0,
134 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, 0, 0, 0, 0,
137 0, 0, 0, 0, 0, 0, 0, 0,
138 0, 0, 0, 0, 0, 0, 0, 0,
139 0, 0, 0, 0, 0, 0, 0, 0,
140 0, 0, 0, 0, 0, 0, 0, 0,
141 0, 0, 0, 0, 0, 0, 0, 0,
142 0, 0, 0, 0, 0, 0, 0, 0,
143 0, 0, 0, 0, 0, 0, 0, 0
146 /* Same for linebreaks */
147 static unsigned char ascii_linebreak
[] = {
148 0, 0, 0, 0, 0, 0, 0, 0,
149 // 0x000A, /* LINE FEED */
150 // 0x000D, /* CARRIAGE RETURN */
151 0, 0, 1, 0, 0, 1, 0, 0,
152 0, 0, 0, 0, 0, 0, 0, 0,
153 // 0x001C, /* FILE SEPARATOR */
154 // 0x001D, /* GROUP SEPARATOR */
155 // 0x001E, /* RECORD SEPARATOR */
156 0, 0, 0, 0, 1, 1, 1, 0,
157 0, 0, 0, 0, 0, 0, 0, 0,
158 0, 0, 0, 0, 0, 0, 0, 0,
159 0, 0, 0, 0, 0, 0, 0, 0,
160 0, 0, 0, 0, 0, 0, 0, 0,
162 0, 0, 0, 0, 0, 0, 0, 0,
163 0, 0, 0, 0, 0, 0, 0, 0,
164 0, 0, 0, 0, 0, 0, 0, 0,
165 0, 0, 0, 0, 0, 0, 0, 0,
166 0, 0, 0, 0, 0, 0, 0, 0,
167 0, 0, 0, 0, 0, 0, 0, 0,
168 0, 0, 0, 0, 0, 0, 0, 0,
169 0, 0, 0, 0, 0, 0, 0, 0
174 PyUnicode_GetMax(void)
176 #ifdef Py_UNICODE_WIDE
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
185 /* --- Bloom Filters ----------------------------------------------------- */
187 /* Stuff to implement simple "bloom filters" for Unicode characters.
188 To keep things simple, we use a single bitmask, using the 5 least
189 significant bits of each Unicode character as the bit index. */
191 /* the linebreak mask is set up by Unicode_Init below */
/* Integer type that holds a bloom bitmask (one bit per possible value of
   the 5 least significant bits of a character). */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* Nonzero when the bit selected by the 5 least significant bits of `ch`
   is set in `mask`.  The shifted constant is an unsigned long so that it
   has the same type as BLOOM_MASK and so that selecting bit 31 does not
   overflow a signed int; `mask` is parenthesized so that an expression
   argument is evaluated as a whole. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line-break test: characters below 128 are looked up directly in the
   ascii_linebreak table; for all others the bloom mask is consulted first
   and Py_UNICODE_ISLINEBREAK() is only evaluated when the mask bit is set.
   Evaluates `ch` more than once. */
#define BLOOM_LINEBREAK(ch) \
    ((ch) < 128U ? ascii_linebreak[(ch)] : \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
203 Py_LOCAL_INLINE(BLOOM_MASK
) make_bloom_mask(Py_UNICODE
* ptr
, Py_ssize_t len
)
205 /* calculate simple bloom-style bitmask for a given unicode string */
211 for (i
= 0; i
< len
; i
++)
212 mask
|= (1 << (ptr
[i
] & 0x1F));
217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr
, Py_UNICODE
* set
, Py_ssize_t setlen
)
221 for (i
= 0; i
< setlen
; i
++)
/* Nonzero when BLOOM(mask, chr) is nonzero and unicode_member(chr, set,
   setlen) also returns nonzero; unicode_member() is only called when the
   bloom test passes.  The whole expansion is parenthesized so the macro
   can be negated or combined with other operators without the `&&`
   re-associating with its surroundings. */
#define BLOOM_MEMBER(mask, chr, set, setlen) \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
231 /* --- Unicode Object ----------------------------------------------------- */
234 int unicode_resize(register PyUnicodeObject
*unicode
,
239 /* Shortcut if there's nothing much to do. */
240 if (unicode
->length
== length
)
243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
247 if (unicode
== unicode_empty
||
248 (unicode
->length
== 1 &&
249 unicode
->str
[0] < 256U &&
250 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
251 PyErr_SetString(PyExc_SystemError
,
252 "can't resize shared unicode objects");
256 /* We allocate one more Py_UNICODE element to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
258 safe to look at str[length] (without making any assumptions about what
261 oldstr
= unicode
->str
;
262 unicode
->str
= PyObject_REALLOC(unicode
->str
,
263 sizeof(Py_UNICODE
) * (length
+ 1));
265 unicode
->str
= (Py_UNICODE
*)oldstr
;
269 unicode
->str
[length
] = 0;
270 unicode
->length
= length
;
273 /* Reset the object caches */
274 if (unicode
->defenc
) {
275 Py_DECREF(unicode
->defenc
);
276 unicode
->defenc
= NULL
;
283 /* We allocate one more Py_UNICODE element to make sure the string is
284 Ux0000 terminated -- XXX is this needed ?
286 XXX This allocator could further be enhanced by assuring that the
287 free list never reduces its size below 1.
292 PyUnicodeObject
*_PyUnicode_New(Py_ssize_t length
)
294 register PyUnicodeObject
*unicode
;
296 /* Optimization for empty strings */
297 if (length
== 0 && unicode_empty
!= NULL
) {
298 Py_INCREF(unicode_empty
);
299 return unicode_empty
;
302 /* Unicode freelist & memory allocation */
305 free_list
= *(PyUnicodeObject
**)unicode
;
308 /* Keep-Alive optimization: we only upsize the buffer,
309 never downsize it. */
310 if ((unicode
->length
< length
) &&
311 unicode_resize(unicode
, length
) < 0) {
312 PyObject_DEL(unicode
->str
);
317 size_t new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
318 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
320 PyObject_INIT(unicode
, &PyUnicode_Type
);
324 unicode
= PyObject_New(PyUnicodeObject
, &PyUnicode_Type
);
327 new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
328 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
335 /* Initialize the first element to guard against cases where
336 * the caller fails before initializing str -- unicode_resize()
337 * reads str[0], and the Keep-Alive optimization can keep memory
338 * allocated for str alive across a call to unicode_dealloc(unicode).
339 * We don't want unicode_resize to read uninitialized memory in
343 unicode
->str
[length
] = 0;
344 unicode
->length
= length
;
346 unicode
->defenc
= NULL
;
350 _Py_ForgetReference((PyObject
*)unicode
);
351 PyObject_Del(unicode
);
356 void unicode_dealloc(register PyUnicodeObject
*unicode
)
358 if (PyUnicode_CheckExact(unicode
) &&
359 numfree
< PyUnicode_MAXFREELIST
) {
360 /* Keep-Alive optimization */
361 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
362 PyObject_DEL(unicode
->str
);
366 if (unicode
->defenc
) {
367 Py_DECREF(unicode
->defenc
);
368 unicode
->defenc
= NULL
;
370 /* Add to free list */
371 *(PyUnicodeObject
**)unicode
= free_list
;
376 PyObject_DEL(unicode
->str
);
377 Py_XDECREF(unicode
->defenc
);
378 Py_TYPE(unicode
)->tp_free((PyObject
*)unicode
);
382 int PyUnicode_Resize(PyObject
**unicode
, Py_ssize_t length
)
384 register PyUnicodeObject
*v
;
386 /* Argument checks */
387 if (unicode
== NULL
) {
388 PyErr_BadInternalCall();
391 v
= (PyUnicodeObject
*)*unicode
;
392 if (v
== NULL
|| !PyUnicode_Check(v
) || Py_REFCNT(v
) != 1 || length
< 0) {
393 PyErr_BadInternalCall();
397 /* Resizing unicode_empty and single character objects is not
398 possible since these are being shared. We simply return a fresh
399 copy with the same Unicode content. */
400 if (v
->length
!= length
&&
401 (v
== unicode_empty
|| v
->length
== 1)) {
402 PyUnicodeObject
*w
= _PyUnicode_New(length
);
405 Py_UNICODE_COPY(w
->str
, v
->str
,
406 length
< v
->length
? length
: v
->length
);
408 *unicode
= (PyObject
*)w
;
412 /* Note that we don't have to modify *unicode for unshared Unicode
413 objects, since we can modify them in-place. */
414 return unicode_resize(v
, length
);
/* File-private shorthand: casts `unicodevar` to PyObject ** and forwards
   it, together with `length`, to PyUnicode_Resize().  Not meant to be used
   outside unicodeobject.c. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
421 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
424 PyUnicodeObject
*unicode
;
426 /* If the Unicode data is known at construction time, we can apply
427 some optimizations which share commonly used objects. */
430 /* Optimization for empty strings */
431 if (size
== 0 && unicode_empty
!= NULL
) {
432 Py_INCREF(unicode_empty
);
433 return (PyObject
*)unicode_empty
;
436 /* Single character Unicode objects in the Latin-1 range are
437 shared when using this constructor */
438 if (size
== 1 && *u
< 256) {
439 unicode
= unicode_latin1
[*u
];
441 unicode
= _PyUnicode_New(1);
444 unicode
->str
[0] = *u
;
445 unicode_latin1
[*u
] = unicode
;
448 return (PyObject
*)unicode
;
452 unicode
= _PyUnicode_New(size
);
456 /* Copy the Unicode data into the new object */
458 Py_UNICODE_COPY(unicode
->str
, u
, size
);
460 return (PyObject
*)unicode
;
463 PyObject
*PyUnicode_FromStringAndSize(const char *u
, Py_ssize_t size
)
465 PyUnicodeObject
*unicode
;
468 PyErr_SetString(PyExc_SystemError
,
469 "Negative size passed to PyUnicode_FromStringAndSize");
473 /* If the Unicode data is known at construction time, we can apply
474 some optimizations which share commonly used objects.
475 Also, this means the input must be UTF-8, so fall back to the
476 UTF-8 decoder at the end. */
479 /* Optimization for empty strings */
480 if (size
== 0 && unicode_empty
!= NULL
) {
481 Py_INCREF(unicode_empty
);
482 return (PyObject
*)unicode_empty
;
485 /* Single characters are shared when using this constructor.
486 Restrict to ASCII, since the input must be UTF-8. */
487 if (size
== 1 && Py_CHARMASK(*u
) < 128) {
488 unicode
= unicode_latin1
[Py_CHARMASK(*u
)];
490 unicode
= _PyUnicode_New(1);
493 unicode
->str
[0] = Py_CHARMASK(*u
);
494 unicode_latin1
[Py_CHARMASK(*u
)] = unicode
;
497 return (PyObject
*)unicode
;
500 return PyUnicode_DecodeUTF8(u
, size
, NULL
);
503 unicode
= _PyUnicode_New(size
);
507 return (PyObject
*)unicode
;
510 PyObject
*PyUnicode_FromString(const char *u
)
512 size_t size
= strlen(u
);
513 if (size
> PY_SSIZE_T_MAX
) {
514 PyErr_SetString(PyExc_OverflowError
, "input too long");
518 return PyUnicode_FromStringAndSize(u
, size
);
523 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
526 PyUnicodeObject
*unicode
;
529 PyErr_BadInternalCall();
533 unicode
= _PyUnicode_New(size
);
537 /* Copy the wchar_t data into the new object */
538 #ifdef HAVE_USABLE_WCHAR_T
539 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
542 register Py_UNICODE
*u
;
543 register Py_ssize_t i
;
544 u
= PyUnicode_AS_UNICODE(unicode
);
545 for (i
= size
; i
> 0; i
--)
550 return (PyObject
*)unicode
;
554 makefmt(char *fmt
, int longflag
, int size_tflag
, int zeropad
, int width
, int precision
, char c
)
560 fmt
+= sprintf(fmt
, "%d", width
);
563 fmt
+= sprintf(fmt
, ".%d", precision
);
566 else if (size_tflag
) {
567 char *f
= PY_FORMAT_SIZE_T
;
/* Append the NUL-terminated char string `string` at the output cursor,
   one output element per input char.  Relies on two variables being in
   scope at the expansion site: `copy` (scratch read pointer) and `s`
   (write cursor, left pointing just past the copied characters).
   Wrapped in do { } while (0) so that `appendstring(x);` is exactly one
   statement, including inside an unbraced if/else. */
#define appendstring(string) \
    do { \
        for (copy = (string); *copy;) \
            *s++ = *copy++; \
    } while (0)
578 PyUnicode_FromFormatV(const char *format
, va_list vargs
)
581 Py_ssize_t callcount
= 0;
582 PyObject
**callresults
= NULL
;
583 PyObject
**callresult
= NULL
;
591 /* used by sprintf */
593 /* use abuffer instead of buffer, if we need more space
594 * (which can happen if there's a format specifier with width). */
595 char *abuffer
= NULL
;
597 Py_ssize_t abuffersize
= 0;
598 char fmt
[60]; /* should be enough for %0width.precisionld */
601 #ifdef VA_LIST_IS_ARRAY
602 Py_MEMCPY(count
, vargs
, sizeof(va_list));
605 __va_copy(count
, vargs
);
610 /* step 1: count the number of %S/%R format specifications
611 * (we call PyObject_Str()/PyObject_Repr() for these objects
612 * once during step 3 and put the result in an array) */
613 for (f
= format
; *f
; f
++) {
614 if (*f
== '%' && (*(f
+1)=='S' || *(f
+1)=='R'))
617 /* step 2: allocate memory for the results of
618 * PyObject_Str()/PyObject_Repr() calls */
620 callresults
= PyObject_Malloc(sizeof(PyObject
*)*callcount
);
625 callresult
= callresults
;
627 /* step 3: figure out how large a buffer we need */
628 for (f
= format
; *f
; f
++) {
632 while (isdigit((unsigned)*f
))
633 width
= (width
*10) + *f
++ - '0';
634 while (*++f
&& *f
!= '%' && !isalpha((unsigned)*f
))
637 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
638 * they don't affect the amount of space we reserve.
640 if ((*f
== 'l' || *f
== 'z') &&
641 (f
[1] == 'd' || f
[1] == 'u'))
646 (void)va_arg(count
, int);
647 /* fall through... */
651 case 'd': case 'u': case 'i': case 'x':
652 (void) va_arg(count
, int);
653 /* 20 bytes is enough to hold a 64-bit
654 integer. Decimal takes the most space.
655 This isn't enough for octal.
656 If a width is specified we need more
657 (which we allocate later). */
661 if (abuffersize
< width
)
668 s
= va_arg(count
, unsigned char*);
672 } else if (*s
< 0xc0) {
675 } else if (*s
< 0xc0) {
679 } else if (*s
< 0xe0) {
685 #ifdef Py_UNICODE_WIDE
700 PyObject
*obj
= va_arg(count
, PyObject
*);
701 assert(obj
&& PyUnicode_Check(obj
));
702 n
+= PyUnicode_GET_SIZE(obj
);
707 PyObject
*obj
= va_arg(count
, PyObject
*);
708 const char *str
= va_arg(count
, const char *);
710 assert(!obj
|| PyUnicode_Check(obj
));
712 n
+= PyUnicode_GET_SIZE(obj
);
719 PyObject
*obj
= va_arg(count
, PyObject
*);
722 str
= PyObject_Str(obj
);
725 n
+= PyUnicode_GET_SIZE(str
);
726 /* Remember the str and switch to the next slot */
732 PyObject
*obj
= va_arg(count
, PyObject
*);
735 repr
= PyObject_Repr(obj
);
738 n
+= PyUnicode_GET_SIZE(repr
);
739 /* Remember the repr and switch to the next slot */
740 *callresult
++ = repr
;
744 (void) va_arg(count
, int);
745 /* maximum 64-bit pointer representation:
747 * so 19 characters is enough.
748 * XXX I count 18 -- what's the extra for?
753 /* if we stumble upon an unknown
754 formatting code, copy the rest of
755 the format string to the output
756 string. (we cannot just skip the
757 code, since there's no way to know
758 what's in the argument list) */
766 if (abuffersize
> 20) {
767 abuffer
= PyObject_Malloc(abuffersize
);
772 realbuffer
= abuffer
;
776 /* step 4: fill the buffer */
777 /* Since we've analyzed how much space we need for the worst case,
778 we don't have to resize the string.
779 There can be no errors beyond this point. */
780 string
= PyUnicode_FromUnicode(NULL
, n
);
784 s
= PyUnicode_AS_UNICODE(string
);
785 callresult
= callresults
;
787 for (f
= format
; *f
; f
++) {
792 zeropad
= (*f
== '0');
793 /* parse the width.precision part */
795 while (isdigit((unsigned)*f
))
796 width
= (width
*10) + *f
++ - '0';
800 while (isdigit((unsigned)*f
))
801 precision
= (precision
*10) + *f
++ - '0';
803 /* handle the long flag, but only for %ld and %lu.
804 others can be added when necessary. */
805 if (*f
== 'l' && (f
[1] == 'd' || f
[1] == 'u')) {
809 /* handle the size_t flag. */
810 if (*f
== 'z' && (f
[1] == 'd' || f
[1] == 'u')) {
817 *s
++ = va_arg(vargs
, int);
820 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'd');
822 sprintf(realbuffer
, fmt
, va_arg(vargs
, long));
824 sprintf(realbuffer
, fmt
, va_arg(vargs
, Py_ssize_t
));
826 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
827 appendstring(realbuffer
);
830 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'u');
832 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned long));
834 sprintf(realbuffer
, fmt
, va_arg(vargs
, size_t));
836 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned int));
837 appendstring(realbuffer
);
840 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'i');
841 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
842 appendstring(realbuffer
);
845 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'x');
846 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
847 appendstring(realbuffer
);
851 /* Parameter must be UTF-8 encoded.
852 In case of encoding errors, use
853 the replacement character. */
855 p
= va_arg(vargs
, char*);
856 u
= PyUnicode_DecodeUTF8(p
, strlen(p
),
860 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(u
),
861 PyUnicode_GET_SIZE(u
));
862 s
+= PyUnicode_GET_SIZE(u
);
868 PyObject
*obj
= va_arg(vargs
, PyObject
*);
869 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
870 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
876 PyObject
*obj
= va_arg(vargs
, PyObject
*);
877 const char *str
= va_arg(vargs
, const char *);
879 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
880 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
893 /* unused, since we already have the result */
894 (void) va_arg(vargs
, PyObject
*);
895 ucopy
= PyUnicode_AS_UNICODE(*callresult
);
896 usize
= PyUnicode_GET_SIZE(*callresult
);
897 for (upos
= 0; upos
<usize
;)
898 *s
++ = ucopy
[upos
++];
899 /* We're done with the unicode()/repr() => forget it */
900 Py_DECREF(*callresult
);
901 /* switch to next unicode()/repr() result */
906 sprintf(buffer
, "%p", va_arg(vargs
, void*));
907 /* %p is ill-defined: ensure leading 0x. */
908 if (buffer
[1] == 'X')
910 else if (buffer
[1] != 'x') {
911 memmove(buffer
+2, buffer
, strlen(buffer
)+1);
915 appendstring(buffer
);
930 PyObject_Free(callresults
);
932 PyObject_Free(abuffer
);
933 _PyUnicode_Resize(&string
, s
- PyUnicode_AS_UNICODE(string
));
937 PyObject
**callresult2
= callresults
;
938 while (callresult2
< callresult
) {
939 Py_DECREF(*callresult2
);
942 PyObject_Free(callresults
);
945 PyObject_Free(abuffer
);
952 PyUnicode_FromFormat(const char *format
, ...)
957 #ifdef HAVE_STDARG_PROTOTYPES
958 va_start(vargs
, format
);
962 ret
= PyUnicode_FromFormatV(format
, vargs
);
967 Py_ssize_t
PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
971 if (unicode
== NULL
) {
972 PyErr_BadInternalCall();
976 /* If possible, try to copy the 0-termination as well */
977 if (size
> PyUnicode_GET_SIZE(unicode
))
978 size
= PyUnicode_GET_SIZE(unicode
) + 1;
980 #ifdef HAVE_USABLE_WCHAR_T
981 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
984 register Py_UNICODE
*u
;
985 register Py_ssize_t i
;
986 u
= PyUnicode_AS_UNICODE(unicode
);
987 for (i
= size
; i
> 0; i
--)
992 if (size
> PyUnicode_GET_SIZE(unicode
))
993 return PyUnicode_GET_SIZE(unicode
);
1000 PyObject
*PyUnicode_FromOrdinal(int ordinal
)
1004 #ifdef Py_UNICODE_WIDE
1005 if (ordinal
< 0 || ordinal
> 0x10ffff) {
1006 PyErr_SetString(PyExc_ValueError
,
1007 "unichr() arg not in range(0x110000) "
1008 "(wide Python build)");
1012 if (ordinal
< 0 || ordinal
> 0xffff) {
1013 PyErr_SetString(PyExc_ValueError
,
1014 "unichr() arg not in range(0x10000) "
1015 "(narrow Python build)");
1020 s
[0] = (Py_UNICODE
)ordinal
;
1021 return PyUnicode_FromUnicode(s
, 1);
1024 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
1026 /* XXX Perhaps we should make this API an alias of
1027 PyObject_Unicode() instead ?! */
1028 if (PyUnicode_CheckExact(obj
)) {
1032 if (PyUnicode_Check(obj
)) {
1033 /* For a Unicode subtype that's not a Unicode object,
1034 return a true Unicode object with the same data. */
1035 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
1036 PyUnicode_GET_SIZE(obj
));
1038 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
1041 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
1042 const char *encoding
,
1045 const char *s
= NULL
;
1050 PyErr_BadInternalCall();
1055 /* For b/w compatibility we also accept Unicode objects provided
1056 that no encoding is given and then redirect to
1057 PyObject_Unicode() which then applies the additional logic for
1060 NOTE: This API should really only be used for object which
1061 represent *encoded* Unicode !
1064 if (PyUnicode_Check(obj
)) {
1066 PyErr_SetString(PyExc_TypeError
,
1067 "decoding Unicode is not supported");
1070 return PyObject_Unicode(obj
);
1073 if (PyUnicode_Check(obj
)) {
1074 PyErr_SetString(PyExc_TypeError
,
1075 "decoding Unicode is not supported");
1081 if (PyString_Check(obj
)) {
1082 s
= PyString_AS_STRING(obj
);
1083 len
= PyString_GET_SIZE(obj
);
1085 else if (PyByteArray_Check(obj
)) {
1086 /* Python 2.x specific */
1087 PyErr_Format(PyExc_TypeError
,
1088 "decoding bytearray is not supported");
1091 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
1092 /* Overwrite the error message with something more useful in
1093 case of a TypeError. */
1094 if (PyErr_ExceptionMatches(PyExc_TypeError
))
1095 PyErr_Format(PyExc_TypeError
,
1096 "coercing to Unicode: need string or buffer, "
1098 Py_TYPE(obj
)->tp_name
);
1102 /* Convert to Unicode */
1104 Py_INCREF(unicode_empty
);
1105 v
= (PyObject
*)unicode_empty
;
1108 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
1116 PyObject
*PyUnicode_Decode(const char *s
,
1118 const char *encoding
,
1121 PyObject
*buffer
= NULL
, *unicode
;
1123 if (encoding
== NULL
)
1124 encoding
= PyUnicode_GetDefaultEncoding();
1126 /* Shortcuts for common default encodings */
1127 if (strcmp(encoding
, "utf-8") == 0)
1128 return PyUnicode_DecodeUTF8(s
, size
, errors
);
1129 else if (strcmp(encoding
, "latin-1") == 0)
1130 return PyUnicode_DecodeLatin1(s
, size
, errors
);
1131 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1132 else if (strcmp(encoding
, "mbcs") == 0)
1133 return PyUnicode_DecodeMBCS(s
, size
, errors
);
1135 else if (strcmp(encoding
, "ascii") == 0)
1136 return PyUnicode_DecodeASCII(s
, size
, errors
);
1138 /* Decode via the codec registry */
1139 buffer
= PyBuffer_FromMemory((void *)s
, size
);
1142 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
1143 if (unicode
== NULL
)
1145 if (!PyUnicode_Check(unicode
)) {
1146 PyErr_Format(PyExc_TypeError
,
1147 "decoder did not return an unicode object (type=%.400s)",
1148 Py_TYPE(unicode
)->tp_name
);
1160 PyObject
*PyUnicode_AsDecodedObject(PyObject
*unicode
,
1161 const char *encoding
,
1166 if (!PyUnicode_Check(unicode
)) {
1167 PyErr_BadArgument();
1171 if (encoding
== NULL
)
1172 encoding
= PyUnicode_GetDefaultEncoding();
1174 /* Decode via the codec registry */
1175 v
= PyCodec_Decode(unicode
, encoding
, errors
);
1184 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
1186 const char *encoding
,
1189 PyObject
*v
, *unicode
;
1191 unicode
= PyUnicode_FromUnicode(s
, size
);
1192 if (unicode
== NULL
)
1194 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
1199 PyObject
*PyUnicode_AsEncodedObject(PyObject
*unicode
,
1200 const char *encoding
,
1205 if (!PyUnicode_Check(unicode
)) {
1206 PyErr_BadArgument();
1210 if (encoding
== NULL
)
1211 encoding
= PyUnicode_GetDefaultEncoding();
1213 /* Encode via the codec registry */
1214 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1223 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
1224 const char *encoding
,
1229 if (!PyUnicode_Check(unicode
)) {
1230 PyErr_BadArgument();
1234 if (encoding
== NULL
)
1235 encoding
= PyUnicode_GetDefaultEncoding();
1237 /* Shortcuts for common default encodings */
1238 if (errors
== NULL
) {
1239 if (strcmp(encoding
, "utf-8") == 0)
1240 return PyUnicode_AsUTF8String(unicode
);
1241 else if (strcmp(encoding
, "latin-1") == 0)
1242 return PyUnicode_AsLatin1String(unicode
);
1243 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1244 else if (strcmp(encoding
, "mbcs") == 0)
1245 return PyUnicode_AsMBCSString(unicode
);
1247 else if (strcmp(encoding
, "ascii") == 0)
1248 return PyUnicode_AsASCIIString(unicode
);
1251 /* Encode via the codec registry */
1252 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1255 if (!PyString_Check(v
)) {
1256 PyErr_Format(PyExc_TypeError
,
1257 "encoder did not return a string object (type=%.400s)",
1258 Py_TYPE(v
)->tp_name
);
1268 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
1271 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
1275 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
1276 if (v
&& errors
== NULL
)
1277 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
1281 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
1283 if (!PyUnicode_Check(unicode
)) {
1284 PyErr_BadArgument();
1287 return PyUnicode_AS_UNICODE(unicode
);
1293 Py_ssize_t
PyUnicode_GetSize(PyObject
*unicode
)
1295 if (!PyUnicode_Check(unicode
)) {
1296 PyErr_BadArgument();
1299 return PyUnicode_GET_SIZE(unicode
);
1305 const char *PyUnicode_GetDefaultEncoding(void)
1307 return unicode_default_encoding
;
1310 int PyUnicode_SetDefaultEncoding(const char *encoding
)
1314 /* Make sure the encoding is valid. As a side effect, this also
1315 loads the encoding into the codec registry cache. */
1316 v
= _PyCodec_Lookup(encoding
);
1320 strncpy(unicode_default_encoding
,
1322 sizeof(unicode_default_encoding
));
1329 /* error handling callback helper:
1330 build arguments, call the callback and check the arguments,
1331 if no exception occurred, copy the replacement to the output
1332 and adjust various state variables.
1333 return 0 on success, -1 on error
1337 int unicode_decode_call_errorhandler(const char *errors
, PyObject
**errorHandler
,
1338 const char *encoding
, const char *reason
,
1339 const char *input
, Py_ssize_t insize
, Py_ssize_t
*startinpos
,
1340 Py_ssize_t
*endinpos
, PyObject
**exceptionObject
, const char **inptr
,
1341 PyObject
**output
, Py_ssize_t
*outpos
, Py_UNICODE
**outptr
)
1343 static char *argparse
= "O!n;decoding error handler must return (unicode, int) tuple";
1345 PyObject
*restuple
= NULL
;
1346 PyObject
*repunicode
= NULL
;
1347 Py_ssize_t outsize
= PyUnicode_GET_SIZE(*output
);
1348 Py_ssize_t requiredsize
;
1354 if (*errorHandler
== NULL
) {
1355 *errorHandler
= PyCodec_LookupError(errors
);
1356 if (*errorHandler
== NULL
)
1360 if (*exceptionObject
== NULL
) {
1361 *exceptionObject
= PyUnicodeDecodeError_Create(
1362 encoding
, input
, insize
, *startinpos
, *endinpos
, reason
);
1363 if (*exceptionObject
== NULL
)
1367 if (PyUnicodeDecodeError_SetStart(*exceptionObject
, *startinpos
))
1369 if (PyUnicodeDecodeError_SetEnd(*exceptionObject
, *endinpos
))
1371 if (PyUnicodeDecodeError_SetReason(*exceptionObject
, reason
))
1375 restuple
= PyObject_CallFunctionObjArgs(*errorHandler
, *exceptionObject
, NULL
);
1376 if (restuple
== NULL
)
1378 if (!PyTuple_Check(restuple
)) {
1379 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
1382 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
, &repunicode
, &newpos
))
1385 newpos
= insize
+newpos
;
1386 if (newpos
<0 || newpos
>insize
) {
1387 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", newpos
);
1391 /* need more space? (at least enough for what we
1392 have+the replacement+the rest of the string (starting
1393 at the new input position), so we won't have to check space
1394 when there are no errors in the rest of the string) */
1395 repptr
= PyUnicode_AS_UNICODE(repunicode
);
1396 repsize
= PyUnicode_GET_SIZE(repunicode
);
1397 requiredsize
= *outpos
+ repsize
+ insize
-newpos
;
1398 if (requiredsize
> outsize
) {
1399 if (requiredsize
<2*outsize
)
1400 requiredsize
= 2*outsize
;
1401 if (PyUnicode_Resize(output
, requiredsize
) < 0)
1403 *outptr
= PyUnicode_AS_UNICODE(*output
) + *outpos
;
1406 *inptr
= input
+ newpos
;
1407 Py_UNICODE_COPY(*outptr
, repptr
, repsize
);
1414 Py_XDECREF(restuple
);
1418 /* --- UTF-7 Codec -------------------------------------------------------- */
1420 /* see RFC2152 for details */
1423 char utf7_special
[128] = {
1424 /* indicate whether a UTF-7 character is special i.e. cannot be directly
1428 2 - whitespace (optional)
1429 3 - RFC2152 Set O (optional) */
1430 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
1431 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1432 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
1433 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
1434 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1435 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
1436 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1437 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
1441 /* Note: The comparison (c) <= 0 is a trick to work-around gcc
1442 warnings about the comparison always being false; since
1443 utf7_special[0] is 1, we can safely make that one comparison
/* Nonzero when character `c` must be treated as special for UTF-7:
   - it lies outside 1..127, or
   - utf7_special[c] is 1, or
   - utf7_special[c] is 2 (whitespace) and `encodeWS` is true, or
   - utf7_special[c] is 3 (RFC2152 Set O) and `encodeO` is true.
   The flag arguments are parenthesized so that expression arguments
   (e.g. `a || b`) are evaluated as a whole.  Evaluates `c` several times;
   the range checks come first so the table is never indexed out of
   bounds. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))
1452 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* Nonzero when `c` is one of the 64 base64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  Explicit ASCII range checks are used instead
   of isalnum(): isalnum() depends on the current locale and has undefined
   behavior for arguments that are neither EOF nor representable as
   unsigned char.  Like the neighbouring base64 arithmetic, this assumes an
   ASCII-compatible character set.  Evaluates `c` more than once. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')
1456 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
1457 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
1459 #define ENCODE(out, ch, bits) \
1460 while (bits >= 6) { \
1461 *out++ = B64(ch >> (bits-6)); \
1465 #define DECODE(out, ch, bits, surrogate) \
1466 while (bits >= 16) { \
1467 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
1470 /* We have already generated an error for the high surrogate \
1471 so let's not bother seeing if the low surrogate is correct or not */ \
1473 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
1474 /* This is a surrogate pair. Unfortunately we can't represent \
1475 it in a 16-bit character */ \
1477 errmsg = "code pairs are not supported"; \
1484 PyObject
*PyUnicode_DecodeUTF7(const char *s
,
1488 return PyUnicode_DecodeUTF7Stateful(s
, size
, errors
, NULL
);
1491 PyObject
*PyUnicode_DecodeUTF7Stateful(const char *s
,
1494 Py_ssize_t
*consumed
)
1496 const char *starts
= s
;
1497 Py_ssize_t startinpos
;
1498 Py_ssize_t endinpos
;
1501 PyUnicodeObject
*unicode
;
1503 const char *errmsg
= "";
1505 unsigned int bitsleft
= 0;
1506 unsigned long charsleft
= 0;
1508 PyObject
*errorHandler
= NULL
;
1509 PyObject
*exc
= NULL
;
1511 unicode
= _PyUnicode_New(size
);
1517 return (PyObject
*)unicode
;
1529 if ((ch
== '-') || !B64CHAR(ch
)) {
1533 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
1534 if (bitsleft
>= 6) {
1535 /* The shift sequence has a partial character in it. If
1536 bitsleft < 6 then we could just classify it as padding
1537 but that is not the case here */
1539 errmsg
= "partial character in shift sequence";
1542 /* According to RFC2152 the remaining bits should be zero. We
1543 choose to signal an error/insert a replacement character
1544 here to indicate the potential of a misencoded character. */
1546 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1547 if (bitsleft
&& charsleft
<< (sizeof(charsleft
) * 8 - bitsleft
)) {
1548 errmsg
= "non-zero padding bits in shift sequence";
1553 if ((s
< e
) && (*(s
) == '-')) {
1557 } else if (SPECIAL(ch
,0,0)) {
1558 errmsg
= "unexpected special character";
1564 charsleft
= (charsleft
<< 6) | UB64(ch
);
1567 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
1570 else if ( ch
== '+' ) {
1571 startinpos
= s
-starts
;
1573 if (s
< e
&& *s
== '-') {
1582 else if (SPECIAL(ch
,0,0)) {
1583 startinpos
= s
-starts
;
1584 errmsg
= "unexpected special character";
1594 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1595 endinpos
= s
-starts
;
1596 if (unicode_decode_call_errorhandler(
1597 errors
, &errorHandler
,
1599 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1600 (PyObject
**)&unicode
, &outpos
, &p
))
1604 if (inShift
&& !consumed
) {
1605 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1607 if (unicode_decode_call_errorhandler(
1608 errors
, &errorHandler
,
1609 "utf7", "unterminated shift sequence",
1610 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1611 (PyObject
**)&unicode
, &outpos
, &p
))
1618 *consumed
= startinpos
;
1620 *consumed
= s
-starts
;
1623 if (_PyUnicode_Resize(&unicode
, p
- PyUnicode_AS_UNICODE(unicode
)) < 0)
1626 Py_XDECREF(errorHandler
);
1628 return (PyObject
*)unicode
;
1631 Py_XDECREF(errorHandler
);
1638 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
1641 int encodeWhiteSpace
,
1645 /* It might be possible to tighten this worst case */
1646 Py_ssize_t cbAllocated
= 5 * size
;
1649 unsigned int bitsleft
= 0;
1650 unsigned long charsleft
= 0;
1655 return PyString_FromStringAndSize(NULL
, 0);
1657 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
1661 start
= out
= PyString_AS_STRING(v
);
1662 for (;i
< size
; ++i
) {
1663 Py_UNICODE ch
= s
[i
];
1669 } else if (SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
1673 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
1674 inShift
= bitsleft
> 0;
1679 if (!SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
1680 *out
++ = B64(charsleft
<< (6-bitsleft
));
1683 /* Characters not in the BASE64 set implicitly unshift the sequence
1684 so no '-' is required, except if the character is itself a '-' */
1685 if (B64CHAR(ch
) || ch
== '-') {
1692 charsleft
= (charsleft
<< 16) | ch
;
1693 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
1695 /* If the next character is special then we don't need to terminate
1696 the shift sequence. If the next character is not a BASE64 character
1697 or '-' then the shift sequence will be terminated implicitly and we
1698 don't have to insert a '-'. */
1700 if (bitsleft
== 0) {
1702 Py_UNICODE ch2
= s
[i
+1];
1704 if (SPECIAL(ch2
, encodeSetO
, encodeWhiteSpace
)) {
1706 } else if (B64CHAR(ch2
) || ch2
== '-') {
1723 *out
++= B64(charsleft
<< (6-bitsleft
) );
1727 _PyString_Resize(&v
, out
- start
);
1738 /* --- UTF-8 Codec -------------------------------------------------------- */
1741 char utf8_code_length
[256] = {
1742 /* Map UTF-8 encoded prefix byte to sequence length. zero means
1743 illegal prefix. see RFC 2279 for details */
1744 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1745 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1746 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1747 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1748 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1749 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1750 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1751 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1752 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1753 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1754 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1755 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1756 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1757 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1758 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1759 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
1762 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1766 return PyUnicode_DecodeUTF8Stateful(s
, size
, errors
, NULL
);
1769 PyObject
*PyUnicode_DecodeUTF8Stateful(const char *s
,
1772 Py_ssize_t
*consumed
)
1774 const char *starts
= s
;
1776 Py_ssize_t startinpos
;
1777 Py_ssize_t endinpos
;
1780 PyUnicodeObject
*unicode
;
1782 const char *errmsg
= "";
1783 PyObject
*errorHandler
= NULL
;
1784 PyObject
*exc
= NULL
;
1786 /* Note: size will always be longer than the resulting Unicode
1788 unicode
= _PyUnicode_New(size
);
1794 return (PyObject
*)unicode
;
1797 /* Unpack UTF-8 encoded data */
1802 Py_UCS4 ch
= (unsigned char)*s
;
1805 *p
++ = (Py_UNICODE
)ch
;
1810 n
= utf8_code_length
[ch
];
1816 errmsg
= "unexpected end of data";
1817 startinpos
= s
-starts
;
1826 errmsg
= "unexpected code byte";
1827 startinpos
= s
-starts
;
1828 endinpos
= startinpos
+1;
1832 errmsg
= "internal error";
1833 startinpos
= s
-starts
;
1834 endinpos
= startinpos
+1;
1838 if ((s
[1] & 0xc0) != 0x80) {
1839 errmsg
= "invalid data";
1840 startinpos
= s
-starts
;
1841 endinpos
= startinpos
+2;
1844 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
1846 startinpos
= s
-starts
;
1847 endinpos
= startinpos
+2;
1848 errmsg
= "illegal encoding";
1852 *p
++ = (Py_UNICODE
)ch
;
1856 if ((s
[1] & 0xc0) != 0x80 ||
1857 (s
[2] & 0xc0) != 0x80) {
1858 errmsg
= "invalid data";
1859 startinpos
= s
-starts
;
1860 endinpos
= startinpos
+3;
1863 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
1865 /* Note: UTF-8 encodings of surrogates are considered
1866 legal UTF-8 sequences;
1868 XXX For wide builds (UCS-4) we should probably try
1869 to recombine the surrogates into a single code
1872 errmsg
= "illegal encoding";
1873 startinpos
= s
-starts
;
1874 endinpos
= startinpos
+3;
1878 *p
++ = (Py_UNICODE
)ch
;
1882 if ((s
[1] & 0xc0) != 0x80 ||
1883 (s
[2] & 0xc0) != 0x80 ||
1884 (s
[3] & 0xc0) != 0x80) {
1885 errmsg
= "invalid data";
1886 startinpos
= s
-starts
;
1887 endinpos
= startinpos
+4;
1890 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
1891 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
1892 /* validate and convert to UTF-16 */
1893 if ((ch
< 0x10000) /* minimum value allowed for 4
1895 || (ch
> 0x10ffff)) /* maximum value allowed for
1898 errmsg
= "illegal encoding";
1899 startinpos
= s
-starts
;
1900 endinpos
= startinpos
+4;
1903 #ifdef Py_UNICODE_WIDE
1904 *p
++ = (Py_UNICODE
)ch
;
1906 /* compute and append the two surrogates: */
1908 /* translate from 10000..10FFFF to 0..FFFF */
1911 /* high surrogate = top 10 bits added to D800 */
1912 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
1914 /* low surrogate = bottom 10 bits added to DC00 */
1915 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
1920 /* Other sizes are only needed for UCS-4 */
1921 errmsg
= "unsupported Unicode code range";
1922 startinpos
= s
-starts
;
1923 endinpos
= startinpos
+n
;
1930 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1931 if (unicode_decode_call_errorhandler(
1932 errors
, &errorHandler
,
1934 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1935 (PyObject
**)&unicode
, &outpos
, &p
))
1939 *consumed
= s
-starts
;
1942 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
1945 Py_XDECREF(errorHandler
);
1947 return (PyObject
*)unicode
;
1950 Py_XDECREF(errorHandler
);
1956 /* Allocation strategy: if the string is short, convert into a stack buffer
1957 and allocate exactly as much space needed at the end. Else allocate the
1958 maximum possible needed (4 result bytes per Unicode character), and return
1959 the excess memory at the end.
1962 PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
1966 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1968 Py_ssize_t i
; /* index into s of next input character */
1969 PyObject
*v
; /* result string object */
1970 char *p
; /* next free byte in output buffer */
1971 Py_ssize_t nallocated
; /* number of result bytes allocated */
1972 Py_ssize_t nneeded
; /* number of result bytes needed */
1973 char stackbuf
[MAX_SHORT_UNICHARS
* 4];
1978 if (size
<= MAX_SHORT_UNICHARS
) {
1979 /* Write into the stack buffer; nallocated can't overflow.
1980 * At the end, we'll allocate exactly as much heap space as it
1981 * turns out we need.
1983 nallocated
= Py_SAFE_DOWNCAST(sizeof(stackbuf
), size_t, int);
1984 v
= NULL
; /* will allocate after we're done */
1988 /* Overallocate on the heap, and give the excess back at the end. */
1989 nallocated
= size
* 4;
1990 if (nallocated
/ 4 != size
) /* overflow! */
1991 return PyErr_NoMemory();
1992 v
= PyString_FromStringAndSize(NULL
, nallocated
);
1995 p
= PyString_AS_STRING(v
);
1998 for (i
= 0; i
< size
;) {
1999 Py_UCS4 ch
= s
[i
++];
2005 else if (ch
< 0x0800) {
2006 /* Encode as two bytes: Latin-1 and all other code points below 0x0800 */
2007 *p
++ = (char)(0xc0 | (ch
>> 6));
2008 *p
++ = (char)(0x80 | (ch
& 0x3f));
2011 /* Encode UCS2 Unicode ordinals */
2013 /* Special case: check for high surrogate */
2014 if (0xD800 <= ch
&& ch
<= 0xDBFF && i
!= size
) {
2016 /* Check for low surrogate and combine the two to
2017 form a UCS4 value */
2018 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2019 ch
= ((ch
- 0xD800) << 10 | (ch2
- 0xDC00)) + 0x10000;
2023 /* Fall through: handles isolated high surrogates */
2025 *p
++ = (char)(0xe0 | (ch
>> 12));
2026 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2027 *p
++ = (char)(0x80 | (ch
& 0x3f));
2031 /* Encode UCS4 Unicode ordinals */
2032 *p
++ = (char)(0xf0 | (ch
>> 18));
2033 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
2034 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2035 *p
++ = (char)(0x80 | (ch
& 0x3f));
2040 /* This was stack allocated. */
2041 nneeded
= p
- stackbuf
;
2042 assert(nneeded
<= nallocated
);
2043 v
= PyString_FromStringAndSize(stackbuf
, nneeded
);
2046 /* Cut back to size actually needed. */
2047 nneeded
= p
- PyString_AS_STRING(v
);
2048 assert(nneeded
<= nallocated
);
2049 _PyString_Resize(&v
, nneeded
);
2053 #undef MAX_SHORT_UNICHARS
2056 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
2058 if (!PyUnicode_Check(unicode
)) {
2059 PyErr_BadArgument();
2062 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
2063 PyUnicode_GET_SIZE(unicode
),
2067 /* --- UTF-32 Codec ------------------------------------------------------- */
2070 PyUnicode_DecodeUTF32(const char *s
,
2075 return PyUnicode_DecodeUTF32Stateful(s
, size
, errors
, byteorder
, NULL
);
2079 PyUnicode_DecodeUTF32Stateful(const char *s
,
2083 Py_ssize_t
*consumed
)
2085 const char *starts
= s
;
2086 Py_ssize_t startinpos
;
2087 Py_ssize_t endinpos
;
2089 PyUnicodeObject
*unicode
;
2091 #ifndef Py_UNICODE_WIDE
2094 const int pairs
= 0;
2096 const unsigned char *q
, *e
;
2097 int bo
= 0; /* assume native ordering by default */
2098 const char *errmsg
= "";
2099 /* Offsets from q for retrieving bytes in the right order. */
2100 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2101 int iorder
[] = {0, 1, 2, 3};
2103 int iorder
[] = {3, 2, 1, 0};
2105 PyObject
*errorHandler
= NULL
;
2106 PyObject
*exc
= NULL
;
2107 /* On narrow builds we split characters outside the BMP into two
2108 codepoints => count how much extra space we need. */
2109 #ifndef Py_UNICODE_WIDE
2110 for (i
= pairs
= 0; i
< size
/4; i
++)
2111 if (((Py_UCS4
*)s
)[i
] >= 0x10000)
2115 /* This might be one too many, because of a BOM */
2116 unicode
= _PyUnicode_New((size
+3)/4+pairs
);
2120 return (PyObject
*)unicode
;
2122 /* Unpack UTF-32 encoded data */
2124 q
= (unsigned char *)s
;
2130 /* Check for BOM marks (U+FEFF) in the input and adjust current
2131 byte order setting accordingly. In native mode, the leading BOM
2132 mark is skipped, in all other modes, it is copied to the output
2133 stream as-is (giving a ZWNBSP character). */
2136 const Py_UCS4 bom
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2137 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2138 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2139 if (bom
== 0x0000FEFF) {
2143 else if (bom
== 0xFFFE0000) {
2148 if (bom
== 0x0000FEFF) {
2152 else if (bom
== 0xFFFE0000) {
2177 /* remaining bytes at the end? (size should be divisible by 4) */
2181 errmsg
= "truncated data";
2182 startinpos
= ((const char *)q
)-starts
;
2183 endinpos
= ((const char *)e
)-starts
;
2185 /* The remaining input chars are ignored if the callback
2186 chooses to skip the input */
2188 ch
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2189 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2193 errmsg
= "codepoint not in range(0x110000)";
2194 startinpos
= ((const char *)q
)-starts
;
2195 endinpos
= startinpos
+4;
2198 #ifndef Py_UNICODE_WIDE
2201 *p
++ = 0xD800 | ((ch
-0x10000) >> 10);
2202 *p
++ = 0xDC00 | ((ch
-0x10000) & 0x3FF);
2210 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2211 if (unicode_decode_call_errorhandler(
2212 errors
, &errorHandler
,
2214 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2215 (PyObject
**)&unicode
, &outpos
, &p
))
2223 *consumed
= (const char *)q
-starts
;
2226 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2229 Py_XDECREF(errorHandler
);
2231 return (PyObject
*)unicode
;
2235 Py_XDECREF(errorHandler
);
2241 PyUnicode_EncodeUTF32(const Py_UNICODE
*s
,
2248 #ifndef Py_UNICODE_WIDE
2251 const int pairs
= 0;
2253 /* Offsets from p for storing byte pairs in the right order. */
2254 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2255 int iorder
[] = {0, 1, 2, 3};
2257 int iorder
[] = {3, 2, 1, 0};
2260 #define STORECHAR(CH) \
2262 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2263 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2264 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2265 p[iorder[0]] = (CH) & 0xff; \
2269 /* In narrow builds we can output surrogate pairs as one codepoint,
2270 so we need less space. */
2271 #ifndef Py_UNICODE_WIDE
2272 for (i
= pairs
= 0; i
< size
-1; i
++)
2273 if (0xD800 <= s
[i
] && s
[i
] <= 0xDBFF &&
2274 0xDC00 <= s
[i
+1] && s
[i
+1] <= 0xDFFF)
2277 v
= PyString_FromStringAndSize(NULL
,
2278 4 * (size
- pairs
+ (byteorder
== 0)));
2282 p
= (unsigned char *)PyString_AS_STRING(v
);
2288 if (byteorder
== -1) {
2295 else if (byteorder
== 1) {
2303 while (size
-- > 0) {
2305 #ifndef Py_UNICODE_WIDE
2306 if (0xD800 <= ch
&& ch
<= 0xDBFF && size
> 0) {
2308 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2309 ch
= (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
2321 PyObject
*PyUnicode_AsUTF32String(PyObject
*unicode
)
2323 if (!PyUnicode_Check(unicode
)) {
2324 PyErr_BadArgument();
2327 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode
),
2328 PyUnicode_GET_SIZE(unicode
),
2333 /* --- UTF-16 Codec ------------------------------------------------------- */
2336 PyUnicode_DecodeUTF16(const char *s
,
2341 return PyUnicode_DecodeUTF16Stateful(s
, size
, errors
, byteorder
, NULL
);
2345 PyUnicode_DecodeUTF16Stateful(const char *s
,
2349 Py_ssize_t
*consumed
)
2351 const char *starts
= s
;
2352 Py_ssize_t startinpos
;
2353 Py_ssize_t endinpos
;
2355 PyUnicodeObject
*unicode
;
2357 const unsigned char *q
, *e
;
2358 int bo
= 0; /* assume native ordering by default */
2359 const char *errmsg
= "";
2360 /* Offsets from q for retrieving byte pairs in the right order. */
2361 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2362 int ihi
= 1, ilo
= 0;
2364 int ihi
= 0, ilo
= 1;
2366 PyObject
*errorHandler
= NULL
;
2367 PyObject
*exc
= NULL
;
2369 /* Note: size will always be longer than the resulting Unicode
2371 unicode
= _PyUnicode_New(size
);
2375 return (PyObject
*)unicode
;
2377 /* Unpack UTF-16 encoded data */
2379 q
= (unsigned char *)s
;
2385 /* Check for BOM marks (U+FEFF) in the input and adjust current
2386 byte order setting accordingly. In native mode, the leading BOM
2387 mark is skipped, in all other modes, it is copied to the output
2388 stream as-is (giving a ZWNBSP character). */
2391 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
2392 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2393 if (bom
== 0xFEFF) {
2397 else if (bom
== 0xFFFE) {
2402 if (bom
== 0xFEFF) {
2406 else if (bom
== 0xFFFE) {
2427 /* remaining bytes at the end? (size should be even) */
2431 errmsg
= "truncated data";
2432 startinpos
= ((const char *)q
)-starts
;
2433 endinpos
= ((const char *)e
)-starts
;
2435 /* The remaining input chars are ignored if the callback
2436 chooses to skip the input */
2438 ch
= (q
[ihi
] << 8) | q
[ilo
];
2442 if (ch
< 0xD800 || ch
> 0xDFFF) {
2447 /* UTF-16 code pair: */
2449 errmsg
= "unexpected end of data";
2450 startinpos
= (((const char *)q
)-2)-starts
;
2451 endinpos
= ((const char *)e
)-starts
;
2454 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
2455 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
2457 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2458 #ifndef Py_UNICODE_WIDE
2462 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
2467 errmsg
= "illegal UTF-16 surrogate";
2468 startinpos
= (((const char *)q
)-4)-starts
;
2469 endinpos
= startinpos
+2;
2474 errmsg
= "illegal encoding";
2475 startinpos
= (((const char *)q
)-2)-starts
;
2476 endinpos
= startinpos
+2;
2477 /* Fall through to report the error */
2480 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2481 if (unicode_decode_call_errorhandler(
2482 errors
, &errorHandler
,
2484 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
2485 (PyObject
**)&unicode
, &outpos
, &p
))
2493 *consumed
= (const char *)q
-starts
;
2496 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2499 Py_XDECREF(errorHandler
);
2501 return (PyObject
*)unicode
;
2505 Py_XDECREF(errorHandler
);
2511 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
2518 #ifdef Py_UNICODE_WIDE
2521 const int pairs
= 0;
2523 /* Offsets from p for storing byte pairs in the right order. */
2524 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2525 int ihi
= 1, ilo
= 0;
2527 int ihi
= 0, ilo
= 1;
2530 #define STORECHAR(CH) \
2532 p[ihi] = ((CH) >> 8) & 0xff; \
2533 p[ilo] = (CH) & 0xff; \
2537 #ifdef Py_UNICODE_WIDE
2538 for (i
= pairs
= 0; i
< size
; i
++)
2539 if (s
[i
] >= 0x10000)
2542 v
= PyString_FromStringAndSize(NULL
,
2543 2 * (size
+ pairs
+ (byteorder
== 0)));
2547 p
= (unsigned char *)PyString_AS_STRING(v
);
2553 if (byteorder
== -1) {
2558 else if (byteorder
== 1) {
2564 while (size
-- > 0) {
2565 Py_UNICODE ch
= *s
++;
2567 #ifdef Py_UNICODE_WIDE
2568 if (ch
>= 0x10000) {
2569 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
2570 ch
= 0xD800 | ((ch
-0x10000) >> 10);
2581 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
2583 if (!PyUnicode_Check(unicode
)) {
2584 PyErr_BadArgument();
2587 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
2588 PyUnicode_GET_SIZE(unicode
),
2593 /* --- Unicode Escape Codec ----------------------------------------------- */
2595 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
2597 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
2601 const char *starts
= s
;
2602 Py_ssize_t startinpos
;
2603 Py_ssize_t endinpos
;
2610 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
2611 PyObject
*errorHandler
= NULL
;
2612 PyObject
*exc
= NULL
;
2614 /* Escaped strings will always be longer than the resulting
2615 Unicode string, so we start with size here and then reduce the
2616 length after conversion to the true value.
2617 (but if the error callback returns a long replacement string
2618 we'll have to allocate more space) */
2619 v
= _PyUnicode_New(size
);
2623 return (PyObject
*)v
;
2625 p
= PyUnicode_AS_UNICODE(v
);
2633 /* Non-escape characters are interpreted as Unicode ordinals */
2635 *p
++ = (unsigned char) *s
++;
2639 startinpos
= s
-starts
;
2644 c
= '\0'; /* Invalid after \ */
2649 case '\\': *p
++ = '\\'; break;
2650 case '\'': *p
++ = '\''; break;
2651 case '\"': *p
++ = '\"'; break;
2652 case 'b': *p
++ = '\b'; break;
2653 case 'f': *p
++ = '\014'; break; /* FF */
2654 case 't': *p
++ = '\t'; break;
2655 case 'n': *p
++ = '\n'; break;
2656 case 'r': *p
++ = '\r'; break;
2657 case 'v': *p
++ = '\013'; break; /* VT */
2658 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
2660 /* \OOO (octal) escapes */
2661 case '0': case '1': case '2': case '3':
2662 case '4': case '5': case '6': case '7':
2664 if (s
< end
&& '0' <= *s
&& *s
<= '7') {
2665 x
= (x
<<3) + *s
++ - '0';
2666 if (s
< end
&& '0' <= *s
&& *s
<= '7')
2667 x
= (x
<<3) + *s
++ - '0';
2676 message
= "truncated \\xXX escape";
2682 message
= "truncated \\uXXXX escape";
2688 message
= "truncated \\UXXXXXXXX escape";
2691 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2694 if (unicode_decode_call_errorhandler(
2695 errors
, &errorHandler
,
2696 "unicodeescape", "end of string in escape sequence",
2697 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2698 (PyObject
**)&v
, &outpos
, &p
))
2702 for (i
= 0; i
< digits
; ++i
) {
2703 c
= (unsigned char) s
[i
];
2705 endinpos
= (s
+i
+1)-starts
;
2706 if (unicode_decode_call_errorhandler(
2707 errors
, &errorHandler
,
2708 "unicodeescape", message
,
2709 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2710 (PyObject
**)&v
, &outpos
, &p
))
2714 chr
= (chr
<<4) & ~0xF;
2715 if (c
>= '0' && c
<= '9')
2717 else if (c
>= 'a' && c
<= 'f')
2718 chr
+= 10 + c
- 'a';
2720 chr
+= 10 + c
- 'A';
2723 if (chr
== 0xffffffff && PyErr_Occurred())
2724 /* _decoding_error will have already written into the
2728 /* when we get here, chr is a 32-bit unicode character */
2730 /* UCS-2 character */
2731 *p
++ = (Py_UNICODE
) chr
;
2732 else if (chr
<= 0x10ffff) {
2733 /* UCS-4 character. Either store directly, or as
2735 #ifdef Py_UNICODE_WIDE
2739 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
2740 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
2743 endinpos
= s
-starts
;
2744 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2745 if (unicode_decode_call_errorhandler(
2746 errors
, &errorHandler
,
2747 "unicodeescape", "illegal Unicode character",
2748 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2749 (PyObject
**)&v
, &outpos
, &p
))
2756 message
= "malformed \\N character escape";
2757 if (ucnhash_CAPI
== NULL
) {
2758 /* load the unicode data module */
2760 m
= PyImport_ImportModuleNoBlock("unicodedata");
2763 api
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
2767 ucnhash_CAPI
= (_PyUnicode_Name_CAPI
*)PyCObject_AsVoidPtr(api
);
2769 if (ucnhash_CAPI
== NULL
)
2773 const char *start
= s
+1;
2774 /* look for the closing brace */
2775 while (*s
!= '}' && s
< end
)
2777 if (s
> start
&& s
< end
&& *s
== '}') {
2778 /* found a name. look it up in the unicode database */
2779 message
= "unknown Unicode character name";
2781 if (ucnhash_CAPI
->getcode(NULL
, start
, (int)(s
-start
-1), &chr
))
2785 endinpos
= s
-starts
;
2786 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2787 if (unicode_decode_call_errorhandler(
2788 errors
, &errorHandler
,
2789 "unicodeescape", message
,
2790 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2791 (PyObject
**)&v
, &outpos
, &p
))
2797 message
= "\\ at end of string";
2799 endinpos
= s
-starts
;
2800 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2801 if (unicode_decode_call_errorhandler(
2802 errors
, &errorHandler
,
2803 "unicodeescape", message
,
2804 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2805 (PyObject
**)&v
, &outpos
, &p
))
2810 *p
++ = (unsigned char)s
[-1];
2817 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2819 Py_XDECREF(errorHandler
);
2821 return (PyObject
*)v
;
2826 "\\N escapes not supported (can't load unicodedata module)"
2829 Py_XDECREF(errorHandler
);
2835 Py_XDECREF(errorHandler
);
2840 /* Return a Unicode-Escape string version of the Unicode object.
2842 If quotes is true, the string is enclosed in u"" or u'' quotes as
2847 Py_LOCAL_INLINE(const Py_UNICODE
*) findchar(const Py_UNICODE
*s
,
2851 /* like wcschr, but doesn't stop at NUL characters */
2853 while (size
-- > 0) {
2863 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
2870 static const char *hexdigit
= "0123456789abcdef";
2872 /* XXX(nnorwitz): rather than over-allocating, it would be
2873 better to choose a different scheme. Perhaps scan the
2874 first N-chars of the string and allocate based on that size.
2876 /* Initial allocation is based on the longest-possible unichr
2879 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2880 unichr, so in this case it's the longest unichr escape. In
2881 narrow (UTF-16) builds this is five chars per source unichr
2882 since there are two unichrs in the surrogate pair, so in narrow
2883 (UTF-16) builds it's not the longest unichr escape.
2885 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2886 so in the narrow (UTF-16) build case it's the longest unichr
2890 repr
= PyString_FromStringAndSize(NULL
,
2892 #ifdef Py_UNICODE_WIDE
2901 p
= PyString_AS_STRING(repr
);
2905 *p
++ = (findchar(s
, size
, '\'') &&
2906 !findchar(s
, size
, '"')) ? '"' : '\'';
2908 while (size
-- > 0) {
2909 Py_UNICODE ch
= *s
++;
2911 /* Escape quotes and backslashes */
2913 ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1]) || ch
== '\\') {
2919 #ifdef Py_UNICODE_WIDE
2920 /* Map 21-bit characters to '\U00xxxxxx' */
2921 else if (ch
>= 0x10000) {
2924 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
2925 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
2926 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
2927 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
2928 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
2929 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
2930 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
2931 *p
++ = hexdigit
[ch
& 0x0000000F];
2935 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2936 else if (ch
>= 0xD800 && ch
< 0xDC00) {
2942 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
2943 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
2946 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
2947 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
2948 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
2949 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
2950 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
2951 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
2952 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
2953 *p
++ = hexdigit
[ucs
& 0x0000000F];
2956 /* Fall through: isolated surrogates are copied as-is */
2962 /* Map 16-bit characters to '\uxxxx' */
2966 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
2967 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
2968 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
2969 *p
++ = hexdigit
[ch
& 0x000F];
2972 /* Map special whitespace to '\t', '\n', '\r' */
2973 else if (ch
== '\t') {
2977 else if (ch
== '\n') {
2981 else if (ch
== '\r') {
2986 /* Map non-printable US ASCII to '\xhh' */
2987 else if (ch
< ' ' || ch
>= 0x7F) {
2990 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
2991 *p
++ = hexdigit
[ch
& 0x000F];
2994 /* Copy everything else as-is */
2999 *p
++ = PyString_AS_STRING(repr
)[1];
3002 _PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
));
3006 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
3009 return unicodeescape_string(s
, size
, 0);
3012 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
3014 if (!PyUnicode_Check(unicode
)) {
3015 PyErr_BadArgument();
3018 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3019 PyUnicode_GET_SIZE(unicode
));
3022 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3024 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
3028 const char *starts
= s
;
3029 Py_ssize_t startinpos
;
3030 Py_ssize_t endinpos
;
3036 PyObject
*errorHandler
= NULL
;
3037 PyObject
*exc
= NULL
;
3039 /* Escaped strings will always be longer than the resulting
3040 Unicode string, so we start with size here and then reduce the
3041 length after conversion to the true value. (But decoding error
3042 handler might have to resize the string) */
3043 v
= _PyUnicode_New(size
);
3047 return (PyObject
*)v
;
3048 p
= PyUnicode_AS_UNICODE(v
);
3056 /* Non-escape characters are interpreted as Unicode ordinals */
3058 *p
++ = (unsigned char)*s
++;
3061 startinpos
= s
-starts
;
3063 /* \u-escapes are only interpreted iff the number of leading
3064 backslashes is odd */
3069 *p
++ = (unsigned char)*s
++;
3071 if (((s
- bs
) & 1) == 0 ||
3073 (*s
!= 'u' && *s
!= 'U')) {
3077 count
= *s
=='u' ? 4 : 8;
3080 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3081 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3082 for (x
= 0, i
= 0; i
< count
; ++i
, ++s
) {
3083 c
= (unsigned char)*s
;
3085 endinpos
= s
-starts
;
3086 if (unicode_decode_call_errorhandler(
3087 errors
, &errorHandler
,
3088 "rawunicodeescape", "truncated \\uXXXX",
3089 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3090 (PyObject
**)&v
, &outpos
, &p
))
3095 if (c
>= '0' && c
<= '9')
3097 else if (c
>= 'a' && c
<= 'f')
3103 /* UCS-2 character */
3104 *p
++ = (Py_UNICODE
) x
;
3105 else if (x
<= 0x10ffff) {
3106 /* UCS-4 character. Either store directly, or as
3108 #ifdef Py_UNICODE_WIDE
3109 *p
++ = (Py_UNICODE
) x
;
3112 *p
++ = 0xD800 + (Py_UNICODE
) (x
>> 10);
3113 *p
++ = 0xDC00 + (Py_UNICODE
) (x
& 0x03FF);
3116 endinpos
= s
-starts
;
3117 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3118 if (unicode_decode_call_errorhandler(
3119 errors
, &errorHandler
,
3120 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3121 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3122 (PyObject
**)&v
, &outpos
, &p
))
3128 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3130 Py_XDECREF(errorHandler
);
3132 return (PyObject
*)v
;
3136 Py_XDECREF(errorHandler
);
3141 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
3148 static const char *hexdigit
= "0123456789abcdef";
3150 #ifdef Py_UNICODE_WIDE
3151 repr
= PyString_FromStringAndSize(NULL
, 10 * size
);
3153 repr
= PyString_FromStringAndSize(NULL
, 6 * size
);
3160 p
= q
= PyString_AS_STRING(repr
);
3161 while (size
-- > 0) {
3162 Py_UNICODE ch
= *s
++;
3163 #ifdef Py_UNICODE_WIDE
3164 /* Map 32-bit characters to '\Uxxxxxxxx' */
3165 if (ch
>= 0x10000) {
3168 *p
++ = hexdigit
[(ch
>> 28) & 0xf];
3169 *p
++ = hexdigit
[(ch
>> 24) & 0xf];
3170 *p
++ = hexdigit
[(ch
>> 20) & 0xf];
3171 *p
++ = hexdigit
[(ch
>> 16) & 0xf];
3172 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3173 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3174 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3175 *p
++ = hexdigit
[ch
& 15];
3179 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3180 if (ch
>= 0xD800 && ch
< 0xDC00) {
3186 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
3187 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
3190 *p
++ = hexdigit
[(ucs
>> 28) & 0xf];
3191 *p
++ = hexdigit
[(ucs
>> 24) & 0xf];
3192 *p
++ = hexdigit
[(ucs
>> 20) & 0xf];
3193 *p
++ = hexdigit
[(ucs
>> 16) & 0xf];
3194 *p
++ = hexdigit
[(ucs
>> 12) & 0xf];
3195 *p
++ = hexdigit
[(ucs
>> 8) & 0xf];
3196 *p
++ = hexdigit
[(ucs
>> 4) & 0xf];
3197 *p
++ = hexdigit
[ucs
& 0xf];
3200 /* Fall through: isolated surrogates are copied as-is */
3205 /* Map 16-bit characters to '\uxxxx' */
3209 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3210 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3211 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3212 *p
++ = hexdigit
[ch
& 15];
3214 /* Copy everything else as-is */
3219 _PyString_Resize(&repr
, p
- q
);
3223 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
3225 if (!PyUnicode_Check(unicode
)) {
3226 PyErr_BadArgument();
3229 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3230 PyUnicode_GET_SIZE(unicode
));
3233 /* --- Unicode Internal Codec ------------------------------------------- */
3235 PyObject
*_PyUnicode_DecodeUnicodeInternal(const char *s
,
3239 const char *starts
= s
;
3240 Py_ssize_t startinpos
;
3241 Py_ssize_t endinpos
;
3247 PyObject
*errorHandler
= NULL
;
3248 PyObject
*exc
= NULL
;
3250 #ifdef Py_UNICODE_WIDE
3251 Py_UNICODE unimax
= PyUnicode_GetMax();
3254 /* XXX overflow detection missing */
3255 v
= _PyUnicode_New((size
+Py_UNICODE_SIZE
-1)/ Py_UNICODE_SIZE
);
3258 if (PyUnicode_GetSize((PyObject
*)v
) == 0)
3259 return (PyObject
*)v
;
3260 p
= PyUnicode_AS_UNICODE(v
);
3264 memcpy(p
, s
, sizeof(Py_UNICODE
));
3265 /* We have to sanity check the raw data, otherwise doom looms for
3266 some malformed UCS-4 data. */
3268 #ifdef Py_UNICODE_WIDE
3269 *p
> unimax
|| *p
< 0 ||
3271 end
-s
< Py_UNICODE_SIZE
3274 startinpos
= s
- starts
;
3275 if (end
-s
< Py_UNICODE_SIZE
) {
3276 endinpos
= end
-starts
;
3277 reason
= "truncated input";
3280 endinpos
= s
- starts
+ Py_UNICODE_SIZE
;
3281 reason
= "illegal code point (> 0x10FFFF)";
3283 outpos
= p
- PyUnicode_AS_UNICODE(v
);
3284 if (unicode_decode_call_errorhandler(
3285 errors
, &errorHandler
,
3286 "unicode_internal", reason
,
3287 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3288 (PyObject
**)&v
, &outpos
, &p
)) {
3294 s
+= Py_UNICODE_SIZE
;
3298 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3300 Py_XDECREF(errorHandler
);
3302 return (PyObject
*)v
;
3306 Py_XDECREF(errorHandler
);
3311 /* --- Latin-1 Codec ------------------------------------------------------ */
3313 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
3320 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3322 Py_UNICODE r
= *(unsigned char*)s
;
3323 return PyUnicode_FromUnicode(&r
, 1);
3326 v
= _PyUnicode_New(size
);
3330 return (PyObject
*)v
;
3331 p
= PyUnicode_AS_UNICODE(v
);
3333 *p
++ = (unsigned char)*s
++;
3334 return (PyObject
*)v
;
3341 /* create or adjust a UnicodeEncodeError */
3342 static void make_encode_exception(PyObject
**exceptionObject
,
3343 const char *encoding
,
3344 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3345 Py_ssize_t startpos
, Py_ssize_t endpos
,
3348 if (*exceptionObject
== NULL
) {
3349 *exceptionObject
= PyUnicodeEncodeError_Create(
3350 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3353 if (PyUnicodeEncodeError_SetStart(*exceptionObject
, startpos
))
3355 if (PyUnicodeEncodeError_SetEnd(*exceptionObject
, endpos
))
3357 if (PyUnicodeEncodeError_SetReason(*exceptionObject
, reason
))
3361 Py_DECREF(*exceptionObject
);
3362 *exceptionObject
= NULL
;
3366 /* raises a UnicodeEncodeError */
3367 static void raise_encode_exception(PyObject
**exceptionObject
,
3368 const char *encoding
,
3369 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3370 Py_ssize_t startpos
, Py_ssize_t endpos
,
3373 make_encode_exception(exceptionObject
,
3374 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3375 if (*exceptionObject
!= NULL
)
3376 PyCodec_StrictErrors(*exceptionObject
);
3379 /* error handling callback helper:
3380 build arguments, call the callback and check the returned tuple,
3381 put the new position into newpos and return the replacement string
3382 (a new reference, which has to be released by the caller) */
3383 static PyObject
*unicode_encode_call_errorhandler(const char *errors
,
3384 PyObject
**errorHandler
,
3385 const char *encoding
, const char *reason
,
3386 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
3387 Py_ssize_t startpos
, Py_ssize_t endpos
,
3390 static char *argparse
= "O!n;encoding error handler must return (unicode, int) tuple";
3393 PyObject
*resunicode
;
3395 if (*errorHandler
== NULL
) {
3396 *errorHandler
= PyCodec_LookupError(errors
);
3397 if (*errorHandler
== NULL
)
3401 make_encode_exception(exceptionObject
,
3402 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3403 if (*exceptionObject
== NULL
)
3406 restuple
= PyObject_CallFunctionObjArgs(
3407 *errorHandler
, *exceptionObject
, NULL
);
3408 if (restuple
== NULL
)
3410 if (!PyTuple_Check(restuple
)) {
3411 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
3412 Py_DECREF(restuple
);
3415 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
3416 &resunicode
, newpos
)) {
3417 Py_DECREF(restuple
);
3421 *newpos
= size
+*newpos
;
3422 if (*newpos
<0 || *newpos
>size
) {
3423 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
3424 Py_DECREF(restuple
);
3427 Py_INCREF(resunicode
);
3428 Py_DECREF(restuple
);
3432 static PyObject
*unicode_encode_ucs1(const Py_UNICODE
*p
,
3439 /* pointers to the beginning and end+1 of input */
3440 const Py_UNICODE
*startp
= p
;
3441 const Py_UNICODE
*endp
= p
+ size
;
3442 /* pointer to the beginning of the unencodable characters */
3443 /* const Py_UNICODE *badp = NULL; */
3444 /* pointer into the output */
3446 /* current output position */
3447 Py_ssize_t respos
= 0;
3449 const char *encoding
= (limit
== 256) ? "latin-1" : "ascii";
3450 const char *reason
= (limit
== 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3451 PyObject
*errorHandler
= NULL
;
3452 PyObject
*exc
= NULL
;
3453 /* the following variable is used for caching string comparisons
3454 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3455 int known_errorHandler
= -1;
3457 /* allocate enough for a simple encoding without
3458 replacements, if we need more, we'll resize */
3459 res
= PyString_FromStringAndSize(NULL
, size
);
3464 str
= PyString_AS_STRING(res
);
3470 /* can we encode this? */
3472 /* no overflow check, because we know that the space is enough */
3477 Py_ssize_t unicodepos
= p
-startp
;
3478 Py_ssize_t requiredsize
;
3479 PyObject
*repunicode
;
3484 /* startpos for collecting unencodable chars */
3485 const Py_UNICODE
*collstart
= p
;
3486 const Py_UNICODE
*collend
= p
;
3487 /* find all unencodable characters */
3488 while ((collend
< endp
) && ((*collend
)>=limit
))
3490 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3491 if (known_errorHandler
==-1) {
3492 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3493 known_errorHandler
= 1;
3494 else if (!strcmp(errors
, "replace"))
3495 known_errorHandler
= 2;
3496 else if (!strcmp(errors
, "ignore"))
3497 known_errorHandler
= 3;
3498 else if (!strcmp(errors
, "xmlcharrefreplace"))
3499 known_errorHandler
= 4;
3501 known_errorHandler
= 0;
3503 switch (known_errorHandler
) {
3504 case 1: /* strict */
3505 raise_encode_exception(&exc
, encoding
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
3507 case 2: /* replace */
3508 while (collstart
++<collend
)
3509 *str
++ = '?'; /* fall through */
3510 case 3: /* ignore */
3513 case 4: /* xmlcharrefreplace */
3514 respos
= str
-PyString_AS_STRING(res
);
3515 /* determine replacement size (temporarily (mis)uses p) */
3516 for (p
= collstart
, repsize
= 0; p
< collend
; ++p
) {
3525 #ifndef Py_UNICODE_WIDE
3531 else if (*p
<1000000)
3537 requiredsize
= respos
+repsize
+(endp
-collend
);
3538 if (requiredsize
> ressize
) {
3539 if (requiredsize
<2*ressize
)
3540 requiredsize
= 2*ressize
;
3541 if (_PyString_Resize(&res
, requiredsize
))
3543 str
= PyString_AS_STRING(res
) + respos
;
3544 ressize
= requiredsize
;
3546 /* generate replacement (temporarily (mis)uses p) */
3547 for (p
= collstart
; p
< collend
; ++p
) {
3548 str
+= sprintf(str
, "&#%d;", (int)*p
);
3553 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
3554 encoding
, reason
, startp
, size
, &exc
,
3555 collstart
-startp
, collend
-startp
, &newpos
);
3556 if (repunicode
== NULL
)
3558 /* need more space? (at least enough for what we
3559 have+the replacement+the rest of the string, so
3560 we won't have to check space for encodable characters) */
3561 respos
= str
-PyString_AS_STRING(res
);
3562 repsize
= PyUnicode_GET_SIZE(repunicode
);
3563 requiredsize
= respos
+repsize
+(endp
-collend
);
3564 if (requiredsize
> ressize
) {
3565 if (requiredsize
<2*ressize
)
3566 requiredsize
= 2*ressize
;
3567 if (_PyString_Resize(&res
, requiredsize
)) {
3568 Py_DECREF(repunicode
);
3571 str
= PyString_AS_STRING(res
) + respos
;
3572 ressize
= requiredsize
;
3574 /* check if there is anything unencodable in the replacement
3575 and copy it to the output */
3576 for (uni2
= PyUnicode_AS_UNICODE(repunicode
);repsize
-->0; ++uni2
, ++str
) {
3579 raise_encode_exception(&exc
, encoding
, startp
, size
,
3580 unicodepos
, unicodepos
+1, reason
);
3581 Py_DECREF(repunicode
);
3586 p
= startp
+ newpos
;
3587 Py_DECREF(repunicode
);
3591 /* Resize if we allocated too much */
3592 respos
= str
-PyString_AS_STRING(res
);
3594 /* If this fails, res will be NULL */
3595 _PyString_Resize(&res
, respos
);
3596 Py_XDECREF(errorHandler
);
3602 Py_XDECREF(errorHandler
);
3607 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
3611 return unicode_encode_ucs1(p
, size
, errors
, 256);
3614 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
3616 if (!PyUnicode_Check(unicode
)) {
3617 PyErr_BadArgument();
3620 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
3621 PyUnicode_GET_SIZE(unicode
),
3625 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3627 PyObject
*PyUnicode_DecodeASCII(const char *s
,
3631 const char *starts
= s
;
3634 Py_ssize_t startinpos
;
3635 Py_ssize_t endinpos
;
3638 PyObject
*errorHandler
= NULL
;
3639 PyObject
*exc
= NULL
;
3641 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3642 if (size
== 1 && *(unsigned char*)s
< 128) {
3643 Py_UNICODE r
= *(unsigned char*)s
;
3644 return PyUnicode_FromUnicode(&r
, 1);
3647 v
= _PyUnicode_New(size
);
3651 return (PyObject
*)v
;
3652 p
= PyUnicode_AS_UNICODE(v
);
3655 register unsigned char c
= (unsigned char)*s
;
3661 startinpos
= s
-starts
;
3662 endinpos
= startinpos
+ 1;
3663 outpos
= p
- (Py_UNICODE
*)PyUnicode_AS_UNICODE(v
);
3664 if (unicode_decode_call_errorhandler(
3665 errors
, &errorHandler
,
3666 "ascii", "ordinal not in range(128)",
3667 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3668 (PyObject
**)&v
, &outpos
, &p
))
3672 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
3673 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3675 Py_XDECREF(errorHandler
);
3677 return (PyObject
*)v
;
3681 Py_XDECREF(errorHandler
);
3686 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
3690 return unicode_encode_ucs1(p
, size
, errors
, 128);
3693 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
3695 if (!PyUnicode_Check(unicode
)) {
3696 PyErr_BadArgument();
3699 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
3700 PyUnicode_GET_SIZE(unicode
),
3704 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3706 /* --- MBCS codecs for Windows -------------------------------------------- */
3708 #if SIZEOF_INT < SIZEOF_SSIZE_T
3712 /* XXX This code is limited to "true" double-byte encodings, as
3713 a) it assumes an incomplete character consists of a single byte, and
3714 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3715 encodings, see IsDBCSLeadByteEx documentation. */
3717 static int is_dbcs_lead_byte(const char *s
, int offset
)
3719 const char *curr
= s
+ offset
;
3721 if (IsDBCSLeadByte(*curr
)) {
3722 const char *prev
= CharPrev(s
, curr
);
3723 return (prev
== curr
) || !IsDBCSLeadByte(*prev
) || (curr
- prev
== 2);
3729 * Decode MBCS string into unicode object. If 'final' is set, converts
3730 * trailing lead-byte too. Returns the consumed size on success, -1 otherwise.
3732 static int decode_mbcs(PyUnicodeObject
**v
,
3733 const char *s
, /* MBCS string */
3734 int size
, /* sizeof MBCS string */
3743 /* Skip trailing lead-byte unless 'final' is set */
3744 if (!final
&& size
>= 1 && is_dbcs_lead_byte(s
, size
- 1))
3747 /* First get the size of the result */
3749 usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
3751 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3757 /* Create unicode object */
3758 *v
= _PyUnicode_New(usize
);
3763 /* Extend unicode object */
3764 n
= PyUnicode_GET_SIZE(*v
);
3765 if (_PyUnicode_Resize(v
, n
+ usize
) < 0)
3769 /* Do the conversion */
3771 p
= PyUnicode_AS_UNICODE(*v
) + n
;
3772 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
3773 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3781 PyObject
*PyUnicode_DecodeMBCSStateful(const char *s
,
3784 Py_ssize_t
*consumed
)
3786 PyUnicodeObject
*v
= NULL
;
3795 done
= decode_mbcs(&v
, s
, INT_MAX
, 0);
3798 done
= decode_mbcs(&v
, s
, (int)size
, !consumed
);
3809 if (size
> INT_MAX
) {
3816 return (PyObject
*)v
;
3819 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
3823 return PyUnicode_DecodeMBCSStateful(s
, size
, errors
, NULL
);
3827 * Convert unicode into string object (MBCS).
3828 * Returns 0 on success, -1 otherwise.
3830 static int encode_mbcs(PyObject
**repr
,
3831 const Py_UNICODE
*p
, /* unicode */
3832 int size
) /* size of unicode */
3839 /* First get the size of the result */
3841 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
3842 if (mbcssize
== 0) {
3843 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3848 if (*repr
== NULL
) {
3849 /* Create string object */
3850 *repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
3855 /* Extend string object */
3856 n
= PyString_Size(*repr
);
3857 if (_PyString_Resize(repr
, n
+ mbcssize
) < 0)
3861 /* Do the conversion */
3863 char *s
= PyString_AS_STRING(*repr
) + n
;
3864 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
3865 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3873 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
3877 PyObject
*repr
= NULL
;
3883 ret
= encode_mbcs(&repr
, p
, INT_MAX
);
3886 ret
= encode_mbcs(&repr
, p
, (int)size
);
3894 if (size
> INT_MAX
) {
3904 PyObject
*PyUnicode_AsMBCSString(PyObject
*unicode
)
3906 if (!PyUnicode_Check(unicode
)) {
3907 PyErr_BadArgument();
3910 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode
),
3911 PyUnicode_GET_SIZE(unicode
),
3917 #endif /* MS_WINDOWS */
3919 /* --- Character Mapping Codec -------------------------------------------- */
3921 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
3926 const char *starts
= s
;
3927 Py_ssize_t startinpos
;
3928 Py_ssize_t endinpos
;
3933 Py_ssize_t extrachars
= 0;
3934 PyObject
*errorHandler
= NULL
;
3935 PyObject
*exc
= NULL
;
3936 Py_UNICODE
*mapstring
= NULL
;
3937 Py_ssize_t maplen
= 0;
3939 /* Default to Latin-1 */
3940 if (mapping
== NULL
)
3941 return PyUnicode_DecodeLatin1(s
, size
, errors
);
3943 v
= _PyUnicode_New(size
);
3947 return (PyObject
*)v
;
3948 p
= PyUnicode_AS_UNICODE(v
);
3950 if (PyUnicode_CheckExact(mapping
)) {
3951 mapstring
= PyUnicode_AS_UNICODE(mapping
);
3952 maplen
= PyUnicode_GET_SIZE(mapping
);
3954 unsigned char ch
= *s
;
3955 Py_UNICODE x
= 0xfffe; /* illegal value */
3961 /* undefined mapping */
3962 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3963 startinpos
= s
-starts
;
3964 endinpos
= startinpos
+1;
3965 if (unicode_decode_call_errorhandler(
3966 errors
, &errorHandler
,
3967 "charmap", "character maps to <undefined>",
3968 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3969 (PyObject
**)&v
, &outpos
, &p
)) {
3980 unsigned char ch
= *s
;
3983 /* Get mapping (char ordinal -> integer, Unicode char or None) */
3984 w
= PyInt_FromLong((long)ch
);
3987 x
= PyObject_GetItem(mapping
, w
);
3990 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
3991 /* No mapping found means: mapping is undefined. */
4000 if (PyInt_Check(x
)) {
4001 long value
= PyInt_AS_LONG(x
);
4002 if (value
< 0 || value
> 65535) {
4003 PyErr_SetString(PyExc_TypeError
,
4004 "character mapping must be in range(65536)");
4008 *p
++ = (Py_UNICODE
)value
;
4010 else if (x
== Py_None
) {
4011 /* undefined mapping */
4012 outpos
= p
-PyUnicode_AS_UNICODE(v
);
4013 startinpos
= s
-starts
;
4014 endinpos
= startinpos
+1;
4015 if (unicode_decode_call_errorhandler(
4016 errors
, &errorHandler
,
4017 "charmap", "character maps to <undefined>",
4018 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
4019 (PyObject
**)&v
, &outpos
, &p
)) {
4026 else if (PyUnicode_Check(x
)) {
4027 Py_ssize_t targetsize
= PyUnicode_GET_SIZE(x
);
4029 if (targetsize
== 1)
4031 *p
++ = *PyUnicode_AS_UNICODE(x
);
4033 else if (targetsize
> 1) {
4035 if (targetsize
> extrachars
) {
4037 Py_ssize_t oldpos
= p
- PyUnicode_AS_UNICODE(v
);
4038 Py_ssize_t needed
= (targetsize
- extrachars
) + \
4040 extrachars
+= needed
;
4041 /* XXX overflow detection missing */
4042 if (_PyUnicode_Resize(&v
,
4043 PyUnicode_GET_SIZE(v
) + needed
) < 0) {
4047 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
4050 PyUnicode_AS_UNICODE(x
),
4053 extrachars
-= targetsize
;
4055 /* 1-0 mapping: skip the character */
4058 /* wrong return value */
4059 PyErr_SetString(PyExc_TypeError
,
4060 "character mapping must return integer, None or unicode");
4068 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
4069 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
4071 Py_XDECREF(errorHandler
);
4073 return (PyObject
*)v
;
4076 Py_XDECREF(errorHandler
);
4082 /* Charmap encoding: the lookup table */
4084 struct encoding_map
{
4086 unsigned char level1
[32];
4088 unsigned char level23
[1];
4092 encoding_map_size(PyObject
*obj
, PyObject
* args
)
4094 struct encoding_map
*map
= (struct encoding_map
*)obj
;
4095 return PyInt_FromLong(sizeof(*map
) - 1 + 16*map
->count2
+
4099 static PyMethodDef encoding_map_methods
[] = {
4100 {"size", encoding_map_size
, METH_NOARGS
,
4101 PyDoc_STR("Return the size (in bytes) of this object") },
4106 encoding_map_dealloc(PyObject
* o
)
4111 static PyTypeObject EncodingMapType
= {
4112 PyVarObject_HEAD_INIT(NULL
, 0)
4113 "EncodingMap", /*tp_name*/
4114 sizeof(struct encoding_map
), /*tp_basicsize*/
4117 encoding_map_dealloc
, /*tp_dealloc*/
4124 0, /*tp_as_sequence*/
4125 0, /*tp_as_mapping*/
4132 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
4136 0, /*tp_richcompare*/
4137 0, /*tp_weaklistoffset*/
4140 encoding_map_methods
, /*tp_methods*/
4147 0, /*tp_dictoffset*/
4156 PyUnicode_BuildEncodingMap(PyObject
* string
)
4160 struct encoding_map
*mresult
;
4163 unsigned char level1
[32];
4164 unsigned char level2
[512];
4165 unsigned char *mlevel1
, *mlevel2
, *mlevel3
;
4166 int count2
= 0, count3
= 0;
4168 if (!PyUnicode_Check(string
) || PyUnicode_GetSize(string
) != 256) {
4169 PyErr_BadArgument();
4172 decode
= PyUnicode_AS_UNICODE(string
);
4173 memset(level1
, 0xFF, sizeof level1
);
4174 memset(level2
, 0xFF, sizeof level2
);
4176 /* If there isn't a one-to-one mapping of NULL to \0,
4177 or if there are non-BMP characters, we need to use
4178 a mapping dictionary. */
4181 for (i
= 1; i
< 256; i
++) {
4184 #ifdef Py_UNICODE_WIDE
4185 || decode
[i
] > 0xFFFF
4191 if (decode
[i
] == 0xFFFE)
4192 /* unmapped character */
4194 l1
= decode
[i
] >> 11;
4195 l2
= decode
[i
] >> 7;
4196 if (level1
[l1
] == 0xFF)
4197 level1
[l1
] = count2
++;
4198 if (level2
[l2
] == 0xFF)
4199 level2
[l2
] = count3
++;
4202 if (count2
>= 0xFF || count3
>= 0xFF)
4206 PyObject
*result
= PyDict_New();
4207 PyObject
*key
, *value
;
4210 for (i
= 0; i
< 256; i
++) {
4212 key
= PyInt_FromLong(decode
[i
]);
4213 value
= PyInt_FromLong(i
);
4216 if (PyDict_SetItem(result
, key
, value
) == -1)
4229 /* Create a three-level trie */
4230 result
= PyObject_MALLOC(sizeof(struct encoding_map
) +
4231 16*count2
+ 128*count3
- 1);
4233 return PyErr_NoMemory();
4234 PyObject_Init(result
, &EncodingMapType
);
4235 mresult
= (struct encoding_map
*)result
;
4236 mresult
->count2
= count2
;
4237 mresult
->count3
= count3
;
4238 mlevel1
= mresult
->level1
;
4239 mlevel2
= mresult
->level23
;
4240 mlevel3
= mresult
->level23
+ 16*count2
;
4241 memcpy(mlevel1
, level1
, 32);
4242 memset(mlevel2
, 0xFF, 16*count2
);
4243 memset(mlevel3
, 0, 128*count3
);
4245 for (i
= 1; i
< 256; i
++) {
4246 int o1
, o2
, o3
, i2
, i3
;
4247 if (decode
[i
] == 0xFFFE)
4248 /* unmapped character */
4251 o2
= (decode
[i
]>>7) & 0xF;
4252 i2
= 16*mlevel1
[o1
] + o2
;
4253 if (mlevel2
[i2
] == 0xFF)
4254 mlevel2
[i2
] = count3
++;
4255 o3
= decode
[i
] & 0x7F;
4256 i3
= 128*mlevel2
[i2
] + o3
;
4263 encoding_map_lookup(Py_UNICODE c
, PyObject
*mapping
)
4265 struct encoding_map
*map
= (struct encoding_map
*)mapping
;
4267 int l2
= (c
>>7) & 0xF;
4271 #ifdef Py_UNICODE_WIDE
4279 i
= map
->level1
[l1
];
4284 i
= map
->level23
[16*i
+l2
];
4289 i
= map
->level23
[16*map
->count2
+ 128*i
+ l3
];
4296 /* Look up the character c in the mapping. If the character
4297 can't be found, Py_None is returned (or NULL, if another
4299 static PyObject
*charmapencode_lookup(Py_UNICODE c
, PyObject
*mapping
)
4301 PyObject
*w
= PyInt_FromLong((long)c
);
4306 x
= PyObject_GetItem(mapping
, w
);
4309 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4310 /* No mapping found means: mapping is undefined. */
4318 else if (x
== Py_None
)
4320 else if (PyInt_Check(x
)) {
4321 long value
= PyInt_AS_LONG(x
);
4322 if (value
< 0 || value
> 255) {
4323 PyErr_SetString(PyExc_TypeError
,
4324 "character mapping must be in range(256)");
4330 else if (PyString_Check(x
))
4333 /* wrong return value */
4334 PyErr_SetString(PyExc_TypeError
,
4335 "character mapping must return integer, None or str");
4342 charmapencode_resize(PyObject
**outobj
, Py_ssize_t
*outpos
, Py_ssize_t requiredsize
)
4344 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4345 /* exponentially overallocate to minimize reallocations */
4346 if (requiredsize
< 2*outsize
)
4347 requiredsize
= 2*outsize
;
4348 if (_PyString_Resize(outobj
, requiredsize
)) {
4354 typedef enum charmapencode_result
{
4355 enc_SUCCESS
, enc_FAILED
, enc_EXCEPTION
4356 }charmapencode_result
;
4357 /* Look up the character, put the result in the output string and adjust
4358 various state variables. Reallocate the output string if not enough
4359 space is available. Return enc_SUCCESS if the character was written,
4360 enc_FAILED if the mapping was undefined (in which case no character
4361 was written), or enc_EXCEPTION if the lookup or a reallocation
4362 raised an error. */
4364 charmapencode_result
charmapencode_output(Py_UNICODE c
, PyObject
*mapping
,
4365 PyObject
**outobj
, Py_ssize_t
*outpos
)
4369 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4371 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4372 int res
= encoding_map_lookup(c
, mapping
);
4373 Py_ssize_t requiredsize
= *outpos
+1;
4376 if (outsize
<requiredsize
)
4377 if (!charmapencode_resize(outobj
, outpos
, requiredsize
))
4378 return enc_EXCEPTION
;
4379 outstart
= PyString_AS_STRING(*outobj
);
4380 outstart
[(*outpos
)++] = (char)res
;
4384 rep
= charmapencode_lookup(c
, mapping
);
4386 return enc_EXCEPTION
;
4387 else if (rep
==Py_None
) {
4391 if (PyInt_Check(rep
)) {
4392 Py_ssize_t requiredsize
= *outpos
+1;
4393 if (outsize
<requiredsize
)
4394 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4396 return enc_EXCEPTION
;
4398 outstart
= PyString_AS_STRING(*outobj
);
4399 outstart
[(*outpos
)++] = (char)PyInt_AS_LONG(rep
);
4402 const char *repchars
= PyString_AS_STRING(rep
);
4403 Py_ssize_t repsize
= PyString_GET_SIZE(rep
);
4404 Py_ssize_t requiredsize
= *outpos
+repsize
;
4405 if (outsize
<requiredsize
)
4406 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4408 return enc_EXCEPTION
;
4410 outstart
= PyString_AS_STRING(*outobj
);
4411 memcpy(outstart
+ *outpos
, repchars
, repsize
);
4419 /* handle an error in PyUnicode_EncodeCharmap
4420 Return 0 on success, -1 on error */
4422 int charmap_encoding_error(
4423 const Py_UNICODE
*p
, Py_ssize_t size
, Py_ssize_t
*inpos
, PyObject
*mapping
,
4424 PyObject
**exceptionObject
,
4425 int *known_errorHandler
, PyObject
**errorHandler
, const char *errors
,
4426 PyObject
**res
, Py_ssize_t
*respos
)
4428 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
4432 /* startpos for collecting unencodable chars */
4433 Py_ssize_t collstartpos
= *inpos
;
4434 Py_ssize_t collendpos
= *inpos
+1;
4436 char *encoding
= "charmap";
4437 char *reason
= "character maps to <undefined>";
4438 charmapencode_result x
;
4440 /* find all unencodable characters */
4441 while (collendpos
< size
) {
4443 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4444 int res
= encoding_map_lookup(p
[collendpos
], mapping
);
4451 rep
= charmapencode_lookup(p
[collendpos
], mapping
);
4454 else if (rep
!=Py_None
) {
4461 /* cache callback name lookup
4462 * (if not done yet, i.e. it's the first error) */
4463 if (*known_errorHandler
==-1) {
4464 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4465 *known_errorHandler
= 1;
4466 else if (!strcmp(errors
, "replace"))
4467 *known_errorHandler
= 2;
4468 else if (!strcmp(errors
, "ignore"))
4469 *known_errorHandler
= 3;
4470 else if (!strcmp(errors
, "xmlcharrefreplace"))
4471 *known_errorHandler
= 4;
4473 *known_errorHandler
= 0;
4475 switch (*known_errorHandler
) {
4476 case 1: /* strict */
4477 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4479 case 2: /* replace */
4480 for (collpos
= collstartpos
; collpos
<collendpos
; ++collpos
) {
4481 x
= charmapencode_output('?', mapping
, res
, respos
);
4482 if (x
==enc_EXCEPTION
) {
4485 else if (x
==enc_FAILED
) {
4486 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4491 case 3: /* ignore */
4492 *inpos
= collendpos
;
4494 case 4: /* xmlcharrefreplace */
4495 /* generate replacement */
4496 for (collpos
= collstartpos
; collpos
< collendpos
; ++collpos
) {
4497 char buffer
[2+29+1+1];
4499 sprintf(buffer
, "&#%d;", (int)p
[collpos
]);
4500 for (cp
= buffer
; *cp
; ++cp
) {
4501 x
= charmapencode_output(*cp
, mapping
, res
, respos
);
4502 if (x
==enc_EXCEPTION
)
4504 else if (x
==enc_FAILED
) {
4505 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4510 *inpos
= collendpos
;
4513 repunicode
= unicode_encode_call_errorhandler(errors
, errorHandler
,
4514 encoding
, reason
, p
, size
, exceptionObject
,
4515 collstartpos
, collendpos
, &newpos
);
4516 if (repunicode
== NULL
)
4518 /* generate replacement */
4519 repsize
= PyUnicode_GET_SIZE(repunicode
);
4520 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
4521 x
= charmapencode_output(*uni2
, mapping
, res
, respos
);
4522 if (x
==enc_EXCEPTION
) {
4525 else if (x
==enc_FAILED
) {
4526 Py_DECREF(repunicode
);
4527 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4532 Py_DECREF(repunicode
);
4537 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
4543 PyObject
*res
= NULL
;
4544 /* current input position */
4545 Py_ssize_t inpos
= 0;
4546 /* current output position */
4547 Py_ssize_t respos
= 0;
4548 PyObject
*errorHandler
= NULL
;
4549 PyObject
*exc
= NULL
;
4550 /* the following variable is used for caching string comparisons
4551 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4552 * 3=ignore, 4=xmlcharrefreplace */
4553 int known_errorHandler
= -1;
4555 /* Default to Latin-1 */
4556 if (mapping
== NULL
)
4557 return PyUnicode_EncodeLatin1(p
, size
, errors
);
4559 /* allocate enough for a simple encoding without
4560 replacements, if we need more, we'll resize */
4561 res
= PyString_FromStringAndSize(NULL
, size
);
4567 while (inpos
<size
) {
4568 /* try to encode it */
4569 charmapencode_result x
= charmapencode_output(p
[inpos
], mapping
, &res
, &respos
);
4570 if (x
==enc_EXCEPTION
) /* error */
4572 if (x
==enc_FAILED
) { /* unencodable character */
4573 if (charmap_encoding_error(p
, size
, &inpos
, mapping
,
4575 &known_errorHandler
, &errorHandler
, errors
,
4581 /* done with this character => adjust input position */
4585 /* Resize if we allocated too much */
4586 if (respos
<PyString_GET_SIZE(res
)) {
4587 if (_PyString_Resize(&res
, respos
))
4591 Py_XDECREF(errorHandler
);
4597 Py_XDECREF(errorHandler
);
4601 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
4604 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
4605 PyErr_BadArgument();
4608 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
4609 PyUnicode_GET_SIZE(unicode
),
4614 /* create or adjust a UnicodeTranslateError */
4615 static void make_translate_exception(PyObject
**exceptionObject
,
4616 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4617 Py_ssize_t startpos
, Py_ssize_t endpos
,
4620 if (*exceptionObject
== NULL
) {
4621 *exceptionObject
= PyUnicodeTranslateError_Create(
4622 unicode
, size
, startpos
, endpos
, reason
);
4625 if (PyUnicodeTranslateError_SetStart(*exceptionObject
, startpos
))
4627 if (PyUnicodeTranslateError_SetEnd(*exceptionObject
, endpos
))
4629 if (PyUnicodeTranslateError_SetReason(*exceptionObject
, reason
))
4633 Py_DECREF(*exceptionObject
);
4634 *exceptionObject
= NULL
;
4638 /* raises a UnicodeTranslateError */
4639 static void raise_translate_exception(PyObject
**exceptionObject
,
4640 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4641 Py_ssize_t startpos
, Py_ssize_t endpos
,
4644 make_translate_exception(exceptionObject
,
4645 unicode
, size
, startpos
, endpos
, reason
);
4646 if (*exceptionObject
!= NULL
)
4647 PyCodec_StrictErrors(*exceptionObject
);
4650 /* error handling callback helper:
4651 build arguments, call the callback and check the returned tuple,
4652 put the new position into newpos and return the replacement string
4653 (a new reference, which has to be released by the caller) */
4654 static PyObject
*unicode_translate_call_errorhandler(const char *errors
,
4655 PyObject
**errorHandler
,
4657 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
4658 Py_ssize_t startpos
, Py_ssize_t endpos
,
4661 static char *argparse
= "O!n;translating error handler must return (unicode, int) tuple";
4663 Py_ssize_t i_newpos
;
4665 PyObject
*resunicode
;
4667 if (*errorHandler
== NULL
) {
4668 *errorHandler
= PyCodec_LookupError(errors
);
4669 if (*errorHandler
== NULL
)
4673 make_translate_exception(exceptionObject
,
4674 unicode
, size
, startpos
, endpos
, reason
);
4675 if (*exceptionObject
== NULL
)
4678 restuple
= PyObject_CallFunctionObjArgs(
4679 *errorHandler
, *exceptionObject
, NULL
);
4680 if (restuple
== NULL
)
4682 if (!PyTuple_Check(restuple
)) {
4683 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
4684 Py_DECREF(restuple
);
4687 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
4688 &resunicode
, &i_newpos
)) {
4689 Py_DECREF(restuple
);
4693 *newpos
= size
+i_newpos
;
4696 if (*newpos
<0 || *newpos
>size
) {
4697 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
4698 Py_DECREF(restuple
);
4701 Py_INCREF(resunicode
);
4702 Py_DECREF(restuple
);
4706 /* Look up the character c in the mapping and put the result in result,
4707 which must be decrefed by the caller.
4708 Return 0 on success, -1 on error */
4710 int charmaptranslate_lookup(Py_UNICODE c
, PyObject
*mapping
, PyObject
**result
)
4712 PyObject
*w
= PyInt_FromLong((long)c
);
4717 x
= PyObject_GetItem(mapping
, w
);
4720 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4721 /* No mapping found means: use 1:1 mapping. */
4728 else if (x
== Py_None
) {
4732 else if (PyInt_Check(x
)) {
4733 long value
= PyInt_AS_LONG(x
);
4734 long max
= PyUnicode_GetMax();
4735 if (value
< 0 || value
> max
) {
4736 PyErr_Format(PyExc_TypeError
,
4737 "character mapping must be in range(0x%lx)", max
+1);
4744 else if (PyUnicode_Check(x
)) {
4749 /* wrong return value */
4750 PyErr_SetString(PyExc_TypeError
,
4751 "character mapping must return integer, None or unicode");
4756 /* ensure that *outobj is at least requiredsize characters long,
4757 if not reallocate and adjust various state variables.
4758 Return 0 on success, -1 on error */
4760 int charmaptranslate_makespace(PyObject
**outobj
, Py_UNICODE
**outp
,
4761 Py_ssize_t requiredsize
)
4763 Py_ssize_t oldsize
= PyUnicode_GET_SIZE(*outobj
);
4764 if (requiredsize
> oldsize
) {
4765 /* remember old output position */
4766 Py_ssize_t outpos
= *outp
-PyUnicode_AS_UNICODE(*outobj
);
4767 /* exponentially overallocate to minimize reallocations */
4768 if (requiredsize
< 2 * oldsize
)
4769 requiredsize
= 2 * oldsize
;
4770 if (_PyUnicode_Resize(outobj
, requiredsize
) < 0)
4772 *outp
= PyUnicode_AS_UNICODE(*outobj
) + outpos
;
4776 /* lookup the character, put the result in the output string and adjust
4777 various state variables. Return a new reference to the object that
4778 was put in the output buffer in *result, or Py_None, if the mapping was
4779 undefined (in which case no character was written).
4780 The caller must decref result.
4781 Return 0 on success, -1 on error. */
4783 int charmaptranslate_output(const Py_UNICODE
*startinp
, const Py_UNICODE
*curinp
,
4784 Py_ssize_t insize
, PyObject
*mapping
, PyObject
**outobj
, Py_UNICODE
**outp
,
4787 if (charmaptranslate_lookup(*curinp
, mapping
, res
))
4790 /* not found => default to 1:1 mapping */
4791 *(*outp
)++ = *curinp
;
4793 else if (*res
==Py_None
)
4795 else if (PyInt_Check(*res
)) {
4796 /* no overflow check, because we know that the space is enough */
4797 *(*outp
)++ = (Py_UNICODE
)PyInt_AS_LONG(*res
);
4799 else if (PyUnicode_Check(*res
)) {
4800 Py_ssize_t repsize
= PyUnicode_GET_SIZE(*res
);
4802 /* no overflow check, because we know that the space is enough */
4803 *(*outp
)++ = *PyUnicode_AS_UNICODE(*res
);
4805 else if (repsize
!=0) {
4806 /* more than one character */
4807 Py_ssize_t requiredsize
= (*outp
-PyUnicode_AS_UNICODE(*outobj
)) +
4808 (insize
- (curinp
-startinp
)) +
4810 if (charmaptranslate_makespace(outobj
, outp
, requiredsize
))
4812 memcpy(*outp
, PyUnicode_AS_UNICODE(*res
), sizeof(Py_UNICODE
)*repsize
);
4821 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*p
,
4827 PyObject
*res
= NULL
;
4828 /* pointers to the beginning and end+1 of input */
4829 const Py_UNICODE
*startp
= p
;
4830 const Py_UNICODE
*endp
= p
+ size
;
4831 /* pointer into the output */
4833 /* current output position */
4834 Py_ssize_t respos
= 0;
4835 char *reason
= "character maps to <undefined>";
4836 PyObject
*errorHandler
= NULL
;
4837 PyObject
*exc
= NULL
;
4838 /* the following variable is used for caching string comparisons
4839 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4840 * 3=ignore, 4=xmlcharrefreplace */
4841 int known_errorHandler
= -1;
4843 if (mapping
== NULL
) {
4844 PyErr_BadArgument();
4848 /* allocate enough for a simple 1:1 translation without
4849 replacements, if we need more, we'll resize */
4850 res
= PyUnicode_FromUnicode(NULL
, size
);
4855 str
= PyUnicode_AS_UNICODE(res
);
4858 /* try to encode it */
4860 if (charmaptranslate_output(startp
, p
, size
, mapping
, &res
, &str
, &x
)) {
4865 if (x
!=Py_None
) /* it worked => adjust input pointer */
4867 else { /* untranslatable character */
4868 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
4872 /* startpos for collecting untranslatable chars */
4873 const Py_UNICODE
*collstart
= p
;
4874 const Py_UNICODE
*collend
= p
+1;
4875 const Py_UNICODE
*coll
;
4877 /* find all untranslatable characters */
4878 while (collend
< endp
) {
4879 if (charmaptranslate_lookup(*collend
, mapping
, &x
))
4886 /* cache callback name lookup
4887 * (if not done yet, i.e. it's the first error) */
4888 if (known_errorHandler
==-1) {
4889 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4890 known_errorHandler
= 1;
4891 else if (!strcmp(errors
, "replace"))
4892 known_errorHandler
= 2;
4893 else if (!strcmp(errors
, "ignore"))
4894 known_errorHandler
= 3;
4895 else if (!strcmp(errors
, "xmlcharrefreplace"))
4896 known_errorHandler
= 4;
4898 known_errorHandler
= 0;
4900 switch (known_errorHandler
) {
4901 case 1: /* strict */
4902 raise_translate_exception(&exc
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
4904 case 2: /* replace */
4905 /* No need to check for space, this is a 1:1 replacement */
4906 for (coll
= collstart
; coll
<collend
; ++coll
)
4909 case 3: /* ignore */
4912 case 4: /* xmlcharrefreplace */
4913 /* generate replacement (temporarily (mis)uses p) */
4914 for (p
= collstart
; p
< collend
; ++p
) {
4915 char buffer
[2+29+1+1];
4917 sprintf(buffer
, "&#%d;", (int)*p
);
4918 if (charmaptranslate_makespace(&res
, &str
,
4919 (str
-PyUnicode_AS_UNICODE(res
))+strlen(buffer
)+(endp
-collend
)))
4921 for (cp
= buffer
; *cp
; ++cp
)
4927 repunicode
= unicode_translate_call_errorhandler(errors
, &errorHandler
,
4928 reason
, startp
, size
, &exc
,
4929 collstart
-startp
, collend
-startp
, &newpos
);
4930 if (repunicode
== NULL
)
4932 /* generate replacement */
4933 repsize
= PyUnicode_GET_SIZE(repunicode
);
4934 if (charmaptranslate_makespace(&res
, &str
,
4935 (str
-PyUnicode_AS_UNICODE(res
))+repsize
+(endp
-collend
))) {
4936 Py_DECREF(repunicode
);
4939 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
)
4941 p
= startp
+ newpos
;
4942 Py_DECREF(repunicode
);
4946 /* Resize if we allocated too much */
4947 respos
= str
-PyUnicode_AS_UNICODE(res
);
4948 if (respos
<PyUnicode_GET_SIZE(res
)) {
4949 if (_PyUnicode_Resize(&res
, respos
) < 0)
4953 Py_XDECREF(errorHandler
);
4959 Py_XDECREF(errorHandler
);
4963 PyObject
*PyUnicode_Translate(PyObject
*str
,
4969 str
= PyUnicode_FromObject(str
);
4972 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
4973 PyUnicode_GET_SIZE(str
),
4984 /* --- Decimal Encoder ---------------------------------------------------- */
4986 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
4991 Py_UNICODE
*p
, *end
;
4992 PyObject
*errorHandler
= NULL
;
4993 PyObject
*exc
= NULL
;
4994 const char *encoding
= "decimal";
4995 const char *reason
= "invalid decimal Unicode string";
4996 /* the following variable is used for caching string comparisons
4997 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4998 int known_errorHandler
= -1;
5000 if (output
== NULL
) {
5001 PyErr_BadArgument();
5008 register Py_UNICODE ch
= *p
;
5010 PyObject
*repunicode
;
5014 Py_UNICODE
*collstart
;
5015 Py_UNICODE
*collend
;
5017 if (Py_UNICODE_ISSPACE(ch
)) {
5022 decimal
= Py_UNICODE_TODECIMAL(ch
);
5024 *output
++ = '0' + decimal
;
5028 if (0 < ch
&& ch
< 256) {
5029 *output
++ = (char)ch
;
5033 /* All other characters are considered unencodable */
5036 while (collend
< end
) {
5037 if ((0 < *collend
&& *collend
< 256) ||
5038 !Py_UNICODE_ISSPACE(*collend
) ||
5039 Py_UNICODE_TODECIMAL(*collend
))
5042 /* cache callback name lookup
5043 * (if not done yet, i.e. it's the first error) */
5044 if (known_errorHandler
==-1) {
5045 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
5046 known_errorHandler
= 1;
5047 else if (!strcmp(errors
, "replace"))
5048 known_errorHandler
= 2;
5049 else if (!strcmp(errors
, "ignore"))
5050 known_errorHandler
= 3;
5051 else if (!strcmp(errors
, "xmlcharrefreplace"))
5052 known_errorHandler
= 4;
5054 known_errorHandler
= 0;
5056 switch (known_errorHandler
) {
5057 case 1: /* strict */
5058 raise_encode_exception(&exc
, encoding
, s
, length
, collstart
-s
, collend
-s
, reason
);
5060 case 2: /* replace */
5061 for (p
= collstart
; p
< collend
; ++p
)
5064 case 3: /* ignore */
5067 case 4: /* xmlcharrefreplace */
5068 /* generate replacement (temporarily (mis)uses p) */
5069 for (p
= collstart
; p
< collend
; ++p
)
5070 output
+= sprintf(output
, "&#%d;", (int)*p
);
5074 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
5075 encoding
, reason
, s
, length
, &exc
,
5076 collstart
-s
, collend
-s
, &newpos
);
5077 if (repunicode
== NULL
)
5079 /* generate replacement */
5080 repsize
= PyUnicode_GET_SIZE(repunicode
);
5081 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
5082 Py_UNICODE ch
= *uni2
;
5083 if (Py_UNICODE_ISSPACE(ch
))
5086 decimal
= Py_UNICODE_TODECIMAL(ch
);
5088 *output
++ = '0' + decimal
;
5089 else if (0 < ch
&& ch
< 256)
5090 *output
++ = (char)ch
;
5092 Py_DECREF(repunicode
);
5093 raise_encode_exception(&exc
, encoding
,
5094 s
, length
, collstart
-s
, collend
-s
, reason
);
5100 Py_DECREF(repunicode
);
5103 /* 0-terminate the output string */
5106 Py_XDECREF(errorHandler
);
5111 Py_XDECREF(errorHandler
);
5115 /* --- Helpers ------------------------------------------------------------ */
5117 #include "stringlib/unicodedefs.h"
5119 #define FROM_UNICODE
5121 #include "stringlib/fastsearch.h"
5123 #include "stringlib/count.h"
5124 #include "stringlib/find.h"
5125 #include "stringlib/partition.h"
5127 /* helper macro to fixup start/end slice values */
5128 #define FIX_START_END(obj) \
5130 start += (obj)->length; \
5133 if (end > (obj)->length) \
5134 end = (obj)->length; \
5136 end += (obj)->length; \
5140 Py_ssize_t
PyUnicode_Count(PyObject
*str
,
5146 PyUnicodeObject
* str_obj
;
5147 PyUnicodeObject
* sub_obj
;
5149 str_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(str
);
5152 sub_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(substr
);
5158 FIX_START_END(str_obj
);
5160 result
= stringlib_count(
5161 str_obj
->str
+ start
, end
- start
, sub_obj
->str
, sub_obj
->length
5170 Py_ssize_t
PyUnicode_Find(PyObject
*str
,
5178 str
= PyUnicode_FromObject(str
);
5181 sub
= PyUnicode_FromObject(sub
);
5188 result
= stringlib_find_slice(
5189 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5190 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5194 result
= stringlib_rfind_slice(
5195 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5196 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5207 int tailmatch(PyUnicodeObject
*self
,
5208 PyUnicodeObject
*substring
,
5213 if (substring
->length
== 0)
5216 FIX_START_END(self
);
5218 end
-= substring
->length
;
5222 if (direction
> 0) {
5223 if (Py_UNICODE_MATCH(self
, end
, substring
))
5226 if (Py_UNICODE_MATCH(self
, start
, substring
))
5233 Py_ssize_t
PyUnicode_Tailmatch(PyObject
*str
,
5241 str
= PyUnicode_FromObject(str
);
5244 substr
= PyUnicode_FromObject(substr
);
5245 if (substr
== NULL
) {
5250 result
= tailmatch((PyUnicodeObject
*)str
,
5251 (PyUnicodeObject
*)substr
,
5252 start
, end
, direction
);
5258 /* Apply fixfct filter to the Unicode object self and return a
5259 reference to the modified object */
5262 PyObject
*fixup(PyUnicodeObject
*self
,
5263 int (*fixfct
)(PyUnicodeObject
*s
))
5268 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5272 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5274 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
5275 /* fixfct should return TRUE if it modified the buffer. If
5276 FALSE, return a reference to the original buffer instead
5277 (to save space, not time) */
5280 return (PyObject
*) self
;
5282 return (PyObject
*) u
;
5286 int fixupper(PyUnicodeObject
*self
)
5288 Py_ssize_t len
= self
->length
;
5289 Py_UNICODE
*s
= self
->str
;
5293 register Py_UNICODE ch
;
5295 ch
= Py_UNICODE_TOUPPER(*s
);
5307 int fixlower(PyUnicodeObject
*self
)
5309 Py_ssize_t len
= self
->length
;
5310 Py_UNICODE
*s
= self
->str
;
5314 register Py_UNICODE ch
;
5316 ch
= Py_UNICODE_TOLOWER(*s
);
5328 int fixswapcase(PyUnicodeObject
*self
)
5330 Py_ssize_t len
= self
->length
;
5331 Py_UNICODE
*s
= self
->str
;
5335 if (Py_UNICODE_ISUPPER(*s
)) {
5336 *s
= Py_UNICODE_TOLOWER(*s
);
5338 } else if (Py_UNICODE_ISLOWER(*s
)) {
5339 *s
= Py_UNICODE_TOUPPER(*s
);
5349 int fixcapitalize(PyUnicodeObject
*self
)
5351 Py_ssize_t len
= self
->length
;
5352 Py_UNICODE
*s
= self
->str
;
5357 if (Py_UNICODE_ISLOWER(*s
)) {
5358 *s
= Py_UNICODE_TOUPPER(*s
);
5363 if (Py_UNICODE_ISUPPER(*s
)) {
5364 *s
= Py_UNICODE_TOLOWER(*s
);
5373 int fixtitle(PyUnicodeObject
*self
)
5375 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5376 register Py_UNICODE
*e
;
5377 int previous_is_cased
;
5379 /* Shortcut for single character strings */
5380 if (PyUnicode_GET_SIZE(self
) == 1) {
5381 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
5390 e
= p
+ PyUnicode_GET_SIZE(self
);
5391 previous_is_cased
= 0;
5392 for (; p
< e
; p
++) {
5393 register const Py_UNICODE ch
= *p
;
5395 if (previous_is_cased
)
5396 *p
= Py_UNICODE_TOLOWER(ch
);
5398 *p
= Py_UNICODE_TOTITLE(ch
);
5400 if (Py_UNICODE_ISLOWER(ch
) ||
5401 Py_UNICODE_ISUPPER(ch
) ||
5402 Py_UNICODE_ISTITLE(ch
))
5403 previous_is_cased
= 1;
5405 previous_is_cased
= 0;
5411 PyUnicode_Join(PyObject
*separator
, PyObject
*seq
)
5413 PyObject
*internal_separator
= NULL
;
5414 const Py_UNICODE blank
= ' ';
5415 const Py_UNICODE
*sep
= &blank
;
5416 Py_ssize_t seplen
= 1;
5417 PyUnicodeObject
*res
= NULL
; /* the result */
5418 Py_ssize_t res_alloc
= 100; /* # allocated bytes for string in res */
5419 Py_ssize_t res_used
; /* # used bytes */
5420 Py_UNICODE
*res_p
; /* pointer to free byte in res's string area */
5421 PyObject
*fseq
; /* PySequence_Fast(seq) */
5422 Py_ssize_t seqlen
; /* len(fseq) -- number of items in sequence */
5426 fseq
= PySequence_Fast(seq
, "");
5431 /* Grrrr. A codec may be invoked to convert str objects to
5432 * Unicode, and so it's possible to call back into Python code
5433 * during PyUnicode_FromObject(), and so it's possible for a sick
5434 * codec to change the size of fseq (if seq is a list). Therefore
5435 * we have to keep refetching the size -- can't assume seqlen
5438 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5439 /* If empty sequence, return u"". */
5441 res
= _PyUnicode_New(0); /* empty sequence; return u"" */
5444 /* If singleton sequence with an exact Unicode, return that. */
5446 item
= PySequence_Fast_GET_ITEM(fseq
, 0);
5447 if (PyUnicode_CheckExact(item
)) {
5449 res
= (PyUnicodeObject
*)item
;
5454 /* At least two items to join, or one that isn't exact Unicode. */
5456 /* Set up sep and seplen -- they're needed. */
5457 if (separator
== NULL
) {
5462 internal_separator
= PyUnicode_FromObject(separator
);
5463 if (internal_separator
== NULL
)
5465 sep
= PyUnicode_AS_UNICODE(internal_separator
);
5466 seplen
= PyUnicode_GET_SIZE(internal_separator
);
5467 /* In case PyUnicode_FromObject() mutated seq. */
5468 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5473 res
= _PyUnicode_New(res_alloc
);
5476 res_p
= PyUnicode_AS_UNICODE(res
);
5479 for (i
= 0; i
< seqlen
; ++i
) {
5481 Py_ssize_t new_res_used
;
5483 item
= PySequence_Fast_GET_ITEM(fseq
, i
);
5484 /* Convert item to Unicode. */
5485 if (! PyUnicode_Check(item
) && ! PyString_Check(item
)) {
5486 PyErr_Format(PyExc_TypeError
,
5487 "sequence item %zd: expected string or Unicode,"
5489 i
, Py_TYPE(item
)->tp_name
);
5492 item
= PyUnicode_FromObject(item
);
5495 /* We own a reference to item from here on. */
5497 /* In case PyUnicode_FromObject() mutated seq. */
5498 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5500 /* Make sure we have enough space for the separator and the item. */
5501 itemlen
= PyUnicode_GET_SIZE(item
);
5502 new_res_used
= res_used
+ itemlen
;
5503 if (new_res_used
< 0)
5505 if (i
< seqlen
- 1) {
5506 new_res_used
+= seplen
;
5507 if (new_res_used
< 0)
5510 if (new_res_used
> res_alloc
) {
5511 /* double allocated size until it's big enough */
5513 res_alloc
+= res_alloc
;
5516 } while (new_res_used
> res_alloc
);
5517 if (_PyUnicode_Resize(&res
, res_alloc
) < 0) {
5521 res_p
= PyUnicode_AS_UNICODE(res
) + res_used
;
5524 /* Copy item, and maybe the separator. */
5525 Py_UNICODE_COPY(res_p
, PyUnicode_AS_UNICODE(item
), itemlen
);
5527 if (i
< seqlen
- 1) {
5528 Py_UNICODE_COPY(res_p
, sep
, seplen
);
5532 res_used
= new_res_used
;
5535 /* Shrink res to match the used area; this probably can't fail,
5536 * but it's cheap to check.
5538 if (_PyUnicode_Resize(&res
, res_used
) < 0)
5542 Py_XDECREF(internal_separator
);
5544 return (PyObject
*)res
;
5547 PyErr_SetString(PyExc_OverflowError
,
5548 "join() result is too long for a Python string");
5553 Py_XDECREF(internal_separator
);
5560 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
5572 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
5577 u
= _PyUnicode_New(left
+ self
->length
+ right
);
5580 Py_UNICODE_FILL(u
->str
, fill
, left
);
5581 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
5583 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
5589 #define SPLIT_APPEND(data, left, right) \
5590 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
5593 if (PyList_Append(list, str)) { \
5601 PyObject
*split_whitespace(PyUnicodeObject
*self
,
5603 Py_ssize_t maxcount
)
5605 register Py_ssize_t i
;
5606 register Py_ssize_t j
;
5607 Py_ssize_t len
= self
->length
;
5609 register const Py_UNICODE
*buf
= self
->str
;
5611 for (i
= j
= 0; i
< len
; ) {
5613 while (i
< len
&& Py_UNICODE_ISSPACE(buf
[i
]))
5616 while (i
< len
&& !Py_UNICODE_ISSPACE(buf
[i
]))
5619 if (maxcount
-- <= 0)
5621 SPLIT_APPEND(buf
, j
, i
);
5622 while (i
< len
&& Py_UNICODE_ISSPACE(buf
[i
]))
5628 SPLIT_APPEND(buf
, j
, len
);
5637 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
5640 register Py_ssize_t i
;
5641 register Py_ssize_t j
;
5647 string
= PyUnicode_FromObject(string
);
5650 data
= PyUnicode_AS_UNICODE(string
);
5651 len
= PyUnicode_GET_SIZE(string
);
5653 list
= PyList_New(0);
5657 for (i
= j
= 0; i
< len
; ) {
5660 /* Find a line and append it */
5661 while (i
< len
&& !BLOOM_LINEBREAK(data
[i
]))
5664 /* Skip the line break reading CRLF as one line break */
5667 if (data
[i
] == '\r' && i
+ 1 < len
&&
5675 SPLIT_APPEND(data
, j
, eol
);
5679 SPLIT_APPEND(data
, j
, len
);
5692 PyObject
*split_char(PyUnicodeObject
*self
,
5695 Py_ssize_t maxcount
)
5697 register Py_ssize_t i
;
5698 register Py_ssize_t j
;
5699 Py_ssize_t len
= self
->length
;
5701 register const Py_UNICODE
*buf
= self
->str
;
5703 for (i
= j
= 0; i
< len
; ) {
5705 if (maxcount
-- <= 0)
5707 SPLIT_APPEND(buf
, j
, i
);
5713 SPLIT_APPEND(buf
, j
, len
);
5723 PyObject
*split_substring(PyUnicodeObject
*self
,
5725 PyUnicodeObject
*substring
,
5726 Py_ssize_t maxcount
)
5728 register Py_ssize_t i
;
5729 register Py_ssize_t j
;
5730 Py_ssize_t len
= self
->length
;
5731 Py_ssize_t sublen
= substring
->length
;
5734 for (i
= j
= 0; i
<= len
- sublen
; ) {
5735 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
5736 if (maxcount
-- <= 0)
5738 SPLIT_APPEND(self
->str
, j
, i
);
5744 SPLIT_APPEND(self
->str
, j
, len
);
5754 PyObject
*rsplit_whitespace(PyUnicodeObject
*self
,
5756 Py_ssize_t maxcount
)
5758 register Py_ssize_t i
;
5759 register Py_ssize_t j
;
5760 Py_ssize_t len
= self
->length
;
5762 register const Py_UNICODE
*buf
= self
->str
;
5764 for (i
= j
= len
- 1; i
>= 0; ) {
5766 while (i
>= 0 && Py_UNICODE_ISSPACE(buf
[i
]))
5769 while (i
>= 0 && !Py_UNICODE_ISSPACE(buf
[i
]))
5772 if (maxcount
-- <= 0)
5774 SPLIT_APPEND(buf
, i
+ 1, j
+ 1);
5775 while (i
>= 0 && Py_UNICODE_ISSPACE(buf
[i
]))
5781 SPLIT_APPEND(buf
, 0, j
+ 1);
5783 if (PyList_Reverse(list
) < 0)
5793 PyObject
*rsplit_char(PyUnicodeObject
*self
,
5796 Py_ssize_t maxcount
)
5798 register Py_ssize_t i
;
5799 register Py_ssize_t j
;
5800 Py_ssize_t len
= self
->length
;
5802 register const Py_UNICODE
*buf
= self
->str
;
5804 for (i
= j
= len
- 1; i
>= 0; ) {
5806 if (maxcount
-- <= 0)
5808 SPLIT_APPEND(buf
, i
+ 1, j
+ 1);
5814 SPLIT_APPEND(buf
, 0, j
+ 1);
5816 if (PyList_Reverse(list
) < 0)
5826 PyObject
*rsplit_substring(PyUnicodeObject
*self
,
5828 PyUnicodeObject
*substring
,
5829 Py_ssize_t maxcount
)
5831 register Py_ssize_t i
;
5832 register Py_ssize_t j
;
5833 Py_ssize_t len
= self
->length
;
5834 Py_ssize_t sublen
= substring
->length
;
5837 for (i
= len
- sublen
, j
= len
; i
>= 0; ) {
5838 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
5839 if (maxcount
-- <= 0)
5841 SPLIT_APPEND(self
->str
, i
+ sublen
, j
);
5848 SPLIT_APPEND(self
->str
, 0, j
);
5850 if (PyList_Reverse(list
) < 0)
5862 PyObject
*split(PyUnicodeObject
*self
,
5863 PyUnicodeObject
*substring
,
5864 Py_ssize_t maxcount
)
5869 maxcount
= PY_SSIZE_T_MAX
;
5871 list
= PyList_New(0);
5875 if (substring
== NULL
)
5876 return split_whitespace(self
,list
,maxcount
);
5878 else if (substring
->length
== 1)
5879 return split_char(self
,list
,substring
->str
[0],maxcount
);
5881 else if (substring
->length
== 0) {
5883 PyErr_SetString(PyExc_ValueError
, "empty separator");
5887 return split_substring(self
,list
,substring
,maxcount
);
5891 PyObject
*rsplit(PyUnicodeObject
*self
,
5892 PyUnicodeObject
*substring
,
5893 Py_ssize_t maxcount
)
5898 maxcount
= PY_SSIZE_T_MAX
;
5900 list
= PyList_New(0);
5904 if (substring
== NULL
)
5905 return rsplit_whitespace(self
,list
,maxcount
);
5907 else if (substring
->length
== 1)
5908 return rsplit_char(self
,list
,substring
->str
[0],maxcount
);
5910 else if (substring
->length
== 0) {
5912 PyErr_SetString(PyExc_ValueError
, "empty separator");
5916 return rsplit_substring(self
,list
,substring
,maxcount
);
5920 PyObject
*replace(PyUnicodeObject
*self
,
5921 PyUnicodeObject
*str1
,
5922 PyUnicodeObject
*str2
,
5923 Py_ssize_t maxcount
)
5928 maxcount
= PY_SSIZE_T_MAX
;
5930 if (str1
->length
== str2
->length
) {
5933 if (str1
->length
== 1) {
5934 /* replace characters */
5936 if (!findchar(self
->str
, self
->length
, str1
->str
[0]))
5938 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5941 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5944 for (i
= 0; i
< u
->length
; i
++)
5945 if (u
->str
[i
] == u1
) {
5952 self
->str
, self
->length
, str1
->str
, str1
->length
, FAST_SEARCH
5956 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5959 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5960 while (i
<= self
->length
- str1
->length
)
5961 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
5964 Py_UNICODE_COPY(u
->str
+i
, str2
->str
, str2
->length
);
5971 Py_ssize_t n
, i
, j
, e
;
5972 Py_ssize_t product
, new_size
, delta
;
5975 /* replace strings */
5976 n
= stringlib_count(self
->str
, self
->length
, str1
->str
, str1
->length
);
5981 /* new_size = self->length + n * (str2->length - str1->length)); */
5982 delta
= (str2
->length
- str1
->length
);
5984 new_size
= self
->length
;
5986 product
= n
* (str2
->length
- str1
->length
);
5987 if ((product
/ (str2
->length
- str1
->length
)) != n
) {
5988 PyErr_SetString(PyExc_OverflowError
,
5989 "replace string is too long");
5992 new_size
= self
->length
+ product
;
5994 PyErr_SetString(PyExc_OverflowError
,
5995 "replace string is too long");
5999 u
= _PyUnicode_New(new_size
);
6004 e
= self
->length
- str1
->length
;
6005 if (str1
->length
> 0) {
6007 /* look for next match */
6010 if (Py_UNICODE_MATCH(self
, j
, str1
))
6017 /* copy unchanged part [i:j] */
6018 Py_UNICODE_COPY(p
, self
->str
+i
, j
-i
);
6021 /* copy substitution string */
6022 if (str2
->length
> 0) {
6023 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
6026 i
= j
+ str1
->length
;
6028 if (i
< self
->length
)
6029 /* copy tail [i:] */
6030 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
6034 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
6038 *p
++ = self
->str
[i
++];
6040 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
6043 return (PyObject
*) u
;
6046 /* nothing to replace; return original string (when possible) */
6047 if (PyUnicode_CheckExact(self
)) {
6049 return (PyObject
*) self
;
6051 return PyUnicode_FromUnicode(self
->str
, self
->length
);
6054 /* --- Unicode Object Methods --------------------------------------------- */
6056 PyDoc_STRVAR(title__doc__
,
6057 "S.title() -> unicode\n\
6059 Return a titlecased version of S, i.e. words start with title case\n\
6060 characters, all remaining cased characters have lower case.");
6063 unicode_title(PyUnicodeObject
*self
)
6065 return fixup(self
, fixtitle
);
6068 PyDoc_STRVAR(capitalize__doc__
,
6069 "S.capitalize() -> unicode\n\
6071 Return a capitalized version of S, i.e. make the first character\n\
6075 unicode_capitalize(PyUnicodeObject
*self
)
6077 return fixup(self
, fixcapitalize
);
6081 PyDoc_STRVAR(capwords__doc__
,
6082 "S.capwords() -> unicode\n\
6084 Apply .capitalize() to all words in S and return the result with\n\
6085 normalized whitespace (all whitespace strings are replaced by ' ').");
6088 unicode_capwords(PyUnicodeObject
*self
)
6094 /* Split into words */
6095 list
= split(self
, NULL
, -1);
6099 /* Capitalize each word */
6100 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
6101 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
6105 Py_DECREF(PyList_GET_ITEM(list
, i
));
6106 PyList_SET_ITEM(list
, i
, item
);
6109 /* Join the words to form a new string */
6110 item
= PyUnicode_Join(NULL
, list
);
6114 return (PyObject
*)item
;
6118 /* Argument converter. Coerces to a single unicode character */
6121 convert_uc(PyObject
*obj
, void *addr
)
6123 Py_UNICODE
*fillcharloc
= (Py_UNICODE
*)addr
;
6127 uniobj
= PyUnicode_FromObject(obj
);
6128 if (uniobj
== NULL
) {
6129 PyErr_SetString(PyExc_TypeError
,
6130 "The fill character cannot be converted to Unicode");
6133 if (PyUnicode_GET_SIZE(uniobj
) != 1) {
6134 PyErr_SetString(PyExc_TypeError
,
6135 "The fill character must be exactly one character long");
6139 unistr
= PyUnicode_AS_UNICODE(uniobj
);
6140 *fillcharloc
= unistr
[0];
6145 PyDoc_STRVAR(center__doc__
,
6146 "S.center(width[, fillchar]) -> unicode\n\
6148 Return S centered in a Unicode string of length width. Padding is\n\
6149 done using the specified fill character (default is a space)");
6152 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
6154 Py_ssize_t marg
, left
;
6156 Py_UNICODE fillchar
= ' ';
6158 if (!PyArg_ParseTuple(args
, "n|O&:center", &width
, convert_uc
, &fillchar
))
6161 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6163 return (PyObject
*) self
;
6166 marg
= width
- self
->length
;
6167 left
= marg
/ 2 + (marg
& width
& 1);
6169 return (PyObject
*) pad(self
, left
, marg
- left
, fillchar
);
6174 /* This code should go into some future Unicode collation support
6175 module. The basic comparison should compare ordinals on a naive
6176 basis (this is what Java does and thus JPython too). */
6178 /* speedy UTF-16 code point order comparison */
6180 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6182 static short utf16Fixup
[32] =
6184 0, 0, 0, 0, 0, 0, 0, 0,
6185 0, 0, 0, 0, 0, 0, 0, 0,
6186 0, 0, 0, 0, 0, 0, 0, 0,
6187 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6191 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6193 Py_ssize_t len1
, len2
;
6195 Py_UNICODE
*s1
= str1
->str
;
6196 Py_UNICODE
*s2
= str2
->str
;
6198 len1
= str1
->length
;
6199 len2
= str2
->length
;
6201 while (len1
> 0 && len2
> 0) {
6207 if (c1
> (1<<11) * 26)
6208 c1
+= utf16Fixup
[c1
>>11];
6209 if (c2
> (1<<11) * 26)
6210 c2
+= utf16Fixup
[c2
>>11];
6211 /* now c1 and c2 are in UTF-32-compatible order */
6214 return (c1
< c2
) ? -1 : 1;
6219 return (len1
< len2
) ? -1 : (len1
!= len2
);
6225 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6227 register Py_ssize_t len1
, len2
;
6229 Py_UNICODE
*s1
= str1
->str
;
6230 Py_UNICODE
*s2
= str2
->str
;
6232 len1
= str1
->length
;
6233 len2
= str2
->length
;
6235 while (len1
> 0 && len2
> 0) {
6242 return (c1
< c2
) ? -1 : 1;
6247 return (len1
< len2
) ? -1 : (len1
!= len2
);
6252 int PyUnicode_Compare(PyObject
*left
,
6255 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
6258 /* Coerce the two arguments */
6259 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6262 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6266 /* Shortcut for empty or interned objects */
6273 result
= unicode_compare(u
, v
);
6285 PyObject
*PyUnicode_RichCompare(PyObject
*left
,
6291 result
= PyUnicode_Compare(left
, right
);
6292 if (result
== -1 && PyErr_Occurred())
6295 /* Convert the return value to a Boolean */
6298 result
= (result
== 0);
6301 result
= (result
!= 0);
6304 result
= (result
<= 0);
6307 result
= (result
>= 0);
6310 result
= (result
== -1);
6313 result
= (result
== 1);
6316 return PyBool_FromLong(result
);
6322 Type errors mean that PyUnicode_FromObject() could not convert
6323 one of the arguments (usually the right hand side) to Unicode,
6324 ie. we can't handle the comparison request. However, it is
6325 possible that the other object knows a comparison method, which
6326 is why we return Py_NotImplemented to give the other object a
6330 if (PyErr_ExceptionMatches(PyExc_TypeError
)) {
6332 Py_INCREF(Py_NotImplemented
);
6333 return Py_NotImplemented
;
6335 if (op
!= Py_EQ
&& op
!= Py_NE
)
6338 /* Equality comparison.
6340 This is a special case: we silence any PyExc_UnicodeDecodeError
6341 and instead turn it into a PyExc_UnicodeWarning.
6344 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError
))
6347 if (PyErr_Warn(PyExc_UnicodeWarning
,
6349 "Unicode equal comparison "
6350 "failed to convert both arguments to Unicode - "
6351 "interpreting them as being unequal" :
6352 "Unicode unequal comparison "
6353 "failed to convert both arguments to Unicode - "
6354 "interpreting them as being unequal"
6357 result
= (op
== Py_NE
);
6358 return PyBool_FromLong(result
);
6361 int PyUnicode_Contains(PyObject
*container
,
6364 PyObject
*str
, *sub
;
6367 /* Coerce the two arguments */
6368 sub
= PyUnicode_FromObject(element
);
6370 PyErr_SetString(PyExc_TypeError
,
6371 "'in <string>' requires string as left operand");
6375 str
= PyUnicode_FromObject(container
);
6381 result
= stringlib_contains_obj(str
, sub
);
6389 /* Concat to string or Unicode object giving a new Unicode object. */
6391 PyObject
*PyUnicode_Concat(PyObject
*left
,
6394 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
6396 /* Coerce the two arguments */
6397 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6400 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6405 if (v
== unicode_empty
) {
6407 return (PyObject
*)u
;
6409 if (u
== unicode_empty
) {
6411 return (PyObject
*)v
;
6414 /* Concat the two Unicode strings */
6415 w
= _PyUnicode_New(u
->length
+ v
->length
);
6418 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
6419 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
6423 return (PyObject
*)w
;
6431 PyDoc_STRVAR(count__doc__
,
6432 "S.count(sub[, start[, end]]) -> int\n\
6434 Return the number of non-overlapping occurrences of substring sub in\n\
6435 Unicode string S[start:end]. Optional arguments start and end are\n\
6436 interpreted as in slice notation.");
6439 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
6441 PyUnicodeObject
*substring
;
6442 Py_ssize_t start
= 0;
6443 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6446 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
6447 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6450 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
6451 (PyObject
*)substring
);
6452 if (substring
== NULL
)
6455 FIX_START_END(self
);
6457 result
= PyInt_FromSsize_t(
6458 stringlib_count(self
->str
+ start
, end
- start
,
6459 substring
->str
, substring
->length
)
6462 Py_DECREF(substring
);
6467 PyDoc_STRVAR(encode__doc__
,
6468 "S.encode([encoding[,errors]]) -> string or unicode\n\
6470 Encodes S using the codec registered for encoding. encoding defaults\n\
6471 to the default encoding. errors may be given to set a different error\n\
6472 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6473 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6474 'xmlcharrefreplace' as well as any other name registered with\n\
6475 codecs.register_error that can handle UnicodeEncodeErrors.");
6478 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
6480 char *encoding
= NULL
;
6481 char *errors
= NULL
;
6484 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
6486 v
= PyUnicode_AsEncodedObject((PyObject
*)self
, encoding
, errors
);
6489 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6490 PyErr_Format(PyExc_TypeError
,
6491 "encoder did not return a string/unicode object "
6493 Py_TYPE(v
)->tp_name
);
6503 PyDoc_STRVAR(decode__doc__
,
6504 "S.decode([encoding[,errors]]) -> string or unicode\n\
6506 Decodes S using the codec registered for encoding. encoding defaults\n\
6507 to the default encoding. errors may be given to set a different error\n\
6508 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6509 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6510 as well as any other name registerd with codecs.register_error that is\n\
6511 able to handle UnicodeDecodeErrors.");
6514 unicode_decode(PyUnicodeObject
*self
, PyObject
*args
)
6516 char *encoding
= NULL
;
6517 char *errors
= NULL
;
6520 if (!PyArg_ParseTuple(args
, "|ss:decode", &encoding
, &errors
))
6522 v
= PyUnicode_AsDecodedObject((PyObject
*)self
, encoding
, errors
);
6525 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6526 PyErr_Format(PyExc_TypeError
,
6527 "decoder did not return a string/unicode object "
6529 Py_TYPE(v
)->tp_name
);
6539 PyDoc_STRVAR(expandtabs__doc__
,
6540 "S.expandtabs([tabsize]) -> unicode\n\
6542 Return a copy of S where all tab characters are expanded using spaces.\n\
6543 If tabsize is not given, a tab size of 8 characters is assumed.");
6546 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
6552 Py_ssize_t i
, j
, incr
;
6556 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
6559 /* First pass: determine size of output string */
6560 i
= 0; /* chars up to and including most recent \n or \r */
6561 j
= 0; /* chars since most recent \n or \r (use in tab calculations) */
6562 e
= self
->str
+ self
->length
; /* end of input */
6563 for (p
= self
->str
; p
< e
; p
++)
6566 incr
= tabsize
- (j
% tabsize
); /* cannot overflow */
6567 if (j
> PY_SSIZE_T_MAX
- incr
)
6573 if (j
> PY_SSIZE_T_MAX
- 1)
6576 if (*p
== '\n' || *p
== '\r') {
6577 if (i
> PY_SSIZE_T_MAX
- j
)
6584 if (i
> PY_SSIZE_T_MAX
- j
)
6587 /* Second pass: create output string and fill it */
6588 u
= _PyUnicode_New(i
+ j
);
6592 j
= 0; /* same as in first pass */
6593 q
= u
->str
; /* next output char */
6594 qe
= u
->str
+ u
->length
; /* end of output */
6596 for (p
= self
->str
; p
< e
; p
++)
6599 i
= tabsize
- (j
% tabsize
);
6613 if (*p
== '\n' || *p
== '\r')
6617 return (PyObject
*) u
;
6622 PyErr_SetString(PyExc_OverflowError
, "new string is too long");
6626 PyDoc_STRVAR(find__doc__
,
6627 "S.find(sub [,start [,end]]) -> int\n\
6629 Return the lowest index in S where substring sub is found,\n\
6630 such that sub is contained within s[start:end]. Optional\n\
6631 arguments start and end are interpreted as in slice notation.\n\
6633 Return -1 on failure.");
6636 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
6638 PyObject
*substring
;
6643 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
6646 result
= stringlib_find_slice(
6647 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6648 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6652 Py_DECREF(substring
);
6654 return PyInt_FromSsize_t(result
);
6658 unicode_getitem(PyUnicodeObject
*self
, Py_ssize_t index
)
6660 if (index
< 0 || index
>= self
->length
) {
6661 PyErr_SetString(PyExc_IndexError
, "string index out of range");
6665 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
6669 unicode_hash(PyUnicodeObject
*self
)
6671 /* Since Unicode objects compare equal to their ASCII string
6672 counterparts, they should use the individual character values
6673 as basis for their hash value. This is needed to assure that
6674 strings and Unicode objects behave in the same way as
6677 register Py_ssize_t len
;
6678 register Py_UNICODE
*p
;
6681 if (self
->hash
!= -1)
6683 len
= PyUnicode_GET_SIZE(self
);
6684 p
= PyUnicode_AS_UNICODE(self
);
6687 x
= (1000003*x
) ^ *p
++;
6688 x
^= PyUnicode_GET_SIZE(self
);
6695 PyDoc_STRVAR(index__doc__
,
6696 "S.index(sub [,start [,end]]) -> int\n\
6698 Like S.find() but raise ValueError when the substring is not found.");
6701 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
6704 PyObject
*substring
;
6708 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
6711 result
= stringlib_find_slice(
6712 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6713 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6717 Py_DECREF(substring
);
6720 PyErr_SetString(PyExc_ValueError
, "substring not found");
6724 return PyInt_FromSsize_t(result
);
6727 PyDoc_STRVAR(islower__doc__
,
6728 "S.islower() -> bool\n\
6730 Return True if all cased characters in S are lowercase and there is\n\
6731 at least one cased character in S, False otherwise.");
6734 unicode_islower(PyUnicodeObject
*self
)
6736 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6737 register const Py_UNICODE
*e
;
6740 /* Shortcut for single character strings */
6741 if (PyUnicode_GET_SIZE(self
) == 1)
6742 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p
));
6744 /* Special case for empty strings */
6745 if (PyUnicode_GET_SIZE(self
) == 0)
6746 return PyBool_FromLong(0);
6748 e
= p
+ PyUnicode_GET_SIZE(self
);
6750 for (; p
< e
; p
++) {
6751 register const Py_UNICODE ch
= *p
;
6753 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
6754 return PyBool_FromLong(0);
6755 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
6758 return PyBool_FromLong(cased
);
6761 PyDoc_STRVAR(isupper__doc__
,
6762 "S.isupper() -> bool\n\
6764 Return True if all cased characters in S are uppercase and there is\n\
6765 at least one cased character in S, False otherwise.");
6768 unicode_isupper(PyUnicodeObject
*self
)
6770 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6771 register const Py_UNICODE
*e
;
6774 /* Shortcut for single character strings */
6775 if (PyUnicode_GET_SIZE(self
) == 1)
6776 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
6778 /* Special case for empty strings */
6779 if (PyUnicode_GET_SIZE(self
) == 0)
6780 return PyBool_FromLong(0);
6782 e
= p
+ PyUnicode_GET_SIZE(self
);
6784 for (; p
< e
; p
++) {
6785 register const Py_UNICODE ch
= *p
;
6787 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
6788 return PyBool_FromLong(0);
6789 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
6792 return PyBool_FromLong(cased
);
6795 PyDoc_STRVAR(istitle__doc__
,
6796 "S.istitle() -> bool\n\
6798 Return True if S is a titlecased string and there is at least one\n\
6799 character in S, i.e. upper- and titlecase characters may only\n\
6800 follow uncased characters and lowercase characters only cased ones.\n\
6801 Return False otherwise.");
6804 unicode_istitle(PyUnicodeObject
*self
)
6806 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6807 register const Py_UNICODE
*e
;
6808 int cased
, previous_is_cased
;
6810 /* Shortcut for single character strings */
6811 if (PyUnicode_GET_SIZE(self
) == 1)
6812 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
6813 (Py_UNICODE_ISUPPER(*p
) != 0));
6815 /* Special case for empty strings */
6816 if (PyUnicode_GET_SIZE(self
) == 0)
6817 return PyBool_FromLong(0);
6819 e
= p
+ PyUnicode_GET_SIZE(self
);
6821 previous_is_cased
= 0;
6822 for (; p
< e
; p
++) {
6823 register const Py_UNICODE ch
= *p
;
6825 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
6826 if (previous_is_cased
)
6827 return PyBool_FromLong(0);
6828 previous_is_cased
= 1;
6831 else if (Py_UNICODE_ISLOWER(ch
)) {
6832 if (!previous_is_cased
)
6833 return PyBool_FromLong(0);
6834 previous_is_cased
= 1;
6838 previous_is_cased
= 0;
6840 return PyBool_FromLong(cased
);
6843 PyDoc_STRVAR(isspace__doc__
,
6844 "S.isspace() -> bool\n\
6846 Return True if all characters in S are whitespace\n\
6847 and there is at least one character in S, False otherwise.");
6850 unicode_isspace(PyUnicodeObject
*self
)
6852 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6853 register const Py_UNICODE
*e
;
6855 /* Shortcut for single character strings */
6856 if (PyUnicode_GET_SIZE(self
) == 1 &&
6857 Py_UNICODE_ISSPACE(*p
))
6858 return PyBool_FromLong(1);
6860 /* Special case for empty strings */
6861 if (PyUnicode_GET_SIZE(self
) == 0)
6862 return PyBool_FromLong(0);
6864 e
= p
+ PyUnicode_GET_SIZE(self
);
6865 for (; p
< e
; p
++) {
6866 if (!Py_UNICODE_ISSPACE(*p
))
6867 return PyBool_FromLong(0);
6869 return PyBool_FromLong(1);
6872 PyDoc_STRVAR(isalpha__doc__
,
6873 "S.isalpha() -> bool\n\
6875 Return True if all characters in S are alphabetic\n\
6876 and there is at least one character in S, False otherwise.");
6879 unicode_isalpha(PyUnicodeObject
*self
)
6881 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6882 register const Py_UNICODE
*e
;
6884 /* Shortcut for single character strings */
6885 if (PyUnicode_GET_SIZE(self
) == 1 &&
6886 Py_UNICODE_ISALPHA(*p
))
6887 return PyBool_FromLong(1);
6889 /* Special case for empty strings */
6890 if (PyUnicode_GET_SIZE(self
) == 0)
6891 return PyBool_FromLong(0);
6893 e
= p
+ PyUnicode_GET_SIZE(self
);
6894 for (; p
< e
; p
++) {
6895 if (!Py_UNICODE_ISALPHA(*p
))
6896 return PyBool_FromLong(0);
6898 return PyBool_FromLong(1);
6901 PyDoc_STRVAR(isalnum__doc__
,
6902 "S.isalnum() -> bool\n\
6904 Return True if all characters in S are alphanumeric\n\
6905 and there is at least one character in S, False otherwise.");
6908 unicode_isalnum(PyUnicodeObject
*self
)
6910 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6911 register const Py_UNICODE
*e
;
6913 /* Shortcut for single character strings */
6914 if (PyUnicode_GET_SIZE(self
) == 1 &&
6915 Py_UNICODE_ISALNUM(*p
))
6916 return PyBool_FromLong(1);
6918 /* Special case for empty strings */
6919 if (PyUnicode_GET_SIZE(self
) == 0)
6920 return PyBool_FromLong(0);
6922 e
= p
+ PyUnicode_GET_SIZE(self
);
6923 for (; p
< e
; p
++) {
6924 if (!Py_UNICODE_ISALNUM(*p
))
6925 return PyBool_FromLong(0);
6927 return PyBool_FromLong(1);
6930 PyDoc_STRVAR(isdecimal__doc__
,
6931 "S.isdecimal() -> bool\n\
6933 Return True if there are only decimal characters in S,\n\
6937 unicode_isdecimal(PyUnicodeObject
*self
)
6939 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6940 register const Py_UNICODE
*e
;
6942 /* Shortcut for single character strings */
6943 if (PyUnicode_GET_SIZE(self
) == 1 &&
6944 Py_UNICODE_ISDECIMAL(*p
))
6945 return PyBool_FromLong(1);
6947 /* Special case for empty strings */
6948 if (PyUnicode_GET_SIZE(self
) == 0)
6949 return PyBool_FromLong(0);
6951 e
= p
+ PyUnicode_GET_SIZE(self
);
6952 for (; p
< e
; p
++) {
6953 if (!Py_UNICODE_ISDECIMAL(*p
))
6954 return PyBool_FromLong(0);
6956 return PyBool_FromLong(1);
6959 PyDoc_STRVAR(isdigit__doc__
,
6960 "S.isdigit() -> bool\n\
6962 Return True if all characters in S are digits\n\
6963 and there is at least one character in S, False otherwise.");
6966 unicode_isdigit(PyUnicodeObject
*self
)
6968 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6969 register const Py_UNICODE
*e
;
6971 /* Shortcut for single character strings */
6972 if (PyUnicode_GET_SIZE(self
) == 1 &&
6973 Py_UNICODE_ISDIGIT(*p
))
6974 return PyBool_FromLong(1);
6976 /* Special case for empty strings */
6977 if (PyUnicode_GET_SIZE(self
) == 0)
6978 return PyBool_FromLong(0);
6980 e
= p
+ PyUnicode_GET_SIZE(self
);
6981 for (; p
< e
; p
++) {
6982 if (!Py_UNICODE_ISDIGIT(*p
))
6983 return PyBool_FromLong(0);
6985 return PyBool_FromLong(1);
6988 PyDoc_STRVAR(isnumeric__doc__
,
6989 "S.isnumeric() -> bool\n\
6991 Return True if there are only numeric characters in S,\n\
6995 unicode_isnumeric(PyUnicodeObject
*self
)
6997 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6998 register const Py_UNICODE
*e
;
7000 /* Shortcut for single character strings */
7001 if (PyUnicode_GET_SIZE(self
) == 1 &&
7002 Py_UNICODE_ISNUMERIC(*p
))
7003 return PyBool_FromLong(1);
7005 /* Special case for empty strings */
7006 if (PyUnicode_GET_SIZE(self
) == 0)
7007 return PyBool_FromLong(0);
7009 e
= p
+ PyUnicode_GET_SIZE(self
);
7010 for (; p
< e
; p
++) {
7011 if (!Py_UNICODE_ISNUMERIC(*p
))
7012 return PyBool_FromLong(0);
7014 return PyBool_FromLong(1);
7017 PyDoc_STRVAR(join__doc__
,
7018 "S.join(sequence) -> unicode\n\
7020 Return a string which is the concatenation of the strings in the\n\
7021 sequence. The separator between elements is S.");
7024 unicode_join(PyObject
*self
, PyObject
*data
)
7026 return PyUnicode_Join(self
, data
);
7030 unicode_length(PyUnicodeObject
*self
)
7032 return self
->length
;
7035 PyDoc_STRVAR(ljust__doc__
,
7036 "S.ljust(width[, fillchar]) -> int\n\
7038 Return S left justified in a Unicode string of length width. Padding is\n\
7039 done using the specified fill character (default is a space).");
7042 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
7045 Py_UNICODE fillchar
= ' ';
7047 if (!PyArg_ParseTuple(args
, "n|O&:ljust", &width
, convert_uc
, &fillchar
))
7050 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
7052 return (PyObject
*) self
;
7055 return (PyObject
*) pad(self
, 0, width
- self
->length
, fillchar
);
7058 PyDoc_STRVAR(lower__doc__
,
7059 "S.lower() -> unicode\n\
7061 Return a copy of the string S converted to lowercase.");
7064 unicode_lower(PyUnicodeObject
*self
)
7066 return fixup(self
, fixlower
);
7070 #define RIGHTSTRIP 1
7073 /* Arrays indexed by above */
7074 static const char *stripformat
[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
7076 #define STRIPNAME(i) (stripformat[i]+3)
7078 /* externally visible for str.strip(unicode) */
7080 _PyUnicode_XStrip(PyUnicodeObject
*self
, int striptype
, PyObject
*sepobj
)
7082 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
7083 Py_ssize_t len
= PyUnicode_GET_SIZE(self
);
7084 Py_UNICODE
*sep
= PyUnicode_AS_UNICODE(sepobj
);
7085 Py_ssize_t seplen
= PyUnicode_GET_SIZE(sepobj
);
7088 BLOOM_MASK sepmask
= make_bloom_mask(sep
, seplen
);
7091 if (striptype
!= RIGHTSTRIP
) {
7092 while (i
< len
&& BLOOM_MEMBER(sepmask
, s
[i
], sep
, seplen
)) {
7098 if (striptype
!= LEFTSTRIP
) {
7101 } while (j
>= i
&& BLOOM_MEMBER(sepmask
, s
[j
], sep
, seplen
));
7105 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
7107 return (PyObject
*)self
;
7110 return PyUnicode_FromUnicode(s
+i
, j
-i
);
7115 do_strip(PyUnicodeObject
*self
, int striptype
)
7117 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
7118 Py_ssize_t len
= PyUnicode_GET_SIZE(self
), i
, j
;
7121 if (striptype
!= RIGHTSTRIP
) {
7122 while (i
< len
&& Py_UNICODE_ISSPACE(s
[i
])) {
7128 if (striptype
!= LEFTSTRIP
) {
7131 } while (j
>= i
&& Py_UNICODE_ISSPACE(s
[j
]));
7135 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
7137 return (PyObject
*)self
;
7140 return PyUnicode_FromUnicode(s
+i
, j
-i
);
7145 do_argstrip(PyUnicodeObject
*self
, int striptype
, PyObject
*args
)
7147 PyObject
*sep
= NULL
;
7149 if (!PyArg_ParseTuple(args
, (char *)stripformat
[striptype
], &sep
))
7152 if (sep
!= NULL
&& sep
!= Py_None
) {
7153 if (PyUnicode_Check(sep
))
7154 return _PyUnicode_XStrip(self
, striptype
, sep
);
7155 else if (PyString_Check(sep
)) {
7157 sep
= PyUnicode_FromObject(sep
);
7160 res
= _PyUnicode_XStrip(self
, striptype
, sep
);
7165 PyErr_Format(PyExc_TypeError
,
7166 "%s arg must be None, unicode or str",
7167 STRIPNAME(striptype
));
7172 return do_strip(self
, striptype
);
7176 PyDoc_STRVAR(strip__doc__
,
7177 "S.strip([chars]) -> unicode\n\
7179 Return a copy of the string S with leading and trailing\n\
7180 whitespace removed.\n\
7181 If chars is given and not None, remove characters in chars instead.\n\
7182 If chars is a str, it will be converted to unicode before stripping");
7185 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
7187 if (PyTuple_GET_SIZE(args
) == 0)
7188 return do_strip(self
, BOTHSTRIP
); /* Common case */
7190 return do_argstrip(self
, BOTHSTRIP
, args
);
7194 PyDoc_STRVAR(lstrip__doc__
,
7195 "S.lstrip([chars]) -> unicode\n\
7197 Return a copy of the string S with leading whitespace removed.\n\
7198 If chars is given and not None, remove characters in chars instead.\n\
7199 If chars is a str, it will be converted to unicode before stripping");
7202 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
7204 if (PyTuple_GET_SIZE(args
) == 0)
7205 return do_strip(self
, LEFTSTRIP
); /* Common case */
7207 return do_argstrip(self
, LEFTSTRIP
, args
);
7211 PyDoc_STRVAR(rstrip__doc__
,
7212 "S.rstrip([chars]) -> unicode\n\
7214 Return a copy of the string S with trailing whitespace removed.\n\
7215 If chars is given and not None, remove characters in chars instead.\n\
7216 If chars is a str, it will be converted to unicode before stripping");
7219 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
7221 if (PyTuple_GET_SIZE(args
) == 0)
7222 return do_strip(self
, RIGHTSTRIP
); /* Common case */
7224 return do_argstrip(self
, RIGHTSTRIP
, args
);
7229 unicode_repeat(PyUnicodeObject
*str
, Py_ssize_t len
)
7239 if (len
== 1 && PyUnicode_CheckExact(str
)) {
7240 /* no repeat, return original string */
7242 return (PyObject
*) str
;
7245 /* ensure # of chars needed doesn't overflow int and # of bytes
7246 * needed doesn't overflow size_t
7248 nchars
= len
* str
->length
;
7249 if (len
&& nchars
/ len
!= str
->length
) {
7250 PyErr_SetString(PyExc_OverflowError
,
7251 "repeated string is too long");
7254 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
7255 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
7256 PyErr_SetString(PyExc_OverflowError
,
7257 "repeated string is too long");
7260 u
= _PyUnicode_New(nchars
);
7266 if (str
->length
== 1 && len
> 0) {
7267 Py_UNICODE_FILL(p
, str
->str
[0], len
);
7269 Py_ssize_t done
= 0; /* number of characters copied this far */
7270 if (done
< nchars
) {
7271 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
7274 while (done
< nchars
) {
7275 Py_ssize_t n
= (done
<= nchars
-done
) ? done
: nchars
-done
;
7276 Py_UNICODE_COPY(p
+done
, p
, n
);
7281 return (PyObject
*) u
;
7284 PyObject
*PyUnicode_Replace(PyObject
*obj
,
7287 Py_ssize_t maxcount
)
7294 self
= PyUnicode_FromObject(obj
);
7297 str1
= PyUnicode_FromObject(subobj
);
7302 str2
= PyUnicode_FromObject(replobj
);
7308 result
= replace((PyUnicodeObject
*)self
,
7309 (PyUnicodeObject
*)str1
,
7310 (PyUnicodeObject
*)str2
,
7318 PyDoc_STRVAR(replace__doc__
,
7319 "S.replace (old, new[, count]) -> unicode\n\
7321 Return a copy of S with all occurrences of substring\n\
7322 old replaced by new. If the optional argument count is\n\
7323 given, only the first count occurrences are replaced.");
7326 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
7328 PyUnicodeObject
*str1
;
7329 PyUnicodeObject
*str2
;
7330 Py_ssize_t maxcount
= -1;
7333 if (!PyArg_ParseTuple(args
, "OO|n:replace", &str1
, &str2
, &maxcount
))
7335 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
7338 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
7344 result
= replace(self
, str1
, str2
, maxcount
);
7352 PyObject
*unicode_repr(PyObject
*unicode
)
7354 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
7355 PyUnicode_GET_SIZE(unicode
),
7359 PyDoc_STRVAR(rfind__doc__
,
7360 "S.rfind(sub [,start [,end]]) -> int\n\
7362 Return the highest index in S where substring sub is found,\n\
7363 such that sub is contained within s[start:end]. Optional\n\
7364 arguments start and end are interpreted as in slice notation.\n\
7366 Return -1 on failure.");
7369 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
7371 PyObject
*substring
;
7376 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
7379 result
= stringlib_rfind_slice(
7380 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7381 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7385 Py_DECREF(substring
);
7387 return PyInt_FromSsize_t(result
);
7390 PyDoc_STRVAR(rindex__doc__
,
7391 "S.rindex(sub [,start [,end]]) -> int\n\
7393 Like S.rfind() but raise ValueError when the substring is not found.");
7396 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
7398 PyObject
*substring
;
7403 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
7406 result
= stringlib_rfind_slice(
7407 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7408 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7412 Py_DECREF(substring
);
7415 PyErr_SetString(PyExc_ValueError
, "substring not found");
7418 return PyInt_FromSsize_t(result
);
7421 PyDoc_STRVAR(rjust__doc__
,
7422 "S.rjust(width[, fillchar]) -> unicode\n\
7424 Return S right justified in a Unicode string of length width. Padding is\n\
7425 done using the specified fill character (default is a space).");
7428 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
7431 Py_UNICODE fillchar
= ' ';
7433 if (!PyArg_ParseTuple(args
, "n|O&:rjust", &width
, convert_uc
, &fillchar
))
7436 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
7438 return (PyObject
*) self
;
7441 return (PyObject
*) pad(self
, width
- self
->length
, 0, fillchar
);
7445 unicode_slice(PyUnicodeObject
*self
, Py_ssize_t start
, Py_ssize_t end
)
7447 /* standard clamping */
7452 if (end
> self
->length
)
7454 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
7455 /* full slice, return original string */
7457 return (PyObject
*) self
;
7462 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
7466 PyObject
*PyUnicode_Split(PyObject
*s
,
7468 Py_ssize_t maxsplit
)
7472 s
= PyUnicode_FromObject(s
);
7476 sep
= PyUnicode_FromObject(sep
);
7483 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
7490 PyDoc_STRVAR(split__doc__
,
7491 "S.split([sep [,maxsplit]]) -> list of strings\n\
7493 Return a list of the words in S, using sep as the\n\
7494 delimiter string. If maxsplit is given, at most maxsplit\n\
7495 splits are done. If sep is not specified or is None, any\n\
7496 whitespace string is a separator and empty strings are\n\
7497 removed from the result.");
7500 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
7502 PyObject
*substring
= Py_None
;
7503 Py_ssize_t maxcount
= -1;
7505 if (!PyArg_ParseTuple(args
, "|On:split", &substring
, &maxcount
))
7508 if (substring
== Py_None
)
7509 return split(self
, NULL
, maxcount
);
7510 else if (PyUnicode_Check(substring
))
7511 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
7513 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
7517 PyUnicode_Partition(PyObject
*str_in
, PyObject
*sep_in
)
7523 str_obj
= PyUnicode_FromObject(str_in
);
7526 sep_obj
= PyUnicode_FromObject(sep_in
);
7532 out
= stringlib_partition(
7533 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7534 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
7545 PyUnicode_RPartition(PyObject
*str_in
, PyObject
*sep_in
)
7551 str_obj
= PyUnicode_FromObject(str_in
);
7554 sep_obj
= PyUnicode_FromObject(sep_in
);
7560 out
= stringlib_rpartition(
7561 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7562 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
7571 PyDoc_STRVAR(partition__doc__
,
7572 "S.partition(sep) -> (head, sep, tail)\n\
7574 Searches for the separator sep in S, and returns the part before it,\n\
7575 the separator itself, and the part after it. If the separator is not\n\
7576 found, returns S and two empty strings.");
7579 unicode_partition(PyUnicodeObject
*self
, PyObject
*separator
)
7581 return PyUnicode_Partition((PyObject
*)self
, separator
);
7584 PyDoc_STRVAR(rpartition__doc__
,
7585 "S.rpartition(sep) -> (tail, sep, head)\n\
7587 Searches for the separator sep in S, starting at the end of S, and returns\n\
7588 the part before it, the separator itself, and the part after it. If the\n\
7589 separator is not found, returns two empty strings and S.");
7592 unicode_rpartition(PyUnicodeObject
*self
, PyObject
*separator
)
7594 return PyUnicode_RPartition((PyObject
*)self
, separator
);
7597 PyObject
*PyUnicode_RSplit(PyObject
*s
,
7599 Py_ssize_t maxsplit
)
7603 s
= PyUnicode_FromObject(s
);
7607 sep
= PyUnicode_FromObject(sep
);
7614 result
= rsplit((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
7621 PyDoc_STRVAR(rsplit__doc__
,
7622 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7624 Return a list of the words in S, using sep as the\n\
7625 delimiter string, starting at the end of the string and\n\
7626 working to the front. If maxsplit is given, at most maxsplit\n\
7627 splits are done. If sep is not specified, any whitespace string\n\
7631 unicode_rsplit(PyUnicodeObject
*self
, PyObject
*args
)
7633 PyObject
*substring
= Py_None
;
7634 Py_ssize_t maxcount
= -1;
7636 if (!PyArg_ParseTuple(args
, "|On:rsplit", &substring
, &maxcount
))
7639 if (substring
== Py_None
)
7640 return rsplit(self
, NULL
, maxcount
);
7641 else if (PyUnicode_Check(substring
))
7642 return rsplit(self
, (PyUnicodeObject
*)substring
, maxcount
);
7644 return PyUnicode_RSplit((PyObject
*)self
, substring
, maxcount
);
7647 PyDoc_STRVAR(splitlines__doc__
,
7648 "S.splitlines([keepends]]) -> list of strings\n\
7650 Return a list of the lines in S, breaking at line boundaries.\n\
7651 Line breaks are not included in the resulting list unless keepends\n\
7652 is given and true.");
7655 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
7659 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
7662 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
7666 PyObject
*unicode_str(PyUnicodeObject
*self
)
7668 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
7671 PyDoc_STRVAR(swapcase__doc__
,
7672 "S.swapcase() -> unicode\n\
7674 Return a copy of S with uppercase characters converted to lowercase\n\
7678 unicode_swapcase(PyUnicodeObject
*self
)
7680 return fixup(self
, fixswapcase
);
7683 PyDoc_STRVAR(translate__doc__
,
7684 "S.translate(table) -> unicode\n\
7686 Return a copy of the string S, where all characters have been mapped\n\
7687 through the given translation table, which must be a mapping of\n\
7688 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7689 Unmapped characters are left untouched. Characters mapped to None\n\
7693 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
7695 return PyUnicode_TranslateCharmap(self
->str
,
7701 PyDoc_STRVAR(upper__doc__
,
7702 "S.upper() -> unicode\n\
7704 Return a copy of S converted to uppercase.");
7707 unicode_upper(PyUnicodeObject
*self
)
7709 return fixup(self
, fixupper
);
7712 PyDoc_STRVAR(zfill__doc__
,
7713 "S.zfill(width) -> unicode\n\
7715 Pad a numeric string x with zeros on the left, to fill a field\n\
7716 of the specified width. The string x is never truncated.");
7719 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
7725 if (!PyArg_ParseTuple(args
, "n:zfill", &width
))
7728 if (self
->length
>= width
) {
7729 if (PyUnicode_CheckExact(self
)) {
7731 return (PyObject
*) self
;
7734 return PyUnicode_FromUnicode(
7735 PyUnicode_AS_UNICODE(self
),
7736 PyUnicode_GET_SIZE(self
)
7740 fill
= width
- self
->length
;
7742 u
= pad(self
, fill
, 0, '0');
7747 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
7748 /* move sign to beginning of string */
7749 u
->str
[0] = u
->str
[fill
];
7753 return (PyObject
*) u
;
7758 free_listsize(PyUnicodeObject
*self
)
7760 return PyInt_FromLong(numfree
);
7764 PyDoc_STRVAR(startswith__doc__
,
7765 "S.startswith(prefix[, start[, end]]) -> bool\n\
7767 Return True if S starts with the specified prefix, False otherwise.\n\
7768 With optional start, test S beginning at that position.\n\
7769 With optional end, stop comparing S at that position.\n\
7770 prefix can also be a tuple of strings to try.");
7773 unicode_startswith(PyUnicodeObject
*self
,
7777 PyUnicodeObject
*substring
;
7778 Py_ssize_t start
= 0;
7779 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7782 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &subobj
,
7783 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
7785 if (PyTuple_Check(subobj
)) {
7787 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7788 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7789 PyTuple_GET_ITEM(subobj
, i
));
7790 if (substring
== NULL
)
7792 result
= tailmatch(self
, substring
, start
, end
, -1);
7793 Py_DECREF(substring
);
7798 /* nothing matched */
7801 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7802 if (substring
== NULL
)
7804 result
= tailmatch(self
, substring
, start
, end
, -1);
7805 Py_DECREF(substring
);
7806 return PyBool_FromLong(result
);
7810 PyDoc_STRVAR(endswith__doc__
,
7811 "S.endswith(suffix[, start[, end]]) -> bool\n\
7813 Return True if S ends with the specified suffix, False otherwise.\n\
7814 With optional start, test S beginning at that position.\n\
7815 With optional end, stop comparing S at that position.\n\
7816 suffix can also be a tuple of strings to try.");
7819 unicode_endswith(PyUnicodeObject
*self
,
7823 PyUnicodeObject
*substring
;
7824 Py_ssize_t start
= 0;
7825 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7828 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &subobj
,
7829 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
7831 if (PyTuple_Check(subobj
)) {
7833 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7834 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7835 PyTuple_GET_ITEM(subobj
, i
));
7836 if (substring
== NULL
)
7838 result
= tailmatch(self
, substring
, start
, end
, +1);
7839 Py_DECREF(substring
);
7846 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7847 if (substring
== NULL
)
7850 result
= tailmatch(self
, substring
, start
, end
, +1);
7851 Py_DECREF(substring
);
7852 return PyBool_FromLong(result
);
7856 /* Implements do_string_format, which is unicode because of stringlib */
7857 #include "stringlib/string_format.h"
7859 PyDoc_STRVAR(format__doc__
,
7860 "S.format(*args, **kwargs) -> unicode\n\
7865 unicode__format__(PyObject
*self
, PyObject
*args
)
7867 PyObject
*format_spec
;
7868 PyObject
*result
= NULL
;
7869 PyObject
*tmp
= NULL
;
7871 /* If 2.x, convert format_spec to the same type as value */
7872 /* This is to allow things like u''.format('') */
7873 if (!PyArg_ParseTuple(args
, "O:__format__", &format_spec
))
7875 if (!(PyBytes_Check(format_spec
) || PyUnicode_Check(format_spec
))) {
7876 PyErr_Format(PyExc_TypeError
, "__format__ arg must be str "
7877 "or unicode, not %s", Py_TYPE(format_spec
)->tp_name
);
7880 tmp
= PyObject_Unicode(format_spec
);
7885 result
= _PyUnicode_FormatAdvanced(self
,
7886 PyUnicode_AS_UNICODE(format_spec
),
7887 PyUnicode_GET_SIZE(format_spec
));
7893 PyDoc_STRVAR(p_format__doc__
,
7894 "S.__format__(format_spec) -> unicode\n\
7899 unicode__sizeof__(PyUnicodeObject
*v
)
7901 PyObject
*res
= NULL
, *defsize
= NULL
;
7903 res
= PyInt_FromSsize_t(sizeof(PyUnicodeObject
) +
7904 sizeof(Py_UNICODE
) * (v
->length
+ 1));
7906 defsize
= PyObject_CallMethod(v
->defenc
, "__sizeof__", NULL
);
7907 if (defsize
== NULL
) {
7911 res
= PyNumber_Add(res
, defsize
);
7917 PyDoc_STRVAR(sizeof__doc__
,
7918 "S.__sizeof__() -> size of S in memory, in bytes\n\
7923 unicode_getnewargs(PyUnicodeObject
*v
)
7925 return Py_BuildValue("(u#)", v
->str
, v
->length
);
7929 static PyMethodDef unicode_methods
[] = {
7931 /* Order is according to common usage: often used methods should
7932 appear first, since lookup is done sequentially. */
7934 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
, encode__doc__
},
7935 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
7936 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
7937 {"rsplit", (PyCFunction
) unicode_rsplit
, METH_VARARGS
, rsplit__doc__
},
7938 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
7939 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
7940 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
7941 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
7942 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
7943 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
7944 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
7945 {"partition", (PyCFunction
) unicode_partition
, METH_O
, partition__doc__
},
7946 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
7947 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
7948 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
7949 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_VARARGS
, lstrip__doc__
},
7950 {"decode", (PyCFunction
) unicode_decode
, METH_VARARGS
, decode__doc__
},
7951 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7952 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
7953 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
7954 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
7955 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_VARARGS
, rstrip__doc__
},
7956 {"rpartition", (PyCFunction
) unicode_rpartition
, METH_O
, rpartition__doc__
},
7957 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
7958 {"strip", (PyCFunction
) unicode_strip
, METH_VARARGS
, strip__doc__
},
7959 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
7960 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
7961 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
7962 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
7963 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
7964 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
7965 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
7966 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
7967 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
7968 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
7969 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
7970 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
7971 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
7972 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
7973 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
7974 {"format", (PyCFunction
) do_string_format
, METH_VARARGS
| METH_KEYWORDS
, format__doc__
},
7975 {"__format__", (PyCFunction
) unicode__format__
, METH_VARARGS
, p_format__doc__
},
7976 {"_formatter_field_name_split", (PyCFunction
) formatter_field_name_split
, METH_NOARGS
},
7977 {"_formatter_parser", (PyCFunction
) formatter_parser
, METH_NOARGS
},
7978 {"__sizeof__", (PyCFunction
) unicode__sizeof__
, METH_NOARGS
, sizeof__doc__
},
7980 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
7984 /* This one is just used for debugging the implementation. */
7985 {"freelistsize", (PyCFunction
) free_listsize
, METH_NOARGS
},
7988 {"__getnewargs__", (PyCFunction
)unicode_getnewargs
, METH_NOARGS
},
7993 unicode_mod(PyObject
*v
, PyObject
*w
)
7995 if (!PyUnicode_Check(v
)) {
7996 Py_INCREF(Py_NotImplemented
);
7997 return Py_NotImplemented
;
7999 return PyUnicode_Format(v
, w
);
8002 static PyNumberMethods unicode_as_number
= {
8007 unicode_mod
, /*nb_remainder*/
8010 static PySequenceMethods unicode_as_sequence
= {
8011 (lenfunc
) unicode_length
, /* sq_length */
8012 PyUnicode_Concat
, /* sq_concat */
8013 (ssizeargfunc
) unicode_repeat
, /* sq_repeat */
8014 (ssizeargfunc
) unicode_getitem
, /* sq_item */
8015 (ssizessizeargfunc
) unicode_slice
, /* sq_slice */
8016 0, /* sq_ass_item */
8017 0, /* sq_ass_slice */
8018 PyUnicode_Contains
, /* sq_contains */
8022 unicode_subscript(PyUnicodeObject
* self
, PyObject
* item
)
8024 if (PyIndex_Check(item
)) {
8025 Py_ssize_t i
= PyNumber_AsSsize_t(item
, PyExc_IndexError
);
8026 if (i
== -1 && PyErr_Occurred())
8029 i
+= PyUnicode_GET_SIZE(self
);
8030 return unicode_getitem(self
, i
);
8031 } else if (PySlice_Check(item
)) {
8032 Py_ssize_t start
, stop
, step
, slicelength
, cur
, i
;
8033 Py_UNICODE
* source_buf
;
8034 Py_UNICODE
* result_buf
;
8037 if (PySlice_GetIndicesEx((PySliceObject
*)item
, PyUnicode_GET_SIZE(self
),
8038 &start
, &stop
, &step
, &slicelength
) < 0) {
8042 if (slicelength
<= 0) {
8043 return PyUnicode_FromUnicode(NULL
, 0);
8044 } else if (start
== 0 && step
== 1 && slicelength
== self
->length
&&
8045 PyUnicode_CheckExact(self
)) {
8047 return (PyObject
*)self
;
8048 } else if (step
== 1) {
8049 return PyUnicode_FromUnicode(self
->str
+ start
, slicelength
);
8051 source_buf
= PyUnicode_AS_UNICODE((PyObject
*)self
);
8052 result_buf
= (Py_UNICODE
*)PyObject_MALLOC(slicelength
*
8053 sizeof(Py_UNICODE
));
8055 if (result_buf
== NULL
)
8056 return PyErr_NoMemory();
8058 for (cur
= start
, i
= 0; i
< slicelength
; cur
+= step
, i
++) {
8059 result_buf
[i
] = source_buf
[cur
];
8062 result
= PyUnicode_FromUnicode(result_buf
, slicelength
);
8063 PyObject_FREE(result_buf
);
8067 PyErr_SetString(PyExc_TypeError
, "string indices must be integers");
8072 static PyMappingMethods unicode_as_mapping
= {
8073 (lenfunc
)unicode_length
, /* mp_length */
8074 (binaryfunc
)unicode_subscript
, /* mp_subscript */
8075 (objobjargproc
)0, /* mp_ass_subscript */
8079 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
8084 PyErr_SetString(PyExc_SystemError
,
8085 "accessing non-existent unicode segment");
8088 *ptr
= (void *) self
->str
;
8089 return PyUnicode_GET_DATA_SIZE(self
);
8093 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, Py_ssize_t index
,
8096 PyErr_SetString(PyExc_TypeError
,
8097 "cannot use unicode as modifiable buffer");
8102 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
8106 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
8111 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
8118 PyErr_SetString(PyExc_SystemError
,
8119 "accessing non-existent unicode segment");
8122 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
8125 *ptr
= (void *) PyString_AS_STRING(str
);
8126 return PyString_GET_SIZE(str
);
8129 /* Helpers for PyUnicode_Format() */
8132 getnextarg(PyObject
*args
, Py_ssize_t arglen
, Py_ssize_t
*p_argidx
)
8134 Py_ssize_t argidx
= *p_argidx
;
8135 if (argidx
< arglen
) {
8140 return PyTuple_GetItem(args
, argidx
);
8142 PyErr_SetString(PyExc_TypeError
,
8143 "not enough arguments for format string");
/* Flag bits OR-ed into `flags` while PyUnicode_Format() scans a conversion
   specifier: '-' sets F_LJUST, '+' F_SIGN, ' ' F_BLANK, '#' F_ALT and
   '0' F_ZERO. */
8147 #define F_LJUST (1<<0)
8148 #define F_SIGN (1<<1)
8149 #define F_BLANK (1<<2)
8150 #define F_ALT (1<<3)
8151 #define F_ZERO (1<<4)
8154 strtounicode(Py_UNICODE
*buffer
, const char *charbuffer
)
8156 register Py_ssize_t i
;
8157 Py_ssize_t len
= strlen(charbuffer
);
8158 for (i
= len
- 1; i
>= 0; i
--)
8159 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
8165 doubletounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, double x
)
8169 PyOS_ascii_formatd((char *)buffer
, len
, format
, x
);
8170 result
= strtounicode(buffer
, (char *)buffer
);
8171 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
8175 longtounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, long x
)
8179 PyOS_snprintf((char *)buffer
, len
, format
, x
);
8180 result
= strtounicode(buffer
, (char *)buffer
);
8181 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
8184 /* XXX To save some code duplication, formatfloat/long/int could have been
8185 shared with stringobject.c, converting from 8-bit to Unicode after the
8186 formatting is done. */
8189 formatfloat(Py_UNICODE
*buf
,
8196 /* fmt = '%#.' + `prec` + `type`
8197 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8201 x
= PyFloat_AsDouble(v
);
8202 if (x
== -1.0 && PyErr_Occurred())
8206 if (type
== 'f' && (fabs(x
) / 1e25
) >= 1e25
)
8208 /* Worst case length calc to ensure no buffer overrun:
8212 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8213 for any double rep.)
8214 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8217 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8218 len = 1 + 50 + 1 + prec = 52 + prec
8220 If prec=0 the effective precision is 1 (the leading digit is
8221 always given), therefore increase the length by one.
8224 if (((type
== 'g' || type
== 'G') &&
8225 buflen
<= (size_t)10 + (size_t)prec
) ||
8226 (type
== 'f' && buflen
<= (size_t)53 + (size_t)prec
)) {
8227 PyErr_SetString(PyExc_OverflowError
,
8228 "formatted float is too long (precision too large?)");
8231 PyOS_snprintf(fmt
, sizeof(fmt
), "%%%s.%d%c",
8232 (flags
&F_ALT
) ? "#" : "",
8234 return doubletounicode(buf
, buflen
, fmt
, x
);
8238 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
8242 PyObject
*str
; /* temporary string object. */
8243 PyUnicodeObject
*result
;
8245 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
8248 result
= _PyUnicode_New(len
);
8253 for (i
= 0; i
< len
; i
++)
8254 result
->str
[i
] = buf
[i
];
8255 result
->str
[len
] = 0;
8257 return (PyObject
*)result
;
8261 formatint(Py_UNICODE
*buf
,
8268 /* fmt = '%#.' + `prec` + 'l' + `type`
8269 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8273 char fmt
[64]; /* plenty big enough! */
8277 x
= PyInt_AsLong(v
);
8278 if (x
== -1 && PyErr_Occurred())
8280 if (x
< 0 && type
== 'u') {
8283 if (x
< 0 && (type
== 'x' || type
== 'X' || type
== 'o'))
8290 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8291 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8293 if (buflen
<= 14 || buflen
<= (size_t)3 + (size_t)prec
) {
8294 PyErr_SetString(PyExc_OverflowError
,
8295 "formatted integer is too long (precision too large?)");
8299 if ((flags
& F_ALT
) &&
8300 (type
== 'x' || type
== 'X')) {
8301 /* When converting under %#x or %#X, there are a number
8302 * of issues that cause pain:
8303 * - when 0 is being converted, the C standard leaves off
8304 * the '0x' or '0X', which is inconsistent with other
8305 * %#x/%#X conversions and inconsistent with Python's
8307 * - there are platforms that violate the standard and
8308 * convert 0 with the '0x' or '0X'
8309 * (Metrowerks, Compaq Tru64)
8310 * - there are platforms that give '0x' when converting
8311 * under %#X, but convert 0 in accordance with the
8312 * standard (OS/2 EMX)
8314 * We can achieve the desired consistency by inserting our
8315 * own '0x' or '0X' prefix, and substituting %x/%X in place
8318 * Note that this is the same approach as used in
8319 * formatint() in stringobject.c
8321 PyOS_snprintf(fmt
, sizeof(fmt
), "%s0%c%%.%dl%c",
8322 sign
, type
, prec
, type
);
8325 PyOS_snprintf(fmt
, sizeof(fmt
), "%s%%%s.%dl%c",
8326 sign
, (flags
&F_ALT
) ? "#" : "",
8330 return longtounicode(buf
, buflen
, fmt
, -x
);
8332 return longtounicode(buf
, buflen
, fmt
, x
);
8336 formatchar(Py_UNICODE
*buf
,
8340 /* presume that the buffer is at least 2 characters long */
8341 if (PyUnicode_Check(v
)) {
8342 if (PyUnicode_GET_SIZE(v
) != 1)
8344 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
8347 else if (PyString_Check(v
)) {
8348 if (PyString_GET_SIZE(v
) != 1)
8350 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
8354 /* Integer input truncated to a character */
8356 x
= PyInt_AsLong(v
);
8357 if (x
== -1 && PyErr_Occurred())
8359 #ifdef Py_UNICODE_WIDE
8360 if (x
< 0 || x
> 0x10ffff) {
8361 PyErr_SetString(PyExc_OverflowError
,
8362 "%c arg not in range(0x110000) "
8363 "(wide Python build)");
8367 if (x
< 0 || x
> 0xffff) {
8368 PyErr_SetString(PyExc_OverflowError
,
8369 "%c arg not in range(0x10000) "
8370 "(narrow Python build)");
8374 buf
[0] = (Py_UNICODE
) x
;
8380 PyErr_SetString(PyExc_TypeError
,
8381 "%c requires int or char");
8385 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8387 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8388 chars are formatted. XXX This is a magic number. Each formatting
8389 routine does bounds checking to ensure no overflow, but a better
8390 solution may be to malloc a buffer of appropriate size for each
8391 format. For now, the current solution is sufficient.
8393 #define FORMATBUFLEN (size_t)120
/* PyUnicode_Format(format, args): printf-style formatting, i.e. the work
   behind `format % args` (unicode_mod() forwards here).

   format is first converted with PyUnicode_FromObject().  args is either
   a tuple of values or, when its type supports the mapping protocol and
   it is neither a tuple nor a basestring, a mapping consulted for
   "%(key)..." specifiers.

   The result is built in a unicode object that is grown as needed with
   _PyUnicode_Resize(); `reslen` is its current capacity and `rescnt` the
   part still unused.  On success the object, trimmed to the characters
   actually written, is returned.

   NOTE(review): several lines of this function, including the error
   exits, are not present in this listing; the comments below describe
   only what the visible lines establish. */
8395 PyObject
*PyUnicode_Format(PyObject
*format
,
8398 Py_UNICODE
*fmt
, *res
;
8399 Py_ssize_t fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
8401 PyUnicodeObject
*result
= NULL
;
8402 PyObject
*dict
= NULL
;
8405 if (format
== NULL
|| args
== NULL
) {
8406 PyErr_BadInternalCall();
8409 uformat
= PyUnicode_FromObject(format
);
8410 if (uformat
== NULL
)
8412 fmt
= PyUnicode_AS_UNICODE(uformat
);
8413 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
/* Initial capacity: the length of the format text plus 100 characters. */
8415 reslen
= rescnt
= fmtcnt
+ 100;
8416 result
= _PyUnicode_New(reslen
);
8419 res
= PyUnicode_AS_UNICODE(result
);
8421 if (PyTuple_Check(args
)) {
8422 arglen
= PyTuple_Size(args
);
8429 if (Py_TYPE(args
)->tp_as_mapping
&& !PyTuple_Check(args
) &&
8430 !PyObject_TypeCheck(args
, &PyBaseString_Type
))
8433 while (--fmtcnt
>= 0) {
8436 rescnt
= fmtcnt
+ 100;
8438 if (_PyUnicode_Resize(&result
, reslen
) < 0)
8440 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
8446 /* Got a format specifier */
8448 Py_ssize_t width
= -1;
8450 Py_UNICODE c
= '\0';
8454 PyObject
*temp
= NULL
;
8458 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
8462 Py_UNICODE
*keystart
;
8468 PyErr_SetString(PyExc_TypeError
,
8469 "format requires a mapping");
8475 /* Skip over balanced parentheses */
8476 while (pcount
> 0 && --fmtcnt
>= 0) {
8479 else if (*fmt
== '(')
8483 keylen
= fmt
- keystart
- 1;
8484 if (fmtcnt
< 0 || pcount
> 0) {
8485 PyErr_SetString(PyExc_ValueError
,
8486 "incomplete format key");
8490 /* keys are converted to strings using UTF-8 and
8491 then looked up since Python uses strings to hold
8492 variables names etc. in its namespaces and we
8493 wouldn't want to break common idioms. */
8494 key
= PyUnicode_EncodeUTF8(keystart
,
8498 key
= PyUnicode_FromUnicode(keystart
, keylen
);
8506 args
= PyObject_GetItem(dict
, key
);
/* Collect the conversion flags into the F_* bits of `flags`. */
8515 while (--fmtcnt
>= 0) {
8516 switch (c
= *fmt
++) {
8517 case '-': flags
|= F_LJUST
; continue;
8518 case '+': flags
|= F_SIGN
; continue;
8519 case ' ': flags
|= F_BLANK
; continue;
8520 case '#': flags
|= F_ALT
; continue;
8521 case '0': flags
|= F_ZERO
; continue;
8526 v
= getnextarg(args
, arglen
, &argidx
);
8529 if (!PyInt_Check(v
)) {
8530 PyErr_SetString(PyExc_TypeError
,
8534 width
= PyInt_AsLong(v
);
8542 else if (c
>= '0' && c
<= '9') {
8544 while (--fmtcnt
>= 0) {
8546 if (c
< '0' || c
> '9')
8548 if ((width
*10) / 10 != width
) {
8549 PyErr_SetString(PyExc_ValueError
,
8553 width
= width
*10 + (c
- '0');
8561 v
= getnextarg(args
, arglen
, &argidx
);
8564 if (!PyInt_Check(v
)) {
8565 PyErr_SetString(PyExc_TypeError
,
8569 prec
= PyInt_AsLong(v
);
8575 else if (c
>= '0' && c
<= '9') {
8577 while (--fmtcnt
>= 0) {
8578 c
= Py_CHARMASK(*fmt
++);
8579 if (c
< '0' || c
> '9')
8581 if ((prec
*10) / 10 != prec
) {
8582 PyErr_SetString(PyExc_ValueError
,
8586 prec
= prec
*10 + (c
- '0');
8591 if (c
== 'h' || c
== 'l' || c
== 'L') {
8597 PyErr_SetString(PyExc_ValueError
,
8598 "incomplete format");
8602 v
= getnextarg(args
, arglen
, &argidx
);
8612 /* presume that buffer length is at least 1 */
8619 if (PyUnicode_Check(v
) && c
== 's') {
8626 temp
= PyObject_Unicode(v
);
8628 temp
= PyObject_Repr(v
);
8631 if (PyUnicode_Check(temp
))
8632 /* nothing to do */;
8633 else if (PyString_Check(temp
)) {
8634 /* convert to string to Unicode */
8635 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
8636 PyString_GET_SIZE(temp
),
8646 PyErr_SetString(PyExc_TypeError
,
8647 "%s argument has non-string str()");
8651 pbuf
= PyUnicode_AS_UNICODE(temp
);
8652 len
= PyUnicode_GET_SIZE(temp
);
8653 if (prec
>= 0 && len
> prec
)
8666 if (PyNumber_Check(v
)) {
8667 PyObject
*iobj
=NULL
;
8669 if (PyInt_Check(v
) || (PyLong_Check(v
))) {
8674 iobj
= PyNumber_Int(v
);
8675 if (iobj
==NULL
) iobj
= PyNumber_Long(v
);
8678 if (PyInt_Check(iobj
)) {
8681 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
8682 flags
, prec
, c
, iobj
);
8688 else if (PyLong_Check(iobj
)) {
8690 temp
= formatlong(iobj
, flags
, prec
, c
);
8694 pbuf
= PyUnicode_AS_UNICODE(temp
);
8695 len
= PyUnicode_GET_SIZE(temp
);
8704 PyErr_Format(PyExc_TypeError
,
8705 "%%%c format: a number is required, "
8706 "not %.200s", (char)c
, Py_TYPE(v
)->tp_name
);
8722 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
8733 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
8739 PyErr_Format(PyExc_ValueError
,
8740 "unsupported format character '%c' (0x%x) "
8742 (31<=c
&& c
<=126) ? (char)c
: '?',
8744 (Py_ssize_t
)(fmt
- 1 -
8745 PyUnicode_AS_UNICODE(uformat
)));
8749 if (*pbuf
== '-' || *pbuf
== '+') {
8753 else if (flags
& F_SIGN
)
8755 else if (flags
& F_BLANK
)
8762 if (rescnt
- (sign
!= 0) < width
) {
8764 rescnt
= width
+ fmtcnt
+ 100;
8771 if (_PyUnicode_Resize(&result
, reslen
) < 0) {
8775 res
= PyUnicode_AS_UNICODE(result
)
8785 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8786 assert(pbuf
[0] == '0');
8787 assert(pbuf
[1] == c
);
8798 if (width
> len
&& !(flags
& F_LJUST
)) {
8802 } while (--width
> len
);
8807 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8808 assert(pbuf
[0] == '0');
8809 assert(pbuf
[1] == c
);
8814 Py_UNICODE_COPY(res
, pbuf
, len
);
8817 while (--width
>= len
) {
8821 if (dict
&& (argidx
< arglen
) && c
!= '%') {
8822 PyErr_SetString(PyExc_TypeError
,
8823 "not all arguments converted during string formatting");
8830 if (argidx
< arglen
&& !dict
) {
8831 PyErr_SetString(PyExc_TypeError
,
8832 "not all arguments converted during string formatting");
/* Trim the result down to the part actually used (reslen - rescnt). */
8836 if (_PyUnicode_Resize(&result
, reslen
- rescnt
) < 0)
8842 return (PyObject
*)result
;
8853 static PyBufferProcs unicode_as_buffer
= {
8854 (readbufferproc
) unicode_buffer_getreadbuf
,
8855 (writebufferproc
) unicode_buffer_getwritebuf
,
8856 (segcountproc
) unicode_buffer_getsegcount
,
8857 (charbufferproc
) unicode_buffer_getcharbuf
,
/* Forward declaration: unicode_new() calls unicode_subtype_new() for
   types other than PyUnicode_Type, and the definition comes after it. */
8861 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
8864 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8867 static char *kwlist
[] = {"string", "encoding", "errors", 0};
8868 char *encoding
= NULL
;
8869 char *errors
= NULL
;
8871 if (type
!= &PyUnicode_Type
)
8872 return unicode_subtype_new(type
, args
, kwds
);
8873 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
8874 kwlist
, &x
, &encoding
, &errors
))
8877 return (PyObject
*)_PyUnicode_New(0);
8878 if (encoding
== NULL
&& errors
== NULL
)
8879 return PyObject_Unicode(x
);
8881 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
8885 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8887 PyUnicodeObject
*tmp
, *pnew
;
8890 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
8891 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
8894 assert(PyUnicode_Check(tmp
));
8895 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
8900 pnew
->str
= (Py_UNICODE
*) PyObject_MALLOC(sizeof(Py_UNICODE
) * (n
+1));
8901 if (pnew
->str
== NULL
) {
8902 _Py_ForgetReference((PyObject
*)pnew
);
8905 return PyErr_NoMemory();
8907 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
8909 pnew
->hash
= tmp
->hash
;
8911 return (PyObject
*)pnew
;
/* Docstring of the unicode type itself (used as tp_doc in PyUnicode_Type). */
8914 PyDoc_STRVAR(unicode_doc
,
8915 "unicode(string [, encoding[, errors]]) -> object\n\
8917 Create a new Unicode object from the given encoded string.\n\
8918 encoding defaults to the current default string encoding.\n\
8919 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8921 PyTypeObject PyUnicode_Type
= {
8922 PyVarObject_HEAD_INIT(&PyType_Type
, 0)
8923 "unicode", /* tp_name */
8924 sizeof(PyUnicodeObject
), /* tp_size */
8925 0, /* tp_itemsize */
8927 (destructor
)unicode_dealloc
, /* tp_dealloc */
8932 unicode_repr
, /* tp_repr */
8933 &unicode_as_number
, /* tp_as_number */
8934 &unicode_as_sequence
, /* tp_as_sequence */
8935 &unicode_as_mapping
, /* tp_as_mapping */
8936 (hashfunc
) unicode_hash
, /* tp_hash*/
8938 (reprfunc
) unicode_str
, /* tp_str */
8939 PyObject_GenericGetAttr
, /* tp_getattro */
8940 0, /* tp_setattro */
8941 &unicode_as_buffer
, /* tp_as_buffer */
8942 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_CHECKTYPES
|
8943 Py_TPFLAGS_BASETYPE
| Py_TPFLAGS_UNICODE_SUBCLASS
, /* tp_flags */
8944 unicode_doc
, /* tp_doc */
8945 0, /* tp_traverse */
8947 PyUnicode_RichCompare
, /* tp_richcompare */
8948 0, /* tp_weaklistoffset */
8950 0, /* tp_iternext */
8951 unicode_methods
, /* tp_methods */
8954 &PyBaseString_Type
, /* tp_base */
8956 0, /* tp_descr_get */
8957 0, /* tp_descr_set */
8958 0, /* tp_dictoffset */
8961 unicode_new
, /* tp_new */
8962 PyObject_Del
, /* tp_free */
8965 /* Initialize the Unicode implementation */
8967 void _PyUnicode_Init(void)
8971 /* XXX - move this array to unicodectype.c ? */
8972 Py_UNICODE linebreak
[] = {
8973 0x000A, /* LINE FEED */
8974 0x000D, /* CARRIAGE RETURN */
8975 0x001C, /* FILE SEPARATOR */
8976 0x001D, /* GROUP SEPARATOR */
8977 0x001E, /* RECORD SEPARATOR */
8978 0x0085, /* NEXT LINE */
8979 0x2028, /* LINE SEPARATOR */
8980 0x2029, /* PARAGRAPH SEPARATOR */
8983 /* Init the implementation */
8986 unicode_empty
= _PyUnicode_New(0);
8990 strcpy(unicode_default_encoding
, "ascii");
8991 for (i
= 0; i
< 256; i
++)
8992 unicode_latin1
[i
] = NULL
;
8993 if (PyType_Ready(&PyUnicode_Type
) < 0)
8994 Py_FatalError("Can't initialize 'unicode'");
8996 /* initialize the linebreak bloom filter */
8997 bloom_linebreak
= make_bloom_mask(
8998 linebreak
, sizeof(linebreak
) / sizeof(linebreak
[0])
9001 PyType_Ready(&EncodingMapType
);
9004 /* Finalize the Unicode implementation */
9007 PyUnicode_ClearFreeList(void)
9009 int freelist_size
= numfree
;
9012 for (u
= free_list
; u
!= NULL
;) {
9013 PyUnicodeObject
*v
= u
;
9014 u
= *(PyUnicodeObject
**)u
;
9016 PyObject_DEL(v
->str
);
9017 Py_XDECREF(v
->defenc
);
9022 assert(numfree
== 0);
9023 return freelist_size
;
9027 _PyUnicode_Fini(void)
9031 Py_XDECREF(unicode_empty
);
9032 unicode_empty
= NULL
;
9034 for (i
= 0; i
< 256; i
++) {
9035 if (unicode_latin1
[i
]) {
9036 Py_DECREF(unicode_latin1
[i
]);
9037 unicode_latin1
[i
] = NULL
;
9040 (void)PyUnicode_ClearFreeList();
9051 indent-tabs-mode: nil