3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
45 #include "unicodeobject.h"
/* Limit for the Unicode object free list: the maximum number of
   deallocated Unicode objects kept around for reuse. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size (length in Py_UNICODE
   code units) less than this limit. This reduces malloc() overhead
   for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9
/* Endianness switches; defaults to little endian.

   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, selected by whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
95 /* Free list for Unicode objects */
96 static PyUnicodeObject
*free_list
;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject
*unicode_empty
;
102 /* Single character Unicode strings in the Latin-1 range are being
104 static PyUnicodeObject
*unicode_latin1
[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding
[100];
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point 0..127 (8 entries per row);
   an entry of 1 marks the code point as whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x0009: HORIZONTAL TABULATION */
    /* 0x000A: LINE FEED */
    /* 0x000B: VERTICAL TABULATION */
    /* 0x000C: FORM FEED */
    /* 0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x001C: FILE SEPARATOR */
    /* 0x001D: GROUP SEPARATOR */
    /* 0x001E: RECORD SEPARATOR */
    /* 0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
/* Same for linebreaks.

   Lookup table indexed by code point 0..127 (8 entries per row);
   an entry of 1 marks the code point as a line break.  The table is
   only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x000A: LINE FEED */
    /* 0x000D: CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x001C: FILE SEPARATOR */
    /* 0x001D: GROUP SEPARATOR */
    /* 0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
174 PyUnicode_GetMax(void)
176 #ifdef Py_UNICODE_WIDE
179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode character as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Test whether the bit selected by the low 5 bits of ch is set in mask.
   The result is non-zero if set, 0 otherwise (not normalized to 1).
   The shifted constant is 1UL so that bit 31 can be reached without
   shifting into the sign bit of an int, and so that the bit pattern has
   the same type as BLOOM_MASK instead of being sign-extended into it. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line break test: code points below 128 are answered straight from the
   ascii_linebreak table; everything else is first screened with the
   bloom mask so that Py_UNICODE_ISLINEBREAK() is only evaluated for
   characters whose bit is set in bloom_linebreak. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
203 Py_LOCAL_INLINE(BLOOM_MASK
) make_bloom_mask(Py_UNICODE
* ptr
, Py_ssize_t len
)
205 /* calculate simple bloom-style bitmask for a given unicode string */
211 for (i
= 0; i
< len
; i
++)
212 mask
|= (1 << (ptr
[i
] & 0x1F));
217 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr
, Py_UNICODE
* set
, Py_ssize_t setlen
)
221 for (i
= 0; i
< setlen
; i
++)
/* Membership test for chr in the character set set[0..setlen-1]:
   the cheap BLOOM() bit test runs first and the linear scan in
   unicode_member() only runs when that test passes.  The whole
   expansion is parenthesized so the macro behaves as a single
   expression under operators such as ! or ||.  Note that chr is
   evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
231 /* --- Unicode Object ----------------------------------------------------- */
234 int unicode_resize(register PyUnicodeObject
*unicode
,
239 /* Shortcut if there's nothing much to do. */
240 if (unicode
->length
== length
)
243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
247 if (unicode
== unicode_empty
||
248 (unicode
->length
== 1 &&
249 unicode
->str
[0] < 256U &&
250 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
251 PyErr_SetString(PyExc_SystemError
,
252 "can't resize shared unicode objects");
/* We allocate one extra Py_UNICODE element (not merely one byte) to make
   sure the string is Ux0000 terminated.
   The overallocation is also used by fastsearch, which assumes that it's
   safe to look at str[length] (without making any assumptions about what
   it contains). */
261 oldstr
= unicode
->str
;
262 unicode
->str
= PyObject_REALLOC(unicode
->str
,
263 sizeof(Py_UNICODE
) * (length
+ 1));
265 unicode
->str
= (Py_UNICODE
*)oldstr
;
269 unicode
->str
[length
] = 0;
270 unicode
->length
= length
;
273 /* Reset the object caches */
274 if (unicode
->defenc
) {
275 Py_DECREF(unicode
->defenc
);
276 unicode
->defenc
= NULL
;
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
292 PyUnicodeObject
*_PyUnicode_New(Py_ssize_t length
)
294 register PyUnicodeObject
*unicode
;
296 /* Optimization for empty strings */
297 if (length
== 0 && unicode_empty
!= NULL
) {
298 Py_INCREF(unicode_empty
);
299 return unicode_empty
;
302 /* Ensure we won't overflow the size. */
303 if (length
> ((PY_SSIZE_T_MAX
/ sizeof(Py_UNICODE
)) - 1)) {
304 return (PyUnicodeObject
*)PyErr_NoMemory();
307 /* Unicode freelist & memory allocation */
310 free_list
= *(PyUnicodeObject
**)unicode
;
313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
315 if ((unicode
->length
< length
) &&
316 unicode_resize(unicode
, length
) < 0) {
317 PyObject_DEL(unicode
->str
);
322 size_t new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
323 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
325 PyObject_INIT(unicode
, &PyUnicode_Type
);
329 unicode
= PyObject_New(PyUnicodeObject
, &PyUnicode_Type
);
332 new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
333 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
/* Initialize the first element to guard against cases where
 * the caller fails before initializing str -- unicode_resize()
 * reads str[0], and the Keep-Alive optimization can keep memory
 * allocated for str alive across a call to unicode_dealloc(unicode).
 * We don't want unicode_resize to read uninitialized memory in
 * that case.
 */
348 unicode
->str
[length
] = 0;
349 unicode
->length
= length
;
351 unicode
->defenc
= NULL
;
355 /* XXX UNREF/NEWREF interface should be more symmetrical */
357 _Py_ForgetReference((PyObject
*)unicode
);
358 PyObject_Del(unicode
);
363 void unicode_dealloc(register PyUnicodeObject
*unicode
)
365 if (PyUnicode_CheckExact(unicode
) &&
366 numfree
< PyUnicode_MAXFREELIST
) {
367 /* Keep-Alive optimization */
368 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
369 PyObject_DEL(unicode
->str
);
373 if (unicode
->defenc
) {
374 Py_DECREF(unicode
->defenc
);
375 unicode
->defenc
= NULL
;
377 /* Add to free list */
378 *(PyUnicodeObject
**)unicode
= free_list
;
383 PyObject_DEL(unicode
->str
);
384 Py_XDECREF(unicode
->defenc
);
385 Py_TYPE(unicode
)->tp_free((PyObject
*)unicode
);
389 int PyUnicode_Resize(PyObject
**unicode
, Py_ssize_t length
)
391 register PyUnicodeObject
*v
;
393 /* Argument checks */
394 if (unicode
== NULL
) {
395 PyErr_BadInternalCall();
398 v
= (PyUnicodeObject
*)*unicode
;
399 if (v
== NULL
|| !PyUnicode_Check(v
) || Py_REFCNT(v
) != 1 || length
< 0) {
400 PyErr_BadInternalCall();
404 /* Resizing unicode_empty and single character objects is not
405 possible since these are being shared. We simply return a fresh
406 copy with the same Unicode content. */
407 if (v
->length
!= length
&&
408 (v
== unicode_empty
|| v
->length
== 1)) {
409 PyUnicodeObject
*w
= _PyUnicode_New(length
);
412 Py_UNICODE_COPY(w
->str
, v
->str
,
413 length
< v
->length
? length
: v
->length
);
415 *unicode
= (PyObject
*)w
;
419 /* Note that we don't have to modify *unicode for unshared Unicode
420 objects, since we can modify them in-place. */
421 return unicode_resize(v
, length
);
/* Internal API for use in unicodeobject.c only !

   Convenience wrapper around PyUnicode_Resize() that accepts the address
   of any object-pointer variable (e.g. a PyUnicodeObject **) and casts it
   to the PyObject ** that PyUnicode_Resize() takes.  Evaluates to
   whatever PyUnicode_Resize() returns. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
428 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
431 PyUnicodeObject
*unicode
;
433 /* If the Unicode data is known at construction time, we can apply
434 some optimizations which share commonly used objects. */
437 /* Optimization for empty strings */
438 if (size
== 0 && unicode_empty
!= NULL
) {
439 Py_INCREF(unicode_empty
);
440 return (PyObject
*)unicode_empty
;
443 /* Single character Unicode objects in the Latin-1 range are
444 shared when using this constructor */
445 if (size
== 1 && *u
< 256) {
446 unicode
= unicode_latin1
[*u
];
448 unicode
= _PyUnicode_New(1);
451 unicode
->str
[0] = *u
;
452 unicode_latin1
[*u
] = unicode
;
455 return (PyObject
*)unicode
;
459 unicode
= _PyUnicode_New(size
);
463 /* Copy the Unicode data into the new object */
465 Py_UNICODE_COPY(unicode
->str
, u
, size
);
467 return (PyObject
*)unicode
;
470 PyObject
*PyUnicode_FromStringAndSize(const char *u
, Py_ssize_t size
)
472 PyUnicodeObject
*unicode
;
475 PyErr_SetString(PyExc_SystemError
,
476 "Negative size passed to PyUnicode_FromStringAndSize");
480 /* If the Unicode data is known at construction time, we can apply
481 some optimizations which share commonly used objects.
482 Also, this means the input must be UTF-8, so fall back to the
483 UTF-8 decoder at the end. */
486 /* Optimization for empty strings */
487 if (size
== 0 && unicode_empty
!= NULL
) {
488 Py_INCREF(unicode_empty
);
489 return (PyObject
*)unicode_empty
;
492 /* Single characters are shared when using this constructor.
493 Restrict to ASCII, since the input must be UTF-8. */
494 if (size
== 1 && Py_CHARMASK(*u
) < 128) {
495 unicode
= unicode_latin1
[Py_CHARMASK(*u
)];
497 unicode
= _PyUnicode_New(1);
500 unicode
->str
[0] = Py_CHARMASK(*u
);
501 unicode_latin1
[Py_CHARMASK(*u
)] = unicode
;
504 return (PyObject
*)unicode
;
507 return PyUnicode_DecodeUTF8(u
, size
, NULL
);
510 unicode
= _PyUnicode_New(size
);
514 return (PyObject
*)unicode
;
517 PyObject
*PyUnicode_FromString(const char *u
)
519 size_t size
= strlen(u
);
520 if (size
> PY_SSIZE_T_MAX
) {
521 PyErr_SetString(PyExc_OverflowError
, "input too long");
525 return PyUnicode_FromStringAndSize(u
, size
);
530 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
533 PyUnicodeObject
*unicode
;
536 PyErr_BadInternalCall();
540 unicode
= _PyUnicode_New(size
);
544 /* Copy the wchar_t data into the new object */
545 #ifdef HAVE_USABLE_WCHAR_T
546 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
549 register Py_UNICODE
*u
;
550 register Py_ssize_t i
;
551 u
= PyUnicode_AS_UNICODE(unicode
);
552 for (i
= size
; i
> 0; i
--)
557 return (PyObject
*)unicode
;
561 makefmt(char *fmt
, int longflag
, int size_tflag
, int zeropad
, int width
, int precision
, char c
)
567 fmt
+= sprintf(fmt
, "%d", width
);
570 fmt
+= sprintf(fmt
, ".%d", precision
);
573 else if (size_tflag
) {
574 char *f
= PY_FORMAT_SIZE_T
;
/* Append the NUL-terminated string `string` to the output, one element
   at a time.  Relies on two variables from the enclosing scope: `copy`
   (scratch read pointer) and `s` (output write pointer, advanced past the
   appended characters).  The terminating NUL is not copied.
   Wrapped in do { } while (0) so that `appendstring(x);` is a single
   statement and stays valid in an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy; ) *s++ = *copy++; } while (0)
585 PyUnicode_FromFormatV(const char *format
, va_list vargs
)
588 Py_ssize_t callcount
= 0;
589 PyObject
**callresults
= NULL
;
590 PyObject
**callresult
= NULL
;
598 /* used by sprintf */
600 /* use abuffer instead of buffer, if we need more space
601 * (which can happen if there's a format specifier with width). */
602 char *abuffer
= NULL
;
604 Py_ssize_t abuffersize
= 0;
605 char fmt
[60]; /* should be enough for %0width.precisionld */
608 #ifdef VA_LIST_IS_ARRAY
609 Py_MEMCPY(count
, vargs
, sizeof(va_list));
612 __va_copy(count
, vargs
);
617 /* step 1: count the number of %S/%R format specifications
618 * (we call PyObject_Str()/PyObject_Repr() for these objects
619 * once during step 3 and put the result in an array) */
620 for (f
= format
; *f
; f
++) {
621 if (*f
== '%' && (*(f
+1)=='S' || *(f
+1)=='R'))
624 /* step 2: allocate memory for the results of
625 * PyObject_Str()/PyObject_Repr() calls */
627 callresults
= PyObject_Malloc(sizeof(PyObject
*)*callcount
);
632 callresult
= callresults
;
634 /* step 3: figure out how large a buffer we need */
635 for (f
= format
; *f
; f
++) {
639 while (isdigit((unsigned)*f
))
640 width
= (width
*10) + *f
++ - '0';
641 while (*++f
&& *f
!= '%' && !isalpha((unsigned)*f
))
644 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
645 * they don't affect the amount of space we reserve.
647 if ((*f
== 'l' || *f
== 'z') &&
648 (f
[1] == 'd' || f
[1] == 'u'))
653 (void)va_arg(count
, int);
654 /* fall through... */
658 case 'd': case 'u': case 'i': case 'x':
659 (void) va_arg(count
, int);
660 /* 20 bytes is enough to hold a 64-bit
661 integer. Decimal takes the most space.
662 This isn't enough for octal.
663 If a width is specified we need more
664 (which we allocate later). */
668 if (abuffersize
< width
)
675 s
= va_arg(count
, unsigned char*);
679 } else if (*s
< 0xc0) {
682 } else if (*s
< 0xc0) {
686 } else if (*s
< 0xe0) {
692 #ifdef Py_UNICODE_WIDE
707 PyObject
*obj
= va_arg(count
, PyObject
*);
708 assert(obj
&& PyUnicode_Check(obj
));
709 n
+= PyUnicode_GET_SIZE(obj
);
714 PyObject
*obj
= va_arg(count
, PyObject
*);
715 const char *str
= va_arg(count
, const char *);
717 assert(!obj
|| PyUnicode_Check(obj
));
719 n
+= PyUnicode_GET_SIZE(obj
);
726 PyObject
*obj
= va_arg(count
, PyObject
*);
729 str
= PyObject_Str(obj
);
732 n
+= PyUnicode_GET_SIZE(str
);
733 /* Remember the str and switch to the next slot */
739 PyObject
*obj
= va_arg(count
, PyObject
*);
742 repr
= PyObject_Repr(obj
);
745 n
+= PyUnicode_GET_SIZE(repr
);
746 /* Remember the repr and switch to the next slot */
747 *callresult
++ = repr
;
751 (void) va_arg(count
, int);
752 /* maximum 64-bit pointer representation:
754 * so 19 characters is enough.
755 * XXX I count 18 -- what's the extra for?
760 /* if we stumble upon an unknown
761 formatting code, copy the rest of
762 the format string to the output
763 string. (we cannot just skip the
764 code, since there's no way to know
765 what's in the argument list) */
773 if (abuffersize
> 20) {
774 abuffer
= PyObject_Malloc(abuffersize
);
779 realbuffer
= abuffer
;
783 /* step 4: fill the buffer */
784 /* Since we've analyzed how much space we need for the worst case,
785 we don't have to resize the string.
786 There can be no errors beyond this point. */
787 string
= PyUnicode_FromUnicode(NULL
, n
);
791 s
= PyUnicode_AS_UNICODE(string
);
792 callresult
= callresults
;
794 for (f
= format
; *f
; f
++) {
799 zeropad
= (*f
== '0');
800 /* parse the width.precision part */
802 while (isdigit((unsigned)*f
))
803 width
= (width
*10) + *f
++ - '0';
807 while (isdigit((unsigned)*f
))
808 precision
= (precision
*10) + *f
++ - '0';
810 /* handle the long flag, but only for %ld and %lu.
811 others can be added when necessary. */
812 if (*f
== 'l' && (f
[1] == 'd' || f
[1] == 'u')) {
816 /* handle the size_t flag. */
817 if (*f
== 'z' && (f
[1] == 'd' || f
[1] == 'u')) {
824 *s
++ = va_arg(vargs
, int);
827 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'd');
829 sprintf(realbuffer
, fmt
, va_arg(vargs
, long));
831 sprintf(realbuffer
, fmt
, va_arg(vargs
, Py_ssize_t
));
833 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
834 appendstring(realbuffer
);
837 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'u');
839 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned long));
841 sprintf(realbuffer
, fmt
, va_arg(vargs
, size_t));
843 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned int));
844 appendstring(realbuffer
);
847 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'i');
848 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
849 appendstring(realbuffer
);
852 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'x');
853 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
854 appendstring(realbuffer
);
858 /* Parameter must be UTF-8 encoded.
859 In case of encoding errors, use
860 the replacement character. */
862 p
= va_arg(vargs
, char*);
863 u
= PyUnicode_DecodeUTF8(p
, strlen(p
),
867 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(u
),
868 PyUnicode_GET_SIZE(u
));
869 s
+= PyUnicode_GET_SIZE(u
);
875 PyObject
*obj
= va_arg(vargs
, PyObject
*);
876 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
877 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
883 PyObject
*obj
= va_arg(vargs
, PyObject
*);
884 const char *str
= va_arg(vargs
, const char *);
886 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
887 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
900 /* unused, since we already have the result */
901 (void) va_arg(vargs
, PyObject
*);
902 ucopy
= PyUnicode_AS_UNICODE(*callresult
);
903 usize
= PyUnicode_GET_SIZE(*callresult
);
904 for (upos
= 0; upos
<usize
;)
905 *s
++ = ucopy
[upos
++];
906 /* We're done with the unicode()/repr() => forget it */
907 Py_DECREF(*callresult
);
908 /* switch to next unicode()/repr() result */
913 sprintf(buffer
, "%p", va_arg(vargs
, void*));
914 /* %p is ill-defined: ensure leading 0x. */
915 if (buffer
[1] == 'X')
917 else if (buffer
[1] != 'x') {
918 memmove(buffer
+2, buffer
, strlen(buffer
)+1);
922 appendstring(buffer
);
937 PyObject_Free(callresults
);
939 PyObject_Free(abuffer
);
940 _PyUnicode_Resize(&string
, s
- PyUnicode_AS_UNICODE(string
));
944 PyObject
**callresult2
= callresults
;
945 while (callresult2
< callresult
) {
946 Py_DECREF(*callresult2
);
949 PyObject_Free(callresults
);
952 PyObject_Free(abuffer
);
959 PyUnicode_FromFormat(const char *format
, ...)
964 #ifdef HAVE_STDARG_PROTOTYPES
965 va_start(vargs
, format
);
969 ret
= PyUnicode_FromFormatV(format
, vargs
);
974 Py_ssize_t
PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
978 if (unicode
== NULL
) {
979 PyErr_BadInternalCall();
983 /* If possible, try to copy the 0-termination as well */
984 if (size
> PyUnicode_GET_SIZE(unicode
))
985 size
= PyUnicode_GET_SIZE(unicode
) + 1;
987 #ifdef HAVE_USABLE_WCHAR_T
988 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
991 register Py_UNICODE
*u
;
992 register Py_ssize_t i
;
993 u
= PyUnicode_AS_UNICODE(unicode
);
994 for (i
= size
; i
> 0; i
--)
999 if (size
> PyUnicode_GET_SIZE(unicode
))
1000 return PyUnicode_GET_SIZE(unicode
);
1007 PyObject
*PyUnicode_FromOrdinal(int ordinal
)
1011 #ifdef Py_UNICODE_WIDE
1012 if (ordinal
< 0 || ordinal
> 0x10ffff) {
1013 PyErr_SetString(PyExc_ValueError
,
1014 "unichr() arg not in range(0x110000) "
1015 "(wide Python build)");
1019 if (ordinal
< 0 || ordinal
> 0xffff) {
1020 PyErr_SetString(PyExc_ValueError
,
1021 "unichr() arg not in range(0x10000) "
1022 "(narrow Python build)");
1027 s
[0] = (Py_UNICODE
)ordinal
;
1028 return PyUnicode_FromUnicode(s
, 1);
1031 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
1033 /* XXX Perhaps we should make this API an alias of
1034 PyObject_Unicode() instead ?! */
1035 if (PyUnicode_CheckExact(obj
)) {
1039 if (PyUnicode_Check(obj
)) {
1040 /* For a Unicode subtype that's not a Unicode object,
1041 return a true Unicode object with the same data. */
1042 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
1043 PyUnicode_GET_SIZE(obj
));
1045 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
1048 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
1049 const char *encoding
,
1052 const char *s
= NULL
;
1057 PyErr_BadInternalCall();
1062 /* For b/w compatibility we also accept Unicode objects provided
1063 that no encodings is given and then redirect to
1064 PyObject_Unicode() which then applies the additional logic for
1067 NOTE: This API should really only be used for object which
1068 represent *encoded* Unicode !
1071 if (PyUnicode_Check(obj
)) {
1073 PyErr_SetString(PyExc_TypeError
,
1074 "decoding Unicode is not supported");
1077 return PyObject_Unicode(obj
);
1080 if (PyUnicode_Check(obj
)) {
1081 PyErr_SetString(PyExc_TypeError
,
1082 "decoding Unicode is not supported");
1088 if (PyString_Check(obj
)) {
1089 s
= PyString_AS_STRING(obj
);
1090 len
= PyString_GET_SIZE(obj
);
1092 else if (PyByteArray_Check(obj
)) {
1093 /* Python 2.x specific */
1094 PyErr_Format(PyExc_TypeError
,
1095 "decoding bytearray is not supported");
1098 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
1099 /* Overwrite the error message with something more useful in
1100 case of a TypeError. */
1101 if (PyErr_ExceptionMatches(PyExc_TypeError
))
1102 PyErr_Format(PyExc_TypeError
,
1103 "coercing to Unicode: need string or buffer, "
1105 Py_TYPE(obj
)->tp_name
);
1109 /* Convert to Unicode */
1111 Py_INCREF(unicode_empty
);
1112 v
= (PyObject
*)unicode_empty
;
1115 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
1123 PyObject
*PyUnicode_Decode(const char *s
,
1125 const char *encoding
,
1128 PyObject
*buffer
= NULL
, *unicode
;
1130 if (encoding
== NULL
)
1131 encoding
= PyUnicode_GetDefaultEncoding();
1133 /* Shortcuts for common default encodings */
1134 if (strcmp(encoding
, "utf-8") == 0)
1135 return PyUnicode_DecodeUTF8(s
, size
, errors
);
1136 else if (strcmp(encoding
, "latin-1") == 0)
1137 return PyUnicode_DecodeLatin1(s
, size
, errors
);
1138 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1139 else if (strcmp(encoding
, "mbcs") == 0)
1140 return PyUnicode_DecodeMBCS(s
, size
, errors
);
1142 else if (strcmp(encoding
, "ascii") == 0)
1143 return PyUnicode_DecodeASCII(s
, size
, errors
);
1145 /* Decode via the codec registry */
1146 buffer
= PyBuffer_FromMemory((void *)s
, size
);
1149 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
1150 if (unicode
== NULL
)
1152 if (!PyUnicode_Check(unicode
)) {
1153 PyErr_Format(PyExc_TypeError
,
1154 "decoder did not return an unicode object (type=%.400s)",
1155 Py_TYPE(unicode
)->tp_name
);
1167 PyObject
*PyUnicode_AsDecodedObject(PyObject
*unicode
,
1168 const char *encoding
,
1173 if (!PyUnicode_Check(unicode
)) {
1174 PyErr_BadArgument();
1178 if (encoding
== NULL
)
1179 encoding
= PyUnicode_GetDefaultEncoding();
1181 /* Decode via the codec registry */
1182 v
= PyCodec_Decode(unicode
, encoding
, errors
);
1191 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
1193 const char *encoding
,
1196 PyObject
*v
, *unicode
;
1198 unicode
= PyUnicode_FromUnicode(s
, size
);
1199 if (unicode
== NULL
)
1201 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
1206 PyObject
*PyUnicode_AsEncodedObject(PyObject
*unicode
,
1207 const char *encoding
,
1212 if (!PyUnicode_Check(unicode
)) {
1213 PyErr_BadArgument();
1217 if (encoding
== NULL
)
1218 encoding
= PyUnicode_GetDefaultEncoding();
1220 /* Encode via the codec registry */
1221 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1230 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
1231 const char *encoding
,
1236 if (!PyUnicode_Check(unicode
)) {
1237 PyErr_BadArgument();
1241 if (encoding
== NULL
)
1242 encoding
= PyUnicode_GetDefaultEncoding();
1244 /* Shortcuts for common default encodings */
1245 if (errors
== NULL
) {
1246 if (strcmp(encoding
, "utf-8") == 0)
1247 return PyUnicode_AsUTF8String(unicode
);
1248 else if (strcmp(encoding
, "latin-1") == 0)
1249 return PyUnicode_AsLatin1String(unicode
);
1250 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1251 else if (strcmp(encoding
, "mbcs") == 0)
1252 return PyUnicode_AsMBCSString(unicode
);
1254 else if (strcmp(encoding
, "ascii") == 0)
1255 return PyUnicode_AsASCIIString(unicode
);
1258 /* Encode via the codec registry */
1259 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1262 if (!PyString_Check(v
)) {
1263 PyErr_Format(PyExc_TypeError
,
1264 "encoder did not return a string object (type=%.400s)",
1265 Py_TYPE(v
)->tp_name
);
1275 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
1278 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
1282 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
1283 if (v
&& errors
== NULL
)
1284 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
1288 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
1290 if (!PyUnicode_Check(unicode
)) {
1291 PyErr_BadArgument();
1294 return PyUnicode_AS_UNICODE(unicode
);
1300 Py_ssize_t
PyUnicode_GetSize(PyObject
*unicode
)
1302 if (!PyUnicode_Check(unicode
)) {
1303 PyErr_BadArgument();
1306 return PyUnicode_GET_SIZE(unicode
);
1312 const char *PyUnicode_GetDefaultEncoding(void)
1314 return unicode_default_encoding
;
1317 int PyUnicode_SetDefaultEncoding(const char *encoding
)
1321 /* Make sure the encoding is valid. As side effect, this also
1322 loads the encoding into the codec registry cache. */
1323 v
= _PyCodec_Lookup(encoding
);
1327 strncpy(unicode_default_encoding
,
1329 sizeof(unicode_default_encoding
));
1336 /* error handling callback helper:
1337 build arguments, call the callback and check the arguments,
1338 if no exception occurred, copy the replacement to the output
1339 and adjust various state variables.
1340 return 0 on success, -1 on error
1344 int unicode_decode_call_errorhandler(const char *errors
, PyObject
**errorHandler
,
1345 const char *encoding
, const char *reason
,
1346 const char *input
, Py_ssize_t insize
, Py_ssize_t
*startinpos
,
1347 Py_ssize_t
*endinpos
, PyObject
**exceptionObject
, const char **inptr
,
1348 PyObject
**output
, Py_ssize_t
*outpos
, Py_UNICODE
**outptr
)
1350 static char *argparse
= "O!n;decoding error handler must return (unicode, int) tuple";
1352 PyObject
*restuple
= NULL
;
1353 PyObject
*repunicode
= NULL
;
1354 Py_ssize_t outsize
= PyUnicode_GET_SIZE(*output
);
1355 Py_ssize_t requiredsize
;
1361 if (*errorHandler
== NULL
) {
1362 *errorHandler
= PyCodec_LookupError(errors
);
1363 if (*errorHandler
== NULL
)
1367 if (*exceptionObject
== NULL
) {
1368 *exceptionObject
= PyUnicodeDecodeError_Create(
1369 encoding
, input
, insize
, *startinpos
, *endinpos
, reason
);
1370 if (*exceptionObject
== NULL
)
1374 if (PyUnicodeDecodeError_SetStart(*exceptionObject
, *startinpos
))
1376 if (PyUnicodeDecodeError_SetEnd(*exceptionObject
, *endinpos
))
1378 if (PyUnicodeDecodeError_SetReason(*exceptionObject
, reason
))
1382 restuple
= PyObject_CallFunctionObjArgs(*errorHandler
, *exceptionObject
, NULL
);
1383 if (restuple
== NULL
)
1385 if (!PyTuple_Check(restuple
)) {
1386 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
1389 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
, &repunicode
, &newpos
))
1392 newpos
= insize
+newpos
;
1393 if (newpos
<0 || newpos
>insize
) {
1394 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", newpos
);
1398 /* need more space? (at least enough for what we
1399 have+the replacement+the rest of the string (starting
1400 at the new input position), so we won't have to check space
1401 when there are no errors in the rest of the string) */
1402 repptr
= PyUnicode_AS_UNICODE(repunicode
);
1403 repsize
= PyUnicode_GET_SIZE(repunicode
);
1404 requiredsize
= *outpos
+ repsize
+ insize
-newpos
;
1405 if (requiredsize
> outsize
) {
1406 if (requiredsize
<2*outsize
)
1407 requiredsize
= 2*outsize
;
1408 if (PyUnicode_Resize(output
, requiredsize
) < 0)
1410 *outptr
= PyUnicode_AS_UNICODE(*output
) + *outpos
;
1413 *inptr
= input
+ newpos
;
1414 Py_UNICODE_COPY(*outptr
, repptr
, repsize
);
1421 Py_XDECREF(restuple
);
1425 /* --- UTF-7 Codec -------------------------------------------------------- */
1427 /* see RFC2152 for details */
1430 char utf7_special
[128] = {
1431 /* indicate whether a UTF-7 character is special i.e. cannot be directly
1435 2 - whitespace (optional)
1436 3 - RFC2152 Set O (optional) */
1437 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
1438 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1439 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
1440 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
1441 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1442 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
1443 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1444 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
/* SPECIAL(c, encodeO, encodeWS) is non-zero when character c cannot be
   written directly:
     - c lies outside 1..127, or utf7_special[c] == 1 (always special);
     - utf7_special[c] == 2 and encodeWS is true;
     - utf7_special[c] == 3 and encodeO is true.
   c is evaluated several times; the range checks come first so the
   table is never indexed with a value outside 1..127.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true  */

#define SPECIAL(c, encodeO, encodeWS)                   \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||        \
     ((encodeO) && (utf7_special[(c)] == 3)))
1459 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True if c is one of the 64 base64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').
   Spelled out as explicit ranges instead of isalnum(): isalnum() depends
   on the current locale and is undefined for arguments that are neither
   EOF nor representable as unsigned char, which a wide character value
   can be.  Assumes an ASCII-compatible execution character set
   (contiguous letter ranges).  c is evaluated more than once. */
#define B64CHAR(c)                      \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')
1463 ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
1464 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
1466 #define ENCODE(out, ch, bits) \
1467 while (bits >= 6) { \
1468 *out++ = B64(ch >> (bits-6)); \
1472 #define DECODE(out, ch, bits, surrogate) \
1473 while (bits >= 16) { \
1474 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
1477 /* We have already generated an error for the high surrogate \
1478 so let's not bother seeing if the low surrogate is correct or not */ \
1480 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
1481 /* This is a surrogate pair. Unfortunately we can't represent \
1482 it in a 16-bit character */ \
1484 errmsg = "code pairs are not supported"; \
1491 PyObject
*PyUnicode_DecodeUTF7(const char *s
,
1495 return PyUnicode_DecodeUTF7Stateful(s
, size
, errors
, NULL
);
1498 PyObject
*PyUnicode_DecodeUTF7Stateful(const char *s
,
1501 Py_ssize_t
*consumed
)
1503 const char *starts
= s
;
1504 Py_ssize_t startinpos
;
1505 Py_ssize_t endinpos
;
1508 PyUnicodeObject
*unicode
;
1510 const char *errmsg
= "";
1512 unsigned int bitsleft
= 0;
1513 unsigned long charsleft
= 0;
1515 PyObject
*errorHandler
= NULL
;
1516 PyObject
*exc
= NULL
;
1518 unicode
= _PyUnicode_New(size
);
1524 return (PyObject
*)unicode
;
1533 ch
= (unsigned char) *s
;
1536 if ((ch
== '-') || !B64CHAR(ch
)) {
1540 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
1541 if (bitsleft
>= 6) {
1542 /* The shift sequence has a partial character in it. If
1543 bitsleft < 6 then we could just classify it as padding
1544 but that is not the case here */
1546 errmsg
= "partial character in shift sequence";
1549 /* According to RFC2152 the remaining bits should be zero. We
1550 choose to signal an error/insert a replacement character
1551 here to indicate the potential of a misencoded character. */
1553 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1554 if (bitsleft
&& charsleft
<< (sizeof(charsleft
) * 8 - bitsleft
)) {
1555 errmsg
= "non-zero padding bits in shift sequence";
1560 if ((s
< e
) && (*(s
) == '-')) {
1564 } else if (SPECIAL(ch
,0,0)) {
1565 errmsg
= "unexpected special character";
1571 charsleft
= (charsleft
<< 6) | UB64(ch
);
1574 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
1577 else if ( ch
== '+' ) {
1578 startinpos
= s
-starts
;
1580 if (s
< e
&& *s
== '-') {
1589 else if (SPECIAL(ch
,0,0)) {
1590 startinpos
= s
-starts
;
1591 errmsg
= "unexpected special character";
1601 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1602 endinpos
= s
-starts
;
1603 if (unicode_decode_call_errorhandler(
1604 errors
, &errorHandler
,
1606 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1607 (PyObject
**)&unicode
, &outpos
, &p
))
1611 if (inShift
&& !consumed
) {
1612 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1614 if (unicode_decode_call_errorhandler(
1615 errors
, &errorHandler
,
1616 "utf7", "unterminated shift sequence",
1617 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1618 (PyObject
**)&unicode
, &outpos
, &p
))
1625 *consumed
= startinpos
;
1627 *consumed
= s
-starts
;
1630 if (_PyUnicode_Resize(&unicode
, p
- PyUnicode_AS_UNICODE(unicode
)) < 0)
1633 Py_XDECREF(errorHandler
);
1635 return (PyObject
*)unicode
;
1638 Py_XDECREF(errorHandler
);
1645 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
1648 int encodeWhiteSpace
,
1652 /* It might be possible to tighten this worst case */
1653 Py_ssize_t cbAllocated
= 5 * size
;
1656 unsigned int bitsleft
= 0;
1657 unsigned long charsleft
= 0;
1661 if (cbAllocated
/ 5 != size
)
1662 return PyErr_NoMemory();
1665 return PyString_FromStringAndSize(NULL
, 0);
1667 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
1671 start
= out
= PyString_AS_STRING(v
);
1672 for (;i
< size
; ++i
) {
1673 Py_UNICODE ch
= s
[i
];
1679 } else if (SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
1683 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
1684 inShift
= bitsleft
> 0;
1689 if (!SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
1690 *out
++ = B64(charsleft
<< (6-bitsleft
));
1693 /* Characters not in the BASE64 set implicitly unshift the sequence
1694 so no '-' is required, except if the character is itself a '-' */
1695 if (B64CHAR(ch
) || ch
== '-') {
1702 charsleft
= (charsleft
<< 16) | ch
;
1703 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
1705 /* If the next character is special then we don't need to terminate
1706 the shift sequence. If the next character is not a BASE64 character
1707 or '-' then the shift sequence will be terminated implicitly and we
1708 don't have to insert a '-'. */
1710 if (bitsleft
== 0) {
1712 Py_UNICODE ch2
= s
[i
+1];
1714 if (SPECIAL(ch2
, encodeSetO
, encodeWhiteSpace
)) {
1716 } else if (B64CHAR(ch2
) || ch2
== '-') {
1733 *out
++= B64(charsleft
<< (6-bitsleft
) );
1737 _PyString_Resize(&v
, out
- start
);
1748 /* --- UTF-8 Codec -------------------------------------------------------- */
1751 char utf8_code_length
[256] = {
1752 /* Map UTF-8 encoded prefix byte to sequence length. zero means
1753 illegal prefix. see RFC 2279 for details */
1754 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1755 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1756 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1757 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1758 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1759 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1760 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1761 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1762 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1763 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1764 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1765 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1766 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1767 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1768 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1769 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
1772 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1776 return PyUnicode_DecodeUTF8Stateful(s
, size
, errors
, NULL
);
1779 PyObject
*PyUnicode_DecodeUTF8Stateful(const char *s
,
1782 Py_ssize_t
*consumed
)
1784 const char *starts
= s
;
1786 Py_ssize_t startinpos
;
1787 Py_ssize_t endinpos
;
1790 PyUnicodeObject
*unicode
;
1792 const char *errmsg
= "";
1793 PyObject
*errorHandler
= NULL
;
1794 PyObject
*exc
= NULL
;
1796 /* Note: size will always be longer than the resulting Unicode
1798 unicode
= _PyUnicode_New(size
);
1804 return (PyObject
*)unicode
;
1807 /* Unpack UTF-8 encoded data */
1812 Py_UCS4 ch
= (unsigned char)*s
;
1815 *p
++ = (Py_UNICODE
)ch
;
1820 n
= utf8_code_length
[ch
];
1826 errmsg
= "unexpected end of data";
1827 startinpos
= s
-starts
;
1836 errmsg
= "unexpected code byte";
1837 startinpos
= s
-starts
;
1838 endinpos
= startinpos
+1;
1842 errmsg
= "internal error";
1843 startinpos
= s
-starts
;
1844 endinpos
= startinpos
+1;
1848 if ((s
[1] & 0xc0) != 0x80) {
1849 errmsg
= "invalid data";
1850 startinpos
= s
-starts
;
1851 endinpos
= startinpos
+2;
1854 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
1856 startinpos
= s
-starts
;
1857 endinpos
= startinpos
+2;
1858 errmsg
= "illegal encoding";
1862 *p
++ = (Py_UNICODE
)ch
;
1866 if ((s
[1] & 0xc0) != 0x80 ||
1867 (s
[2] & 0xc0) != 0x80) {
1868 errmsg
= "invalid data";
1869 startinpos
= s
-starts
;
1870 endinpos
= startinpos
+3;
1873 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
1875 /* Note: UTF-8 encodings of surrogates are considered
1876 legal UTF-8 sequences;
1878 XXX For wide builds (UCS-4) we should probably try
1879 to recombine the surrogates into a single code
1882 errmsg
= "illegal encoding";
1883 startinpos
= s
-starts
;
1884 endinpos
= startinpos
+3;
1888 *p
++ = (Py_UNICODE
)ch
;
1892 if ((s
[1] & 0xc0) != 0x80 ||
1893 (s
[2] & 0xc0) != 0x80 ||
1894 (s
[3] & 0xc0) != 0x80) {
1895 errmsg
= "invalid data";
1896 startinpos
= s
-starts
;
1897 endinpos
= startinpos
+4;
1900 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
1901 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
1902 /* validate and convert to UTF-16 */
1903 if ((ch
< 0x10000) /* minimum value allowed for 4
1905 || (ch
> 0x10ffff)) /* maximum value allowed for
1908 errmsg
= "illegal encoding";
1909 startinpos
= s
-starts
;
1910 endinpos
= startinpos
+4;
1913 #ifdef Py_UNICODE_WIDE
1914 *p
++ = (Py_UNICODE
)ch
;
1916 /* compute and append the two surrogates: */
1918 /* translate from 10000..10FFFF to 0..FFFF */
1921 /* high surrogate = top 10 bits added to D800 */
1922 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
1924 /* low surrogate = bottom 10 bits added to DC00 */
1925 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
1930 /* Other sizes are only needed for UCS-4 */
1931 errmsg
= "unsupported Unicode code range";
1932 startinpos
= s
-starts
;
1933 endinpos
= startinpos
+n
;
1940 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1941 if (unicode_decode_call_errorhandler(
1942 errors
, &errorHandler
,
1944 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1945 (PyObject
**)&unicode
, &outpos
, &p
))
1949 *consumed
= s
-starts
;
1952 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
1955 Py_XDECREF(errorHandler
);
1957 return (PyObject
*)unicode
;
1960 Py_XDECREF(errorHandler
);
1966 /* Allocation strategy: if the string is short, convert into a stack buffer
1967 and allocate exactly as much space needed at the end. Else allocate the
1968 maximum possible needed (4 result bytes per Unicode character), and return
1969 the excess memory at the end.
1972 PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
1976 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1978 Py_ssize_t i
; /* index into s of next input byte */
1979 PyObject
*v
; /* result string object */
1980 char *p
; /* next free byte in output buffer */
1981 Py_ssize_t nallocated
; /* number of result bytes allocated */
1982 Py_ssize_t nneeded
; /* number of result bytes needed */
1983 char stackbuf
[MAX_SHORT_UNICHARS
* 4];
1988 if (size
<= MAX_SHORT_UNICHARS
) {
1989 /* Write into the stack buffer; nallocated can't overflow.
1990 * At the end, we'll allocate exactly as much heap space as it
1991 * turns out we need.
1993 nallocated
= Py_SAFE_DOWNCAST(sizeof(stackbuf
), size_t, int);
1994 v
= NULL
; /* will allocate after we're done */
1998 /* Overallocate on the heap, and give the excess back at the end. */
1999 nallocated
= size
* 4;
2000 if (nallocated
/ 4 != size
) /* overflow! */
2001 return PyErr_NoMemory();
2002 v
= PyString_FromStringAndSize(NULL
, nallocated
);
2005 p
= PyString_AS_STRING(v
);
2008 for (i
= 0; i
< size
;) {
2009 Py_UCS4 ch
= s
[i
++];
2015 else if (ch
< 0x0800) {
2016 /* Encode Latin-1 */
2017 *p
++ = (char)(0xc0 | (ch
>> 6));
2018 *p
++ = (char)(0x80 | (ch
& 0x3f));
2021 /* Encode UCS2 Unicode ordinals */
2023 /* Special case: check for high surrogate */
2024 if (0xD800 <= ch
&& ch
<= 0xDBFF && i
!= size
) {
2026 /* Check for low surrogate and combine the two to
2027 form a UCS4 value */
2028 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2029 ch
= ((ch
- 0xD800) << 10 | (ch2
- 0xDC00)) + 0x10000;
2033 /* Fall through: handles isolated high surrogates */
2035 *p
++ = (char)(0xe0 | (ch
>> 12));
2036 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2037 *p
++ = (char)(0x80 | (ch
& 0x3f));
2041 /* Encode UCS4 Unicode ordinals */
2042 *p
++ = (char)(0xf0 | (ch
>> 18));
2043 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
2044 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2045 *p
++ = (char)(0x80 | (ch
& 0x3f));
2050 /* This was stack allocated. */
2051 nneeded
= p
- stackbuf
;
2052 assert(nneeded
<= nallocated
);
2053 v
= PyString_FromStringAndSize(stackbuf
, nneeded
);
2056 /* Cut back to size actually needed. */
2057 nneeded
= p
- PyString_AS_STRING(v
);
2058 assert(nneeded
<= nallocated
);
2059 _PyString_Resize(&v
, nneeded
);
2063 #undef MAX_SHORT_UNICHARS
2066 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
2068 if (!PyUnicode_Check(unicode
)) {
2069 PyErr_BadArgument();
2072 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
2073 PyUnicode_GET_SIZE(unicode
),
2077 /* --- UTF-32 Codec ------------------------------------------------------- */
2080 PyUnicode_DecodeUTF32(const char *s
,
2085 return PyUnicode_DecodeUTF32Stateful(s
, size
, errors
, byteorder
, NULL
);
2089 PyUnicode_DecodeUTF32Stateful(const char *s
,
2093 Py_ssize_t
*consumed
)
2095 const char *starts
= s
;
2096 Py_ssize_t startinpos
;
2097 Py_ssize_t endinpos
;
2099 PyUnicodeObject
*unicode
;
2101 #ifndef Py_UNICODE_WIDE
2104 const int pairs
= 0;
2106 const unsigned char *q
, *e
;
2107 int bo
= 0; /* assume native ordering by default */
2108 const char *errmsg
= "";
2109 /* Offsets from q for retrieving bytes in the right order. */
2110 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2111 int iorder
[] = {0, 1, 2, 3};
2113 int iorder
[] = {3, 2, 1, 0};
2115 PyObject
*errorHandler
= NULL
;
2116 PyObject
*exc
= NULL
;
2117 /* On narrow builds we split characters outside the BMP into two
2118 codepoints => count how much extra space we need. */
2119 #ifndef Py_UNICODE_WIDE
2120 for (i
= pairs
= 0; i
< size
/4; i
++)
2121 if (((Py_UCS4
*)s
)[i
] >= 0x10000)
2125 /* This might be one too many, because of a BOM */
2126 unicode
= _PyUnicode_New((size
+3)/4+pairs
);
2130 return (PyObject
*)unicode
;
2132 /* Unpack UTF-32 encoded data */
2134 q
= (unsigned char *)s
;
2140 /* Check for BOM marks (U+FEFF) in the input and adjust current
2141 byte order setting accordingly. In native mode, the leading BOM
2142 mark is skipped, in all other modes, it is copied to the output
2143 stream as-is (giving a ZWNBSP character). */
2146 const Py_UCS4 bom
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2147 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2148 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2149 if (bom
== 0x0000FEFF) {
2153 else if (bom
== 0xFFFE0000) {
2158 if (bom
== 0x0000FEFF) {
2162 else if (bom
== 0xFFFE0000) {
2187 /* remaining bytes at the end? (size should be divisible by 4) */
2191 errmsg
= "truncated data";
2192 startinpos
= ((const char *)q
)-starts
;
2193 endinpos
= ((const char *)e
)-starts
;
2195 /* The remaining input chars are ignored if the callback
2196 chooses to skip the input */
2198 ch
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2199 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2203 errmsg
= "codepoint not in range(0x110000)";
2204 startinpos
= ((const char *)q
)-starts
;
2205 endinpos
= startinpos
+4;
2208 #ifndef Py_UNICODE_WIDE
2211 *p
++ = 0xD800 | ((ch
-0x10000) >> 10);
2212 *p
++ = 0xDC00 | ((ch
-0x10000) & 0x3FF);
2220 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2221 if (unicode_decode_call_errorhandler(
2222 errors
, &errorHandler
,
2224 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2225 (PyObject
**)&unicode
, &outpos
, &p
))
2233 *consumed
= (const char *)q
-starts
;
2236 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2239 Py_XDECREF(errorHandler
);
2241 return (PyObject
*)unicode
;
2245 Py_XDECREF(errorHandler
);
2251 PyUnicode_EncodeUTF32(const Py_UNICODE
*s
,
2258 Py_ssize_t nsize
, bytesize
;
2259 #ifndef Py_UNICODE_WIDE
2260 Py_ssize_t i
, pairs
;
2262 const int pairs
= 0;
2264 /* Offsets from p for storing byte pairs in the right order. */
2265 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2266 int iorder
[] = {0, 1, 2, 3};
2268 int iorder
[] = {3, 2, 1, 0};
2271 #define STORECHAR(CH) \
2273 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2274 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2275 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2276 p[iorder[0]] = (CH) & 0xff; \
2280 /* In narrow builds we can output surrogate pairs as one codepoint,
2281 so we need less space. */
2282 #ifndef Py_UNICODE_WIDE
2283 for (i
= pairs
= 0; i
< size
-1; i
++)
2284 if (0xD800 <= s
[i
] && s
[i
] <= 0xDBFF &&
2285 0xDC00 <= s
[i
+1] && s
[i
+1] <= 0xDFFF)
2288 nsize
= (size
- pairs
+ (byteorder
== 0));
2289 bytesize
= nsize
* 4;
2290 if (bytesize
/ 4 != nsize
)
2291 return PyErr_NoMemory();
2292 v
= PyString_FromStringAndSize(NULL
, bytesize
);
2296 p
= (unsigned char *)PyString_AS_STRING(v
);
2302 if (byteorder
== -1) {
2309 else if (byteorder
== 1) {
2317 while (size
-- > 0) {
2319 #ifndef Py_UNICODE_WIDE
2320 if (0xD800 <= ch
&& ch
<= 0xDBFF && size
> 0) {
2322 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2323 ch
= (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
2335 PyObject
*PyUnicode_AsUTF32String(PyObject
*unicode
)
2337 if (!PyUnicode_Check(unicode
)) {
2338 PyErr_BadArgument();
2341 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode
),
2342 PyUnicode_GET_SIZE(unicode
),
2347 /* --- UTF-16 Codec ------------------------------------------------------- */
2350 PyUnicode_DecodeUTF16(const char *s
,
2355 return PyUnicode_DecodeUTF16Stateful(s
, size
, errors
, byteorder
, NULL
);
2359 PyUnicode_DecodeUTF16Stateful(const char *s
,
2363 Py_ssize_t
*consumed
)
2365 const char *starts
= s
;
2366 Py_ssize_t startinpos
;
2367 Py_ssize_t endinpos
;
2369 PyUnicodeObject
*unicode
;
2371 const unsigned char *q
, *e
;
2372 int bo
= 0; /* assume native ordering by default */
2373 const char *errmsg
= "";
2374 /* Offsets from q for retrieving byte pairs in the right order. */
2375 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2376 int ihi
= 1, ilo
= 0;
2378 int ihi
= 0, ilo
= 1;
2380 PyObject
*errorHandler
= NULL
;
2381 PyObject
*exc
= NULL
;
2383 /* Note: size will always be longer than the resulting Unicode
2385 unicode
= _PyUnicode_New(size
);
2389 return (PyObject
*)unicode
;
2391 /* Unpack UTF-16 encoded data */
2393 q
= (unsigned char *)s
;
2399 /* Check for BOM marks (U+FEFF) in the input and adjust current
2400 byte order setting accordingly. In native mode, the leading BOM
2401 mark is skipped, in all other modes, it is copied to the output
2402 stream as-is (giving a ZWNBSP character). */
2405 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
2406 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2407 if (bom
== 0xFEFF) {
2411 else if (bom
== 0xFFFE) {
2416 if (bom
== 0xFEFF) {
2420 else if (bom
== 0xFFFE) {
2441 /* remaining bytes at the end? (size should be even) */
2445 errmsg
= "truncated data";
2446 startinpos
= ((const char *)q
)-starts
;
2447 endinpos
= ((const char *)e
)-starts
;
2449 /* The remaining input chars are ignored if the callback
2450 chooses to skip the input */
2452 ch
= (q
[ihi
] << 8) | q
[ilo
];
2456 if (ch
< 0xD800 || ch
> 0xDFFF) {
2461 /* UTF-16 code pair: */
2463 errmsg
= "unexpected end of data";
2464 startinpos
= (((const char *)q
)-2)-starts
;
2465 endinpos
= ((const char *)e
)-starts
;
2468 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
2469 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
2471 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2472 #ifndef Py_UNICODE_WIDE
2476 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
2481 errmsg
= "illegal UTF-16 surrogate";
2482 startinpos
= (((const char *)q
)-4)-starts
;
2483 endinpos
= startinpos
+2;
2488 errmsg
= "illegal encoding";
2489 startinpos
= (((const char *)q
)-2)-starts
;
2490 endinpos
= startinpos
+2;
2491 /* Fall through to report the error */
2494 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2495 if (unicode_decode_call_errorhandler(
2496 errors
, &errorHandler
,
2498 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
2499 (PyObject
**)&unicode
, &outpos
, &p
))
2507 *consumed
= (const char *)q
-starts
;
2510 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2513 Py_XDECREF(errorHandler
);
2515 return (PyObject
*)unicode
;
2519 Py_XDECREF(errorHandler
);
2525 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
2532 Py_ssize_t nsize
, bytesize
;
2533 #ifdef Py_UNICODE_WIDE
2534 Py_ssize_t i
, pairs
;
2536 const int pairs
= 0;
2538 /* Offsets from p for storing byte pairs in the right order. */
2539 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2540 int ihi
= 1, ilo
= 0;
2542 int ihi
= 0, ilo
= 1;
2545 #define STORECHAR(CH) \
2547 p[ihi] = ((CH) >> 8) & 0xff; \
2548 p[ilo] = (CH) & 0xff; \
2552 #ifdef Py_UNICODE_WIDE
2553 for (i
= pairs
= 0; i
< size
; i
++)
2554 if (s
[i
] >= 0x10000)
2557 /* 2 * (size + pairs + (byteorder == 0)) */
2558 if (size
> PY_SSIZE_T_MAX
||
2559 size
> PY_SSIZE_T_MAX
- pairs
- (byteorder
== 0))
2560 return PyErr_NoMemory();
2561 nsize
= size
+ pairs
+ (byteorder
== 0);
2562 bytesize
= nsize
* 2;
2563 if (bytesize
/ 2 != nsize
)
2564 return PyErr_NoMemory();
2565 v
= PyString_FromStringAndSize(NULL
, bytesize
);
2569 p
= (unsigned char *)PyString_AS_STRING(v
);
2575 if (byteorder
== -1) {
2580 else if (byteorder
== 1) {
2586 while (size
-- > 0) {
2587 Py_UNICODE ch
= *s
++;
2589 #ifdef Py_UNICODE_WIDE
2590 if (ch
>= 0x10000) {
2591 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
2592 ch
= 0xD800 | ((ch
-0x10000) >> 10);
2603 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
2605 if (!PyUnicode_Check(unicode
)) {
2606 PyErr_BadArgument();
2609 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
2610 PyUnicode_GET_SIZE(unicode
),
2615 /* --- Unicode Escape Codec ----------------------------------------------- */
2617 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
2619 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
2623 const char *starts
= s
;
2624 Py_ssize_t startinpos
;
2625 Py_ssize_t endinpos
;
2632 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
2633 PyObject
*errorHandler
= NULL
;
2634 PyObject
*exc
= NULL
;
2636 /* Escaped strings will always be longer than the resulting
2637 Unicode string, so we start with size here and then reduce the
2638 length after conversion to the true value.
2639 (but if the error callback returns a long replacement string
2640 we'll have to allocate more space) */
2641 v
= _PyUnicode_New(size
);
2645 return (PyObject
*)v
;
2647 p
= PyUnicode_AS_UNICODE(v
);
2655 /* Non-escape characters are interpreted as Unicode ordinals */
2657 *p
++ = (unsigned char) *s
++;
2661 startinpos
= s
-starts
;
2666 c
= '\0'; /* Invalid after \ */
2671 case '\\': *p
++ = '\\'; break;
2672 case '\'': *p
++ = '\''; break;
2673 case '\"': *p
++ = '\"'; break;
2674 case 'b': *p
++ = '\b'; break;
2675 case 'f': *p
++ = '\014'; break; /* FF */
2676 case 't': *p
++ = '\t'; break;
2677 case 'n': *p
++ = '\n'; break;
2678 case 'r': *p
++ = '\r'; break;
2679 case 'v': *p
++ = '\013'; break; /* VT */
2680 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
2682 /* \OOO (octal) escapes */
2683 case '0': case '1': case '2': case '3':
2684 case '4': case '5': case '6': case '7':
2686 if (s
< end
&& '0' <= *s
&& *s
<= '7') {
2687 x
= (x
<<3) + *s
++ - '0';
2688 if (s
< end
&& '0' <= *s
&& *s
<= '7')
2689 x
= (x
<<3) + *s
++ - '0';
2698 message
= "truncated \\xXX escape";
2704 message
= "truncated \\uXXXX escape";
2710 message
= "truncated \\UXXXXXXXX escape";
2713 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2716 if (unicode_decode_call_errorhandler(
2717 errors
, &errorHandler
,
2718 "unicodeescape", "end of string in escape sequence",
2719 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2720 (PyObject
**)&v
, &outpos
, &p
))
2724 for (i
= 0; i
< digits
; ++i
) {
2725 c
= (unsigned char) s
[i
];
2727 endinpos
= (s
+i
+1)-starts
;
2728 if (unicode_decode_call_errorhandler(
2729 errors
, &errorHandler
,
2730 "unicodeescape", message
,
2731 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2732 (PyObject
**)&v
, &outpos
, &p
))
2736 chr
= (chr
<<4) & ~0xF;
2737 if (c
>= '0' && c
<= '9')
2739 else if (c
>= 'a' && c
<= 'f')
2740 chr
+= 10 + c
- 'a';
2742 chr
+= 10 + c
- 'A';
2745 if (chr
== 0xffffffff && PyErr_Occurred())
2746 /* _decoding_error will have already written into the
2750 /* when we get here, chr is a 32-bit unicode character */
2752 /* UCS-2 character */
2753 *p
++ = (Py_UNICODE
) chr
;
2754 else if (chr
<= 0x10ffff) {
2755 /* UCS-4 character. Either store directly, or as
2757 #ifdef Py_UNICODE_WIDE
2761 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
2762 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
2765 endinpos
= s
-starts
;
2766 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2767 if (unicode_decode_call_errorhandler(
2768 errors
, &errorHandler
,
2769 "unicodeescape", "illegal Unicode character",
2770 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2771 (PyObject
**)&v
, &outpos
, &p
))
2778 message
= "malformed \\N character escape";
2779 if (ucnhash_CAPI
== NULL
) {
2780 /* load the unicode data module */
2782 m
= PyImport_ImportModuleNoBlock("unicodedata");
2785 api
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
2789 ucnhash_CAPI
= (_PyUnicode_Name_CAPI
*)PyCObject_AsVoidPtr(api
);
2791 if (ucnhash_CAPI
== NULL
)
2795 const char *start
= s
+1;
2796 /* look for the closing brace */
2797 while (*s
!= '}' && s
< end
)
2799 if (s
> start
&& s
< end
&& *s
== '}') {
2800 /* found a name. look it up in the unicode database */
2801 message
= "unknown Unicode character name";
2803 if (ucnhash_CAPI
->getcode(NULL
, start
, (int)(s
-start
-1), &chr
))
2807 endinpos
= s
-starts
;
2808 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2809 if (unicode_decode_call_errorhandler(
2810 errors
, &errorHandler
,
2811 "unicodeescape", message
,
2812 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2813 (PyObject
**)&v
, &outpos
, &p
))
2819 message
= "\\ at end of string";
2821 endinpos
= s
-starts
;
2822 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2823 if (unicode_decode_call_errorhandler(
2824 errors
, &errorHandler
,
2825 "unicodeescape", message
,
2826 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2827 (PyObject
**)&v
, &outpos
, &p
))
2832 *p
++ = (unsigned char)s
[-1];
2839 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2841 Py_XDECREF(errorHandler
);
2843 return (PyObject
*)v
;
2848 "\\N escapes not supported (can't load unicodedata module)"
2851 Py_XDECREF(errorHandler
);
2857 Py_XDECREF(errorHandler
);
2862 /* Return a Unicode-Escape string version of the Unicode object.
2864 If quotes is true, the string is enclosed in u"" or u'' quotes as
2869 Py_LOCAL_INLINE(const Py_UNICODE
*) findchar(const Py_UNICODE
*s
,
2873 /* like wcschr, but doesn't stop at NUL characters */
2875 while (size
-- > 0) {
2885 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
2892 static const char *hexdigit
= "0123456789abcdef";
2893 #ifdef Py_UNICODE_WIDE
2894 const Py_ssize_t expandsize
= 10;
2896 const Py_ssize_t expandsize
= 6;
2899 /* XXX(nnorwitz): rather than over-allocating, it would be
2900 better to choose a different scheme. Perhaps scan the
2901 first N-chars of the string and allocate based on that size.
2903 /* Initial allocation is based on the longest-possible unichr
2906 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2907 unichr, so in this case it's the longest unichr escape. In
2908 narrow (UTF-16) builds this is five chars per source unichr
2909 since there are two unichrs in the surrogate pair, so in narrow
2910 (UTF-16) builds it's not the longest unichr escape.
2912 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2913 so in the narrow (UTF-16) build case it's the longest unichr
2917 if (size
> (PY_SSIZE_T_MAX
- 2 - 1) / expandsize
)
2918 return PyErr_NoMemory();
2920 repr
= PyString_FromStringAndSize(NULL
,
2927 p
= PyString_AS_STRING(repr
);
2931 *p
++ = (findchar(s
, size
, '\'') &&
2932 !findchar(s
, size
, '"')) ? '"' : '\'';
2934 while (size
-- > 0) {
2935 Py_UNICODE ch
= *s
++;
2937 /* Escape quotes and backslashes */
2939 ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1]) || ch
== '\\') {
2945 #ifdef Py_UNICODE_WIDE
2946 /* Map 21-bit characters to '\U00xxxxxx' */
2947 else if (ch
>= 0x10000) {
2950 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
2951 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
2952 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
2953 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
2954 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
2955 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
2956 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
2957 *p
++ = hexdigit
[ch
& 0x0000000F];
2961 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2962 else if (ch
>= 0xD800 && ch
< 0xDC00) {
2968 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
2969 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
2972 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
2973 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
2974 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
2975 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
2976 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
2977 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
2978 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
2979 *p
++ = hexdigit
[ucs
& 0x0000000F];
2982 /* Fall through: isolated surrogates are copied as-is */
2988 /* Map 16-bit characters to '\uxxxx' */
2992 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
2993 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
2994 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
2995 *p
++ = hexdigit
[ch
& 0x000F];
2998 /* Map special whitespace to '\t', \n', '\r' */
2999 else if (ch
== '\t') {
3003 else if (ch
== '\n') {
3007 else if (ch
== '\r') {
3012 /* Map non-printable US ASCII to '\xhh' */
3013 else if (ch
< ' ' || ch
>= 0x7F) {
3016 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
3017 *p
++ = hexdigit
[ch
& 0x000F];
3020 /* Copy everything else as-is */
3025 *p
++ = PyString_AS_STRING(repr
)[1];
3028 _PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
));
3032 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
3035 return unicodeescape_string(s
, size
, 0);
3038 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
3040 if (!PyUnicode_Check(unicode
)) {
3041 PyErr_BadArgument();
3044 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3045 PyUnicode_GET_SIZE(unicode
));
3048 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3050 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
3054 const char *starts
= s
;
3055 Py_ssize_t startinpos
;
3056 Py_ssize_t endinpos
;
3062 PyObject
*errorHandler
= NULL
;
3063 PyObject
*exc
= NULL
;
3065 /* Escaped strings will always be longer than the resulting
3066 Unicode string, so we start with size here and then reduce the
3067 length after conversion to the true value. (But decoding error
3068 handler might have to resize the string) */
3069 v
= _PyUnicode_New(size
);
3073 return (PyObject
*)v
;
3074 p
= PyUnicode_AS_UNICODE(v
);
3082 /* Non-escape characters are interpreted as Unicode ordinals */
3084 *p
++ = (unsigned char)*s
++;
3087 startinpos
= s
-starts
;
3089 /* \u-escapes are only interpreted iff the number of leading
3090 backslashes is odd */
3095 *p
++ = (unsigned char)*s
++;
3097 if (((s
- bs
) & 1) == 0 ||
3099 (*s
!= 'u' && *s
!= 'U')) {
3103 count
= *s
=='u' ? 4 : 8;
3106 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3107 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3108 for (x
= 0, i
= 0; i
< count
; ++i
, ++s
) {
3109 c
= (unsigned char)*s
;
3111 endinpos
= s
-starts
;
3112 if (unicode_decode_call_errorhandler(
3113 errors
, &errorHandler
,
3114 "rawunicodeescape", "truncated \\uXXXX",
3115 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3116 (PyObject
**)&v
, &outpos
, &p
))
3121 if (c
>= '0' && c
<= '9')
3123 else if (c
>= 'a' && c
<= 'f')
3129 /* UCS-2 character */
3130 *p
++ = (Py_UNICODE
) x
;
3131 else if (x
<= 0x10ffff) {
3132 /* UCS-4 character. Either store directly, or as
3134 #ifdef Py_UNICODE_WIDE
3135 *p
++ = (Py_UNICODE
) x
;
3138 *p
++ = 0xD800 + (Py_UNICODE
) (x
>> 10);
3139 *p
++ = 0xDC00 + (Py_UNICODE
) (x
& 0x03FF);
3142 endinpos
= s
-starts
;
3143 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3144 if (unicode_decode_call_errorhandler(
3145 errors
, &errorHandler
,
3146 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3147 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3148 (PyObject
**)&v
, &outpos
, &p
))
3154 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3156 Py_XDECREF(errorHandler
);
3158 return (PyObject
*)v
;
3162 Py_XDECREF(errorHandler
);
3167 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
3174 static const char *hexdigit
= "0123456789abcdef";
3175 #ifdef Py_UNICODE_WIDE
3176 const Py_ssize_t expandsize
= 10;
3178 const Py_ssize_t expandsize
= 6;
3181 if (size
> PY_SSIZE_T_MAX
/ expandsize
)
3182 return PyErr_NoMemory();
3184 repr
= PyString_FromStringAndSize(NULL
, expandsize
* size
);
3190 p
= q
= PyString_AS_STRING(repr
);
3191 while (size
-- > 0) {
3192 Py_UNICODE ch
= *s
++;
3193 #ifdef Py_UNICODE_WIDE
3194 /* Map 32-bit characters to '\Uxxxxxxxx' */
3195 if (ch
>= 0x10000) {
3198 *p
++ = hexdigit
[(ch
>> 28) & 0xf];
3199 *p
++ = hexdigit
[(ch
>> 24) & 0xf];
3200 *p
++ = hexdigit
[(ch
>> 20) & 0xf];
3201 *p
++ = hexdigit
[(ch
>> 16) & 0xf];
3202 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3203 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3204 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3205 *p
++ = hexdigit
[ch
& 15];
3209 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3210 if (ch
>= 0xD800 && ch
< 0xDC00) {
3216 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
3217 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
3220 *p
++ = hexdigit
[(ucs
>> 28) & 0xf];
3221 *p
++ = hexdigit
[(ucs
>> 24) & 0xf];
3222 *p
++ = hexdigit
[(ucs
>> 20) & 0xf];
3223 *p
++ = hexdigit
[(ucs
>> 16) & 0xf];
3224 *p
++ = hexdigit
[(ucs
>> 12) & 0xf];
3225 *p
++ = hexdigit
[(ucs
>> 8) & 0xf];
3226 *p
++ = hexdigit
[(ucs
>> 4) & 0xf];
3227 *p
++ = hexdigit
[ucs
& 0xf];
3230 /* Fall through: isolated surrogates are copied as-is */
3235 /* Map 16-bit characters to '\uxxxx' */
3239 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3240 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3241 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3242 *p
++ = hexdigit
[ch
& 15];
3244 /* Copy everything else as-is */
3249 _PyString_Resize(&repr
, p
- q
);
3253 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
3255 if (!PyUnicode_Check(unicode
)) {
3256 PyErr_BadArgument();
3259 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3260 PyUnicode_GET_SIZE(unicode
));
3263 /* --- Unicode Internal Codec ------------------------------------------- */
3265 PyObject
*_PyUnicode_DecodeUnicodeInternal(const char *s
,
3269 const char *starts
= s
;
3270 Py_ssize_t startinpos
;
3271 Py_ssize_t endinpos
;
3277 PyObject
*errorHandler
= NULL
;
3278 PyObject
*exc
= NULL
;
3280 #ifdef Py_UNICODE_WIDE
3281 Py_UNICODE unimax
= PyUnicode_GetMax();
3284 /* XXX overflow detection missing */
3285 v
= _PyUnicode_New((size
+Py_UNICODE_SIZE
-1)/ Py_UNICODE_SIZE
);
3288 if (PyUnicode_GetSize((PyObject
*)v
) == 0)
3289 return (PyObject
*)v
;
3290 p
= PyUnicode_AS_UNICODE(v
);
3294 memcpy(p
, s
, sizeof(Py_UNICODE
));
3295 /* We have to sanity check the raw data, otherwise doom looms for
3296 some malformed UCS-4 data. */
3298 #ifdef Py_UNICODE_WIDE
3299 *p
> unimax
|| *p
< 0 ||
3301 end
-s
< Py_UNICODE_SIZE
3304 startinpos
= s
- starts
;
3305 if (end
-s
< Py_UNICODE_SIZE
) {
3306 endinpos
= end
-starts
;
3307 reason
= "truncated input";
3310 endinpos
= s
- starts
+ Py_UNICODE_SIZE
;
3311 reason
= "illegal code point (> 0x10FFFF)";
3313 outpos
= p
- PyUnicode_AS_UNICODE(v
);
3314 if (unicode_decode_call_errorhandler(
3315 errors
, &errorHandler
,
3316 "unicode_internal", reason
,
3317 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3318 (PyObject
**)&v
, &outpos
, &p
)) {
3324 s
+= Py_UNICODE_SIZE
;
3328 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3330 Py_XDECREF(errorHandler
);
3332 return (PyObject
*)v
;
3336 Py_XDECREF(errorHandler
);
3341 /* --- Latin-1 Codec ------------------------------------------------------ */
3343 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
3350 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3352 Py_UNICODE r
= *(unsigned char*)s
;
3353 return PyUnicode_FromUnicode(&r
, 1);
3356 v
= _PyUnicode_New(size
);
3360 return (PyObject
*)v
;
3361 p
= PyUnicode_AS_UNICODE(v
);
3363 *p
++ = (unsigned char)*s
++;
3364 return (PyObject
*)v
;
3371 /* create or adjust a UnicodeEncodeError */
3372 static void make_encode_exception(PyObject
**exceptionObject
,
3373 const char *encoding
,
3374 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3375 Py_ssize_t startpos
, Py_ssize_t endpos
,
3378 if (*exceptionObject
== NULL
) {
3379 *exceptionObject
= PyUnicodeEncodeError_Create(
3380 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3383 if (PyUnicodeEncodeError_SetStart(*exceptionObject
, startpos
))
3385 if (PyUnicodeEncodeError_SetEnd(*exceptionObject
, endpos
))
3387 if (PyUnicodeEncodeError_SetReason(*exceptionObject
, reason
))
3391 Py_DECREF(*exceptionObject
);
3392 *exceptionObject
= NULL
;
3396 /* raises a UnicodeEncodeError */
3397 static void raise_encode_exception(PyObject
**exceptionObject
,
3398 const char *encoding
,
3399 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3400 Py_ssize_t startpos
, Py_ssize_t endpos
,
3403 make_encode_exception(exceptionObject
,
3404 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3405 if (*exceptionObject
!= NULL
)
3406 PyCodec_StrictErrors(*exceptionObject
);
3409 /* error handling callback helper:
3410 build arguments, call the callback and check the arguments,
3411 put the result into newpos and return the replacement string, which
3412 has to be freed by the caller */
3413 static PyObject
*unicode_encode_call_errorhandler(const char *errors
,
3414 PyObject
**errorHandler
,
3415 const char *encoding
, const char *reason
,
3416 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
3417 Py_ssize_t startpos
, Py_ssize_t endpos
,
3420 static char *argparse
= "O!n;encoding error handler must return (unicode, int) tuple";
3423 PyObject
*resunicode
;
3425 if (*errorHandler
== NULL
) {
3426 *errorHandler
= PyCodec_LookupError(errors
);
3427 if (*errorHandler
== NULL
)
3431 make_encode_exception(exceptionObject
,
3432 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3433 if (*exceptionObject
== NULL
)
3436 restuple
= PyObject_CallFunctionObjArgs(
3437 *errorHandler
, *exceptionObject
, NULL
);
3438 if (restuple
== NULL
)
3440 if (!PyTuple_Check(restuple
)) {
3441 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
3442 Py_DECREF(restuple
);
3445 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
3446 &resunicode
, newpos
)) {
3447 Py_DECREF(restuple
);
3451 *newpos
= size
+*newpos
;
3452 if (*newpos
<0 || *newpos
>size
) {
3453 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
3454 Py_DECREF(restuple
);
3457 Py_INCREF(resunicode
);
3458 Py_DECREF(restuple
);
3462 static PyObject
*unicode_encode_ucs1(const Py_UNICODE
*p
,
3469 /* pointers to the beginning and end+1 of input */
3470 const Py_UNICODE
*startp
= p
;
3471 const Py_UNICODE
*endp
= p
+ size
;
3472 /* pointer to the beginning of the unencodable characters */
3473 /* const Py_UNICODE *badp = NULL; */
3474 /* pointer into the output */
3476 /* current output position */
3477 Py_ssize_t respos
= 0;
3479 const char *encoding
= (limit
== 256) ? "latin-1" : "ascii";
3480 const char *reason
= (limit
== 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3481 PyObject
*errorHandler
= NULL
;
3482 PyObject
*exc
= NULL
;
3483 /* the following variable is used for caching string comparisons
3484 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3485 int known_errorHandler
= -1;
3487 /* allocate enough for a simple encoding without
3488 replacements, if we need more, we'll resize */
3489 res
= PyString_FromStringAndSize(NULL
, size
);
3494 str
= PyString_AS_STRING(res
);
3500 /* can we encode this? */
3502 /* no overflow check, because we know that the space is enough */
3507 Py_ssize_t unicodepos
= p
-startp
;
3508 Py_ssize_t requiredsize
;
3509 PyObject
*repunicode
;
3514 /* startpos for collecting unencodable chars */
3515 const Py_UNICODE
*collstart
= p
;
3516 const Py_UNICODE
*collend
= p
;
3517 /* find all unencodable characters */
3518 while ((collend
< endp
) && ((*collend
)>=limit
))
3520 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3521 if (known_errorHandler
==-1) {
3522 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3523 known_errorHandler
= 1;
3524 else if (!strcmp(errors
, "replace"))
3525 known_errorHandler
= 2;
3526 else if (!strcmp(errors
, "ignore"))
3527 known_errorHandler
= 3;
3528 else if (!strcmp(errors
, "xmlcharrefreplace"))
3529 known_errorHandler
= 4;
3531 known_errorHandler
= 0;
3533 switch (known_errorHandler
) {
3534 case 1: /* strict */
3535 raise_encode_exception(&exc
, encoding
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
3537 case 2: /* replace */
3538 while (collstart
++<collend
)
3539 *str
++ = '?'; /* fall through */
3540 case 3: /* ignore */
3543 case 4: /* xmlcharrefreplace */
3544 respos
= str
-PyString_AS_STRING(res
);
3545 /* determine replacement size (temporarily (mis)uses p) */
3546 for (p
= collstart
, repsize
= 0; p
< collend
; ++p
) {
3555 #ifndef Py_UNICODE_WIDE
3561 else if (*p
<1000000)
3567 requiredsize
= respos
+repsize
+(endp
-collend
);
3568 if (requiredsize
> ressize
) {
3569 if (requiredsize
<2*ressize
)
3570 requiredsize
= 2*ressize
;
3571 if (_PyString_Resize(&res
, requiredsize
))
3573 str
= PyString_AS_STRING(res
) + respos
;
3574 ressize
= requiredsize
;
3576 /* generate replacement (temporarily (mis)uses p) */
3577 for (p
= collstart
; p
< collend
; ++p
) {
3578 str
+= sprintf(str
, "&#%d;", (int)*p
);
3583 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
3584 encoding
, reason
, startp
, size
, &exc
,
3585 collstart
-startp
, collend
-startp
, &newpos
);
3586 if (repunicode
== NULL
)
3588 /* need more space? (at least enough for what we
3589 have+the replacement+the rest of the string, so
3590 we won't have to check space for encodable characters) */
3591 respos
= str
-PyString_AS_STRING(res
);
3592 repsize
= PyUnicode_GET_SIZE(repunicode
);
3593 requiredsize
= respos
+repsize
+(endp
-collend
);
3594 if (requiredsize
> ressize
) {
3595 if (requiredsize
<2*ressize
)
3596 requiredsize
= 2*ressize
;
3597 if (_PyString_Resize(&res
, requiredsize
)) {
3598 Py_DECREF(repunicode
);
3601 str
= PyString_AS_STRING(res
) + respos
;
3602 ressize
= requiredsize
;
3604 /* check if there is anything unencodable in the replacement
3605 and copy it to the output */
3606 for (uni2
= PyUnicode_AS_UNICODE(repunicode
);repsize
-->0; ++uni2
, ++str
) {
3609 raise_encode_exception(&exc
, encoding
, startp
, size
,
3610 unicodepos
, unicodepos
+1, reason
);
3611 Py_DECREF(repunicode
);
3616 p
= startp
+ newpos
;
3617 Py_DECREF(repunicode
);
3621 /* Resize if we allocated too much */
3622 respos
= str
-PyString_AS_STRING(res
);
3624 /* If this fails, res will be NULL */
3625 _PyString_Resize(&res
, respos
);
3626 Py_XDECREF(errorHandler
);
3632 Py_XDECREF(errorHandler
);
3637 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
3641 return unicode_encode_ucs1(p
, size
, errors
, 256);
3644 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
3646 if (!PyUnicode_Check(unicode
)) {
3647 PyErr_BadArgument();
3650 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
3651 PyUnicode_GET_SIZE(unicode
),
3655 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3657 PyObject
*PyUnicode_DecodeASCII(const char *s
,
3661 const char *starts
= s
;
3664 Py_ssize_t startinpos
;
3665 Py_ssize_t endinpos
;
3668 PyObject
*errorHandler
= NULL
;
3669 PyObject
*exc
= NULL
;
3671 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3672 if (size
== 1 && *(unsigned char*)s
< 128) {
3673 Py_UNICODE r
= *(unsigned char*)s
;
3674 return PyUnicode_FromUnicode(&r
, 1);
3677 v
= _PyUnicode_New(size
);
3681 return (PyObject
*)v
;
3682 p
= PyUnicode_AS_UNICODE(v
);
3685 register unsigned char c
= (unsigned char)*s
;
3691 startinpos
= s
-starts
;
3692 endinpos
= startinpos
+ 1;
3693 outpos
= p
- (Py_UNICODE
*)PyUnicode_AS_UNICODE(v
);
3694 if (unicode_decode_call_errorhandler(
3695 errors
, &errorHandler
,
3696 "ascii", "ordinal not in range(128)",
3697 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3698 (PyObject
**)&v
, &outpos
, &p
))
3702 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
3703 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3705 Py_XDECREF(errorHandler
);
3707 return (PyObject
*)v
;
3711 Py_XDECREF(errorHandler
);
3716 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
3720 return unicode_encode_ucs1(p
, size
, errors
, 128);
3723 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
3725 if (!PyUnicode_Check(unicode
)) {
3726 PyErr_BadArgument();
3729 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
3730 PyUnicode_GET_SIZE(unicode
),
3734 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3736 /* --- MBCS codecs for Windows -------------------------------------------- */
3738 #if SIZEOF_INT < SIZEOF_SSIZE_T
3742 /* XXX This code is limited to "true" double-byte encodings, as
3743 a) it assumes an incomplete character consists of a single byte, and
3744 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3745 encodings, see IsDBCSLeadByteEx documentation. */
3747 static int is_dbcs_lead_byte(const char *s
, int offset
)
3749 const char *curr
= s
+ offset
;
3751 if (IsDBCSLeadByte(*curr
)) {
3752 const char *prev
= CharPrev(s
, curr
);
3753 return (prev
== curr
) || !IsDBCSLeadByte(*prev
) || (curr
- prev
== 2);
3759 * Decode MBCS string into unicode object. If 'final' is set, converts
3760 * trailing lead-byte too. Returns the consumed size on success, -1 otherwise.
3762 static int decode_mbcs(PyUnicodeObject
**v
,
3763 const char *s
, /* MBCS string */
3764 int size
, /* sizeof MBCS string */
3773 /* Skip trailing lead-byte unless 'final' is set */
3774 if (!final
&& size
>= 1 && is_dbcs_lead_byte(s
, size
- 1))
3777 /* First get the size of the result */
3779 usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
3781 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3787 /* Create unicode object */
3788 *v
= _PyUnicode_New(usize
);
3793 /* Extend unicode object */
3794 n
= PyUnicode_GET_SIZE(*v
);
3795 if (_PyUnicode_Resize(v
, n
+ usize
) < 0)
3799 /* Do the conversion */
3801 p
= PyUnicode_AS_UNICODE(*v
) + n
;
3802 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
3803 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3811 PyObject
*PyUnicode_DecodeMBCSStateful(const char *s
,
3814 Py_ssize_t
*consumed
)
3816 PyUnicodeObject
*v
= NULL
;
3825 done
= decode_mbcs(&v
, s
, INT_MAX
, 0);
3828 done
= decode_mbcs(&v
, s
, (int)size
, !consumed
);
3839 if (size
> INT_MAX
) {
3846 return (PyObject
*)v
;
3849 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
3853 return PyUnicode_DecodeMBCSStateful(s
, size
, errors
, NULL
);
3857 * Convert unicode into string object (MBCS).
3858 * Returns 0 on success, -1 otherwise.
3860 static int encode_mbcs(PyObject
**repr
,
3861 const Py_UNICODE
*p
, /* unicode */
3862 int size
) /* size of unicode */
3869 /* First get the size of the result */
3871 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
3872 if (mbcssize
== 0) {
3873 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3878 if (*repr
== NULL
) {
3879 /* Create string object */
3880 *repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
3885 /* Extend string object */
3886 n
= PyString_Size(*repr
);
3887 if (_PyString_Resize(repr
, n
+ mbcssize
) < 0)
3891 /* Do the conversion */
3893 char *s
= PyString_AS_STRING(*repr
) + n
;
3894 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
3895 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3903 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
3907 PyObject
*repr
= NULL
;
3913 ret
= encode_mbcs(&repr
, p
, INT_MAX
);
3916 ret
= encode_mbcs(&repr
, p
, (int)size
);
3924 if (size
> INT_MAX
) {
3934 PyObject
*PyUnicode_AsMBCSString(PyObject
*unicode
)
3936 if (!PyUnicode_Check(unicode
)) {
3937 PyErr_BadArgument();
3940 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode
),
3941 PyUnicode_GET_SIZE(unicode
),
3947 #endif /* MS_WINDOWS */
3949 /* --- Character Mapping Codec -------------------------------------------- */
3951 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
3956 const char *starts
= s
;
3957 Py_ssize_t startinpos
;
3958 Py_ssize_t endinpos
;
3963 Py_ssize_t extrachars
= 0;
3964 PyObject
*errorHandler
= NULL
;
3965 PyObject
*exc
= NULL
;
3966 Py_UNICODE
*mapstring
= NULL
;
3967 Py_ssize_t maplen
= 0;
3969 /* Default to Latin-1 */
3970 if (mapping
== NULL
)
3971 return PyUnicode_DecodeLatin1(s
, size
, errors
);
3973 v
= _PyUnicode_New(size
);
3977 return (PyObject
*)v
;
3978 p
= PyUnicode_AS_UNICODE(v
);
3980 if (PyUnicode_CheckExact(mapping
)) {
3981 mapstring
= PyUnicode_AS_UNICODE(mapping
);
3982 maplen
= PyUnicode_GET_SIZE(mapping
);
3984 unsigned char ch
= *s
;
3985 Py_UNICODE x
= 0xfffe; /* illegal value */
3991 /* undefined mapping */
3992 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3993 startinpos
= s
-starts
;
3994 endinpos
= startinpos
+1;
3995 if (unicode_decode_call_errorhandler(
3996 errors
, &errorHandler
,
3997 "charmap", "character maps to <undefined>",
3998 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3999 (PyObject
**)&v
, &outpos
, &p
)) {
4010 unsigned char ch
= *s
;
4013 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4014 w
= PyInt_FromLong((long)ch
);
4017 x
= PyObject_GetItem(mapping
, w
);
4020 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4021 /* No mapping found means: mapping is undefined. */
4030 if (PyInt_Check(x
)) {
4031 long value
= PyInt_AS_LONG(x
);
4032 if (value
< 0 || value
> 65535) {
4033 PyErr_SetString(PyExc_TypeError
,
4034 "character mapping must be in range(65536)");
4038 *p
++ = (Py_UNICODE
)value
;
4040 else if (x
== Py_None
) {
4041 /* undefined mapping */
4042 outpos
= p
-PyUnicode_AS_UNICODE(v
);
4043 startinpos
= s
-starts
;
4044 endinpos
= startinpos
+1;
4045 if (unicode_decode_call_errorhandler(
4046 errors
, &errorHandler
,
4047 "charmap", "character maps to <undefined>",
4048 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
4049 (PyObject
**)&v
, &outpos
, &p
)) {
4056 else if (PyUnicode_Check(x
)) {
4057 Py_ssize_t targetsize
= PyUnicode_GET_SIZE(x
);
4059 if (targetsize
== 1)
4061 *p
++ = *PyUnicode_AS_UNICODE(x
);
4063 else if (targetsize
> 1) {
4065 if (targetsize
> extrachars
) {
4067 Py_ssize_t oldpos
= p
- PyUnicode_AS_UNICODE(v
);
4068 Py_ssize_t needed
= (targetsize
- extrachars
) + \
4070 extrachars
+= needed
;
4071 /* XXX overflow detection missing */
4072 if (_PyUnicode_Resize(&v
,
4073 PyUnicode_GET_SIZE(v
) + needed
) < 0) {
4077 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
4080 PyUnicode_AS_UNICODE(x
),
4083 extrachars
-= targetsize
;
4085 /* 1-0 mapping: skip the character */
4088 /* wrong return value */
4089 PyErr_SetString(PyExc_TypeError
,
4090 "character mapping must return integer, None or unicode");
4098 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
4099 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
4101 Py_XDECREF(errorHandler
);
4103 return (PyObject
*)v
;
4106 Py_XDECREF(errorHandler
);
4112 /* Charmap encoding: the lookup table */
4114 struct encoding_map
{
4116 unsigned char level1
[32];
4118 unsigned char level23
[1];
4122 encoding_map_size(PyObject
*obj
, PyObject
* args
)
4124 struct encoding_map
*map
= (struct encoding_map
*)obj
;
4125 return PyInt_FromLong(sizeof(*map
) - 1 + 16*map
->count2
+
4129 static PyMethodDef encoding_map_methods
[] = {
4130 {"size", encoding_map_size
, METH_NOARGS
,
4131 PyDoc_STR("Return the size (in bytes) of this object") },
4136 encoding_map_dealloc(PyObject
* o
)
4141 static PyTypeObject EncodingMapType
= {
4142 PyVarObject_HEAD_INIT(NULL
, 0)
4143 "EncodingMap", /*tp_name*/
4144 sizeof(struct encoding_map
), /*tp_basicsize*/
4147 encoding_map_dealloc
, /*tp_dealloc*/
4154 0, /*tp_as_sequence*/
4155 0, /*tp_as_mapping*/
4162 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
4166 0, /*tp_richcompare*/
4167 0, /*tp_weaklistoffset*/
4170 encoding_map_methods
, /*tp_methods*/
4177 0, /*tp_dictoffset*/
4186 PyUnicode_BuildEncodingMap(PyObject
* string
)
4190 struct encoding_map
*mresult
;
4193 unsigned char level1
[32];
4194 unsigned char level2
[512];
4195 unsigned char *mlevel1
, *mlevel2
, *mlevel3
;
4196 int count2
= 0, count3
= 0;
4198 if (!PyUnicode_Check(string
) || PyUnicode_GetSize(string
) != 256) {
4199 PyErr_BadArgument();
4202 decode
= PyUnicode_AS_UNICODE(string
);
4203 memset(level1
, 0xFF, sizeof level1
);
4204 memset(level2
, 0xFF, sizeof level2
);
4206 /* If there isn't a one-to-one mapping of NULL to \0,
4207 or if there are non-BMP characters, we need to use
4208 a mapping dictionary. */
4211 for (i
= 1; i
< 256; i
++) {
4214 #ifdef Py_UNICODE_WIDE
4215 || decode
[i
] > 0xFFFF
4221 if (decode
[i
] == 0xFFFE)
4222 /* unmapped character */
4224 l1
= decode
[i
] >> 11;
4225 l2
= decode
[i
] >> 7;
4226 if (level1
[l1
] == 0xFF)
4227 level1
[l1
] = count2
++;
4228 if (level2
[l2
] == 0xFF)
4229 level2
[l2
] = count3
++;
4232 if (count2
>= 0xFF || count3
>= 0xFF)
4236 PyObject
*result
= PyDict_New();
4237 PyObject
*key
, *value
;
4240 for (i
= 0; i
< 256; i
++) {
4242 key
= PyInt_FromLong(decode
[i
]);
4243 value
= PyInt_FromLong(i
);
4246 if (PyDict_SetItem(result
, key
, value
) == -1)
4259 /* Create a three-level trie */
4260 result
= PyObject_MALLOC(sizeof(struct encoding_map
) +
4261 16*count2
+ 128*count3
- 1);
4263 return PyErr_NoMemory();
4264 PyObject_Init(result
, &EncodingMapType
);
4265 mresult
= (struct encoding_map
*)result
;
4266 mresult
->count2
= count2
;
4267 mresult
->count3
= count3
;
4268 mlevel1
= mresult
->level1
;
4269 mlevel2
= mresult
->level23
;
4270 mlevel3
= mresult
->level23
+ 16*count2
;
4271 memcpy(mlevel1
, level1
, 32);
4272 memset(mlevel2
, 0xFF, 16*count2
);
4273 memset(mlevel3
, 0, 128*count3
);
4275 for (i
= 1; i
< 256; i
++) {
4276 int o1
, o2
, o3
, i2
, i3
;
4277 if (decode
[i
] == 0xFFFE)
4278 /* unmapped character */
4281 o2
= (decode
[i
]>>7) & 0xF;
4282 i2
= 16*mlevel1
[o1
] + o2
;
4283 if (mlevel2
[i2
] == 0xFF)
4284 mlevel2
[i2
] = count3
++;
4285 o3
= decode
[i
] & 0x7F;
4286 i3
= 128*mlevel2
[i2
] + o3
;
4293 encoding_map_lookup(Py_UNICODE c
, PyObject
*mapping
)
4295 struct encoding_map
*map
= (struct encoding_map
*)mapping
;
4297 int l2
= (c
>>7) & 0xF;
4301 #ifdef Py_UNICODE_WIDE
4309 i
= map
->level1
[l1
];
4314 i
= map
->level23
[16*i
+l2
];
4319 i
= map
->level23
[16*map
->count2
+ 128*i
+ l3
];
4326 /* Lookup the character ch in the mapping. If the character
4327 can't be found, Py_None is returned (or NULL, if another
4329 static PyObject
*charmapencode_lookup(Py_UNICODE c
, PyObject
*mapping
)
4331 PyObject
*w
= PyInt_FromLong((long)c
);
4336 x
= PyObject_GetItem(mapping
, w
);
4339 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4340 /* No mapping found means: mapping is undefined. */
4348 else if (x
== Py_None
)
4350 else if (PyInt_Check(x
)) {
4351 long value
= PyInt_AS_LONG(x
);
4352 if (value
< 0 || value
> 255) {
4353 PyErr_SetString(PyExc_TypeError
,
4354 "character mapping must be in range(256)");
4360 else if (PyString_Check(x
))
4363 /* wrong return value */
4364 PyErr_SetString(PyExc_TypeError
,
4365 "character mapping must return integer, None or str");
4372 charmapencode_resize(PyObject
**outobj
, Py_ssize_t
*outpos
, Py_ssize_t requiredsize
)
4374 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4375 /* exponentially overallocate to minimize reallocations */
4376 if (requiredsize
< 2*outsize
)
4377 requiredsize
= 2*outsize
;
4378 if (_PyString_Resize(outobj
, requiredsize
)) {
4384 typedef enum charmapencode_result
{
4385 enc_SUCCESS
, enc_FAILED
, enc_EXCEPTION
4386 }charmapencode_result
;
4387 /* lookup the character, put the result in the output string and adjust
4388 various state variables. Reallocate the output string if not enough
4389 space is available. Return enc_SUCCESS if the character was written
4390 to the output buffer, enc_FAILED if the mapping was undefined
4391 (in which case no character was written) or enc_EXCEPTION if a
4392 lookup or reallocation error occurred. */
4394 charmapencode_result
charmapencode_output(Py_UNICODE c
, PyObject
*mapping
,
4395 PyObject
**outobj
, Py_ssize_t
*outpos
)
4399 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4401 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4402 int res
= encoding_map_lookup(c
, mapping
);
4403 Py_ssize_t requiredsize
= *outpos
+1;
4406 if (outsize
<requiredsize
)
4407 if (!charmapencode_resize(outobj
, outpos
, requiredsize
))
4408 return enc_EXCEPTION
;
4409 outstart
= PyString_AS_STRING(*outobj
);
4410 outstart
[(*outpos
)++] = (char)res
;
4414 rep
= charmapencode_lookup(c
, mapping
);
4416 return enc_EXCEPTION
;
4417 else if (rep
==Py_None
) {
4421 if (PyInt_Check(rep
)) {
4422 Py_ssize_t requiredsize
= *outpos
+1;
4423 if (outsize
<requiredsize
)
4424 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4426 return enc_EXCEPTION
;
4428 outstart
= PyString_AS_STRING(*outobj
);
4429 outstart
[(*outpos
)++] = (char)PyInt_AS_LONG(rep
);
4432 const char *repchars
= PyString_AS_STRING(rep
);
4433 Py_ssize_t repsize
= PyString_GET_SIZE(rep
);
4434 Py_ssize_t requiredsize
= *outpos
+repsize
;
4435 if (outsize
<requiredsize
)
4436 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4438 return enc_EXCEPTION
;
4440 outstart
= PyString_AS_STRING(*outobj
);
4441 memcpy(outstart
+ *outpos
, repchars
, repsize
);
4449 /* handle an error in PyUnicode_EncodeCharmap
4450 Return 0 on success, -1 on error */
4452 int charmap_encoding_error(
4453 const Py_UNICODE
*p
, Py_ssize_t size
, Py_ssize_t
*inpos
, PyObject
*mapping
,
4454 PyObject
**exceptionObject
,
4455 int *known_errorHandler
, PyObject
**errorHandler
, const char *errors
,
4456 PyObject
**res
, Py_ssize_t
*respos
)
4458 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
4462 /* startpos for collecting unencodable chars */
4463 Py_ssize_t collstartpos
= *inpos
;
4464 Py_ssize_t collendpos
= *inpos
+1;
4466 char *encoding
= "charmap";
4467 char *reason
= "character maps to <undefined>";
4468 charmapencode_result x
;
4470 /* find all unencodable characters */
4471 while (collendpos
< size
) {
4473 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4474 int res
= encoding_map_lookup(p
[collendpos
], mapping
);
4481 rep
= charmapencode_lookup(p
[collendpos
], mapping
);
4484 else if (rep
!=Py_None
) {
4491 /* cache callback name lookup
4492 * (if not done yet, i.e. it's the first error) */
4493 if (*known_errorHandler
==-1) {
4494 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4495 *known_errorHandler
= 1;
4496 else if (!strcmp(errors
, "replace"))
4497 *known_errorHandler
= 2;
4498 else if (!strcmp(errors
, "ignore"))
4499 *known_errorHandler
= 3;
4500 else if (!strcmp(errors
, "xmlcharrefreplace"))
4501 *known_errorHandler
= 4;
4503 *known_errorHandler
= 0;
4505 switch (*known_errorHandler
) {
4506 case 1: /* strict */
4507 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4509 case 2: /* replace */
4510 for (collpos
= collstartpos
; collpos
<collendpos
; ++collpos
) {
4511 x
= charmapencode_output('?', mapping
, res
, respos
);
4512 if (x
==enc_EXCEPTION
) {
4515 else if (x
==enc_FAILED
) {
4516 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4521 case 3: /* ignore */
4522 *inpos
= collendpos
;
4524 case 4: /* xmlcharrefreplace */
4525 /* generate replacement */
4526 for (collpos
= collstartpos
; collpos
< collendpos
; ++collpos
) {
4527 char buffer
[2+29+1+1];
4529 sprintf(buffer
, "&#%d;", (int)p
[collpos
]);
4530 for (cp
= buffer
; *cp
; ++cp
) {
4531 x
= charmapencode_output(*cp
, mapping
, res
, respos
);
4532 if (x
==enc_EXCEPTION
)
4534 else if (x
==enc_FAILED
) {
4535 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4540 *inpos
= collendpos
;
4543 repunicode
= unicode_encode_call_errorhandler(errors
, errorHandler
,
4544 encoding
, reason
, p
, size
, exceptionObject
,
4545 collstartpos
, collendpos
, &newpos
);
4546 if (repunicode
== NULL
)
4548 /* generate replacement */
4549 repsize
= PyUnicode_GET_SIZE(repunicode
);
4550 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
4551 x
= charmapencode_output(*uni2
, mapping
, res
, respos
);
4552 if (x
==enc_EXCEPTION
) {
4555 else if (x
==enc_FAILED
) {
4556 Py_DECREF(repunicode
);
4557 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4562 Py_DECREF(repunicode
);
4567 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
4573 PyObject
*res
= NULL
;
4574 /* current input position */
4575 Py_ssize_t inpos
= 0;
4576 /* current output position */
4577 Py_ssize_t respos
= 0;
4578 PyObject
*errorHandler
= NULL
;
4579 PyObject
*exc
= NULL
;
4580 /* the following variable is used for caching string comparisons
4581 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4582 * 3=ignore, 4=xmlcharrefreplace */
4583 int known_errorHandler
= -1;
4585 /* Default to Latin-1 */
4586 if (mapping
== NULL
)
4587 return PyUnicode_EncodeLatin1(p
, size
, errors
);
4589 /* allocate enough for a simple encoding without
4590 replacements, if we need more, we'll resize */
4591 res
= PyString_FromStringAndSize(NULL
, size
);
4597 while (inpos
<size
) {
4598 /* try to encode it */
4599 charmapencode_result x
= charmapencode_output(p
[inpos
], mapping
, &res
, &respos
);
4600 if (x
==enc_EXCEPTION
) /* error */
4602 if (x
==enc_FAILED
) { /* unencodable character */
4603 if (charmap_encoding_error(p
, size
, &inpos
, mapping
,
4605 &known_errorHandler
, &errorHandler
, errors
,
4611 /* done with this character => adjust input position */
4615 /* Resize if we allocated too much */
4616 if (respos
<PyString_GET_SIZE(res
)) {
4617 if (_PyString_Resize(&res
, respos
))
4621 Py_XDECREF(errorHandler
);
4627 Py_XDECREF(errorHandler
);
4631 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
4634 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
4635 PyErr_BadArgument();
4638 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
4639 PyUnicode_GET_SIZE(unicode
),
4644 /* create or adjust a UnicodeTranslateError */
4645 static void make_translate_exception(PyObject
**exceptionObject
,
4646 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4647 Py_ssize_t startpos
, Py_ssize_t endpos
,
4650 if (*exceptionObject
== NULL
) {
4651 *exceptionObject
= PyUnicodeTranslateError_Create(
4652 unicode
, size
, startpos
, endpos
, reason
);
4655 if (PyUnicodeTranslateError_SetStart(*exceptionObject
, startpos
))
4657 if (PyUnicodeTranslateError_SetEnd(*exceptionObject
, endpos
))
4659 if (PyUnicodeTranslateError_SetReason(*exceptionObject
, reason
))
4663 Py_DECREF(*exceptionObject
);
4664 *exceptionObject
= NULL
;
4668 /* raises a UnicodeTranslateError */
4669 static void raise_translate_exception(PyObject
**exceptionObject
,
4670 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4671 Py_ssize_t startpos
, Py_ssize_t endpos
,
4674 make_translate_exception(exceptionObject
,
4675 unicode
, size
, startpos
, endpos
, reason
);
4676 if (*exceptionObject
!= NULL
)
4677 PyCodec_StrictErrors(*exceptionObject
);
4680 /* error handling callback helper:
4681 build arguments, call the callback and check the arguments,
4682 put the result into newpos and return the replacement string, which
4683 has to be freed by the caller */
4684 static PyObject
*unicode_translate_call_errorhandler(const char *errors
,
4685 PyObject
**errorHandler
,
4687 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
4688 Py_ssize_t startpos
, Py_ssize_t endpos
,
4691 static char *argparse
= "O!n;translating error handler must return (unicode, int) tuple";
4693 Py_ssize_t i_newpos
;
4695 PyObject
*resunicode
;
4697 if (*errorHandler
== NULL
) {
4698 *errorHandler
= PyCodec_LookupError(errors
);
4699 if (*errorHandler
== NULL
)
4703 make_translate_exception(exceptionObject
,
4704 unicode
, size
, startpos
, endpos
, reason
);
4705 if (*exceptionObject
== NULL
)
4708 restuple
= PyObject_CallFunctionObjArgs(
4709 *errorHandler
, *exceptionObject
, NULL
);
4710 if (restuple
== NULL
)
4712 if (!PyTuple_Check(restuple
)) {
4713 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
4714 Py_DECREF(restuple
);
4717 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
4718 &resunicode
, &i_newpos
)) {
4719 Py_DECREF(restuple
);
4723 *newpos
= size
+i_newpos
;
4726 if (*newpos
<0 || *newpos
>size
) {
4727 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
4728 Py_DECREF(restuple
);
4731 Py_INCREF(resunicode
);
4732 Py_DECREF(restuple
);
4736 /* Look up the character c in the mapping and put the result in *result,
4737 which must be decrefed by the caller.
4738 Return 0 on success, -1 on error */
4740 int charmaptranslate_lookup(Py_UNICODE c
, PyObject
*mapping
, PyObject
**result
)
4742 PyObject
*w
= PyInt_FromLong((long)c
);
4747 x
= PyObject_GetItem(mapping
, w
);
4750 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4751 /* No mapping found means: use 1:1 mapping. */
4758 else if (x
== Py_None
) {
4762 else if (PyInt_Check(x
)) {
4763 long value
= PyInt_AS_LONG(x
);
4764 long max
= PyUnicode_GetMax();
4765 if (value
< 0 || value
> max
) {
4766 PyErr_Format(PyExc_TypeError
,
4767 "character mapping must be in range(0x%lx)", max
+1);
4774 else if (PyUnicode_Check(x
)) {
4779 /* wrong return value */
4780 PyErr_SetString(PyExc_TypeError
,
4781 "character mapping must return integer, None or unicode");
4786 /* ensure that *outobj is at least requiredsize characters long,
4787 if not reallocate and adjust various state variables.
4788 Return 0 on success, -1 on error */
4790 int charmaptranslate_makespace(PyObject
**outobj
, Py_UNICODE
**outp
,
4791 Py_ssize_t requiredsize
)
4793 Py_ssize_t oldsize
= PyUnicode_GET_SIZE(*outobj
);
4794 if (requiredsize
> oldsize
) {
4795 /* remember old output position */
4796 Py_ssize_t outpos
= *outp
-PyUnicode_AS_UNICODE(*outobj
);
4797 /* exponentially overallocate to minimize reallocations */
4798 if (requiredsize
< 2 * oldsize
)
4799 requiredsize
= 2 * oldsize
;
4800 if (_PyUnicode_Resize(outobj
, requiredsize
) < 0)
4802 *outp
= PyUnicode_AS_UNICODE(*outobj
) + outpos
;
4806 /* Look up the character, put the result in the output string and adjust
4807 various state variables. Return a new reference to the object that
4808 was put in the output buffer in *res, or Py_None, if the mapping was
4809 undefined (in which case no character was written).
4810 The caller must decref *res.
4811 Return 0 on success, -1 on error. */
4813 int charmaptranslate_output(const Py_UNICODE
*startinp
, const Py_UNICODE
*curinp
,
4814 Py_ssize_t insize
, PyObject
*mapping
, PyObject
**outobj
, Py_UNICODE
**outp
,
4817 if (charmaptranslate_lookup(*curinp
, mapping
, res
))
4820 /* not found => default to 1:1 mapping */
4821 *(*outp
)++ = *curinp
;
4823 else if (*res
==Py_None
)
4825 else if (PyInt_Check(*res
)) {
4826 /* no overflow check, because we know that the space is enough */
4827 *(*outp
)++ = (Py_UNICODE
)PyInt_AS_LONG(*res
);
4829 else if (PyUnicode_Check(*res
)) {
4830 Py_ssize_t repsize
= PyUnicode_GET_SIZE(*res
);
4832 /* no overflow check, because we know that the space is enough */
4833 *(*outp
)++ = *PyUnicode_AS_UNICODE(*res
);
4835 else if (repsize
!=0) {
4836 /* more than one character */
4837 Py_ssize_t requiredsize
= (*outp
-PyUnicode_AS_UNICODE(*outobj
)) +
4838 (insize
- (curinp
-startinp
)) +
4840 if (charmaptranslate_makespace(outobj
, outp
, requiredsize
))
4842 memcpy(*outp
, PyUnicode_AS_UNICODE(*res
), sizeof(Py_UNICODE
)*repsize
);
4851 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*p
,
4857 PyObject
*res
= NULL
;
4858 /* pointers to the beginning and end+1 of input */
4859 const Py_UNICODE
*startp
= p
;
4860 const Py_UNICODE
*endp
= p
+ size
;
4861 /* pointer into the output */
4863 /* current output position */
4864 Py_ssize_t respos
= 0;
4865 char *reason
= "character maps to <undefined>";
4866 PyObject
*errorHandler
= NULL
;
4867 PyObject
*exc
= NULL
;
4868 /* the following variable is used for caching string comparisons
4869 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4870 * 3=ignore, 4=xmlcharrefreplace */
4871 int known_errorHandler
= -1;
4873 if (mapping
== NULL
) {
4874 PyErr_BadArgument();
4878 /* allocate enough for a simple 1:1 translation without
4879 replacements, if we need more, we'll resize */
4880 res
= PyUnicode_FromUnicode(NULL
, size
);
4885 str
= PyUnicode_AS_UNICODE(res
);
4888 /* try to encode it */
4890 if (charmaptranslate_output(startp
, p
, size
, mapping
, &res
, &str
, &x
)) {
4895 if (x
!=Py_None
) /* it worked => adjust input pointer */
4897 else { /* untranslatable character */
4898 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
4902 /* startpos for collecting untranslatable chars */
4903 const Py_UNICODE
*collstart
= p
;
4904 const Py_UNICODE
*collend
= p
+1;
4905 const Py_UNICODE
*coll
;
4907 /* find all untranslatable characters */
4908 while (collend
< endp
) {
4909 if (charmaptranslate_lookup(*collend
, mapping
, &x
))
4916 /* cache callback name lookup
4917 * (if not done yet, i.e. it's the first error) */
4918 if (known_errorHandler
==-1) {
4919 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4920 known_errorHandler
= 1;
4921 else if (!strcmp(errors
, "replace"))
4922 known_errorHandler
= 2;
4923 else if (!strcmp(errors
, "ignore"))
4924 known_errorHandler
= 3;
4925 else if (!strcmp(errors
, "xmlcharrefreplace"))
4926 known_errorHandler
= 4;
4928 known_errorHandler
= 0;
4930 switch (known_errorHandler
) {
4931 case 1: /* strict */
4932 raise_translate_exception(&exc
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
4934 case 2: /* replace */
4935 /* No need to check for space, this is a 1:1 replacement */
4936 for (coll
= collstart
; coll
<collend
; ++coll
)
4939 case 3: /* ignore */
4942 case 4: /* xmlcharrefreplace */
4943 /* generate replacement (temporarily (mis)uses p) */
4944 for (p
= collstart
; p
< collend
; ++p
) {
4945 char buffer
[2+29+1+1];
4947 sprintf(buffer
, "&#%d;", (int)*p
);
4948 if (charmaptranslate_makespace(&res
, &str
,
4949 (str
-PyUnicode_AS_UNICODE(res
))+strlen(buffer
)+(endp
-collend
)))
4951 for (cp
= buffer
; *cp
; ++cp
)
4957 repunicode
= unicode_translate_call_errorhandler(errors
, &errorHandler
,
4958 reason
, startp
, size
, &exc
,
4959 collstart
-startp
, collend
-startp
, &newpos
);
4960 if (repunicode
== NULL
)
4962 /* generate replacement */
4963 repsize
= PyUnicode_GET_SIZE(repunicode
);
4964 if (charmaptranslate_makespace(&res
, &str
,
4965 (str
-PyUnicode_AS_UNICODE(res
))+repsize
+(endp
-collend
))) {
4966 Py_DECREF(repunicode
);
4969 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
)
4971 p
= startp
+ newpos
;
4972 Py_DECREF(repunicode
);
4976 /* Resize if we allocated too much */
4977 respos
= str
-PyUnicode_AS_UNICODE(res
);
4978 if (respos
<PyUnicode_GET_SIZE(res
)) {
4979 if (_PyUnicode_Resize(&res
, respos
) < 0)
4983 Py_XDECREF(errorHandler
);
4989 Py_XDECREF(errorHandler
);
4993 PyObject
*PyUnicode_Translate(PyObject
*str
,
4999 str
= PyUnicode_FromObject(str
);
5002 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
5003 PyUnicode_GET_SIZE(str
),
5014 /* --- Decimal Encoder ---------------------------------------------------- */
/* Encodes the Py_UNICODE buffer s into the char buffer output.
   Whitespace, characters with a decimal value (written as '0' + decimal)
   and code points 1..255 (written as a single char) are encoded directly;
   any other character is treated as unencodable and dispatched to the
   error handler selected by errors (strict, replace, ignore,
   xmlcharrefreplace, or a registered callback). */
5016 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
5021 Py_UNICODE
*p
, *end
;
5022 PyObject
*errorHandler
= NULL
;
5023 PyObject
*exc
= NULL
;
5024 const char *encoding
= "decimal";
5025 const char *reason
= "invalid decimal Unicode string";
5026 /* the following variable is used for caching string comparisons
5027 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5028 int known_errorHandler
= -1;
5030 if (output
== NULL
) {
5031 PyErr_BadArgument();
5038 register Py_UNICODE ch
= *p
;
5040 PyObject
*repunicode
;
5044 Py_UNICODE
*collstart
;
5045 Py_UNICODE
*collend
;
5047 if (Py_UNICODE_ISSPACE(ch
)) {
5052 decimal
= Py_UNICODE_TODECIMAL(ch
);
5054 *output
++ = '0' + decimal
;
5058 if (0 < ch
&& ch
< 256) {
5059 *output
++ = (char)ch
;
5063 /* All other characters are considered unencodable */
/* NOTE(review): the break condition below looks inverted relative to the
   per-character tests above. It stops on !Py_UNICODE_ISSPACE() and on any
   non-zero Py_UNICODE_TODECIMAL() result, whereas the encodable cases
   above are "is whitespace" and "has a decimal value". Confirm the
   intended test before relying on the collected range. */
5066 while (collend
< end
) {
5067 if ((0 < *collend
&& *collend
< 256) ||
5068 !Py_UNICODE_ISSPACE(*collend
) ||
5069 Py_UNICODE_TODECIMAL(*collend
))
5072 /* cache callback name lookup
5073 * (if not done yet, i.e. it's the first error) */
5074 if (known_errorHandler
==-1) {
5075 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
5076 known_errorHandler
= 1;
5077 else if (!strcmp(errors
, "replace"))
5078 known_errorHandler
= 2;
5079 else if (!strcmp(errors
, "ignore"))
5080 known_errorHandler
= 3;
5081 else if (!strcmp(errors
, "xmlcharrefreplace"))
5082 known_errorHandler
= 4;
5084 known_errorHandler
= 0;
5086 switch (known_errorHandler
) {
5087 case 1: /* strict */
5088 raise_encode_exception(&exc
, encoding
, s
, length
, collstart
-s
, collend
-s
, reason
);
5090 case 2: /* replace */
5091 for (p
= collstart
; p
< collend
; ++p
)
5094 case 3: /* ignore */
5097 case 4: /* xmlcharrefreplace */
/* NOTE(review): each replaced character expands to "&#" + digits + ";",
   i.e. several output bytes per input character, and no bound on output
   is checked in the visible code. Confirm that the caller's buffer is
   sized for this expansion. */
5098 /* generate replacement (temporarily (mis)uses p) */
5099 for (p
= collstart
; p
< collend
; ++p
)
5100 output
+= sprintf(output
, "&#%d;", (int)*p
);
/* any other handler name: ask the registered error callback for a
   replacement string and the position at which to resume */
5104 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
5105 encoding
, reason
, s
, length
, &exc
,
5106 collstart
-s
, collend
-s
, &newpos
);
5107 if (repunicode
== NULL
)
5109 /* generate replacement */
5110 repsize
= PyUnicode_GET_SIZE(repunicode
);
/* the replacement is encoded with the same rules as the input; a
   replacement character that fits none of them raises the encode
   exception below */
5111 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
5112 Py_UNICODE ch
= *uni2
;
5113 if (Py_UNICODE_ISSPACE(ch
))
5116 decimal
= Py_UNICODE_TODECIMAL(ch
);
5118 *output
++ = '0' + decimal
;
5119 else if (0 < ch
&& ch
< 256)
5120 *output
++ = (char)ch
;
5122 Py_DECREF(repunicode
);
5123 raise_encode_exception(&exc
, encoding
,
5124 s
, length
, collstart
-s
, collend
-s
, reason
);
5130 Py_DECREF(repunicode
);
5133 /* 0-terminate the output string */
5136 Py_XDECREF(errorHandler
);
5141 Py_XDECREF(errorHandler
);
5145 /* --- Helpers ------------------------------------------------------------ */
5147 #include "stringlib/unicodedefs.h"
5149 #define FROM_UNICODE
5151 #include "stringlib/fastsearch.h"
5153 #include "stringlib/count.h"
5154 #include "stringlib/find.h"
5155 #include "stringlib/partition.h"
5157 /* helper macro to fixup start/end slice values */
5158 #define FIX_START_END(obj) \
5160 start += (obj)->length; \
5163 if (end > (obj)->length) \
5164 end = (obj)->length; \
5166 end += (obj)->length; \
5170 Py_ssize_t
PyUnicode_Count(PyObject
*str
,
5176 PyUnicodeObject
* str_obj
;
5177 PyUnicodeObject
* sub_obj
;
5179 str_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(str
);
5182 sub_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(substr
);
5188 FIX_START_END(str_obj
);
5190 result
= stringlib_count(
5191 str_obj
->str
+ start
, end
- start
, sub_obj
->str
, sub_obj
->length
5200 Py_ssize_t
PyUnicode_Find(PyObject
*str
,
5208 str
= PyUnicode_FromObject(str
);
5211 sub
= PyUnicode_FromObject(sub
);
5218 result
= stringlib_find_slice(
5219 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5220 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5224 result
= stringlib_rfind_slice(
5225 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5226 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5237 int tailmatch(PyUnicodeObject
*self
,
5238 PyUnicodeObject
*substring
,
5243 if (substring
->length
== 0)
5246 FIX_START_END(self
);
5248 end
-= substring
->length
;
5252 if (direction
> 0) {
5253 if (Py_UNICODE_MATCH(self
, end
, substring
))
5256 if (Py_UNICODE_MATCH(self
, start
, substring
))
5263 Py_ssize_t
PyUnicode_Tailmatch(PyObject
*str
,
5271 str
= PyUnicode_FromObject(str
);
5274 substr
= PyUnicode_FromObject(substr
);
5275 if (substr
== NULL
) {
5280 result
= tailmatch((PyUnicodeObject
*)str
,
5281 (PyUnicodeObject
*)substr
,
5282 start
, end
, direction
);
5288 /* Apply fixfct filter to the Unicode object self and return a
5289 reference to the modified object */
/* fixup() never modifies self: fixfct is run on a freshly allocated copy.
   fixfct receives that copy and returns non-zero if it changed it. */
5292 PyObject
*fixup(PyUnicodeObject
*self
,
5293 int (*fixfct
)(PyUnicodeObject
*s
))
/* allocate a new object of the same length and copy self's characters
   into it */
5298 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5302 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
/* self is handed back only when the filter changed nothing AND self is an
   exact unicode instance; in every other case the copy u is the result */
5304 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
5305 /* fixfct should return TRUE if it modified the buffer. If
5306 FALSE, return a reference to the original buffer instead
5307 (to save space, not time) */
5310 return (PyObject
*) self
;
5312 return (PyObject
*) u
;
5316 int fixupper(PyUnicodeObject
*self
)
5318 Py_ssize_t len
= self
->length
;
5319 Py_UNICODE
*s
= self
->str
;
5323 register Py_UNICODE ch
;
5325 ch
= Py_UNICODE_TOUPPER(*s
);
5337 int fixlower(PyUnicodeObject
*self
)
5339 Py_ssize_t len
= self
->length
;
5340 Py_UNICODE
*s
= self
->str
;
5344 register Py_UNICODE ch
;
5346 ch
= Py_UNICODE_TOLOWER(*s
);
5358 int fixswapcase(PyUnicodeObject
*self
)
5360 Py_ssize_t len
= self
->length
;
5361 Py_UNICODE
*s
= self
->str
;
5365 if (Py_UNICODE_ISUPPER(*s
)) {
5366 *s
= Py_UNICODE_TOLOWER(*s
);
5368 } else if (Py_UNICODE_ISLOWER(*s
)) {
5369 *s
= Py_UNICODE_TOUPPER(*s
);
5379 int fixcapitalize(PyUnicodeObject
*self
)
5381 Py_ssize_t len
= self
->length
;
5382 Py_UNICODE
*s
= self
->str
;
5387 if (Py_UNICODE_ISLOWER(*s
)) {
5388 *s
= Py_UNICODE_TOUPPER(*s
);
5393 if (Py_UNICODE_ISUPPER(*s
)) {
5394 *s
= Py_UNICODE_TOLOWER(*s
);
5403 int fixtitle(PyUnicodeObject
*self
)
5405 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5406 register Py_UNICODE
*e
;
5407 int previous_is_cased
;
5409 /* Shortcut for single character strings */
5410 if (PyUnicode_GET_SIZE(self
) == 1) {
5411 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
5420 e
= p
+ PyUnicode_GET_SIZE(self
);
5421 previous_is_cased
= 0;
5422 for (; p
< e
; p
++) {
5423 register const Py_UNICODE ch
= *p
;
5425 if (previous_is_cased
)
5426 *p
= Py_UNICODE_TOLOWER(ch
);
5428 *p
= Py_UNICODE_TOTITLE(ch
);
5430 if (Py_UNICODE_ISLOWER(ch
) ||
5431 Py_UNICODE_ISUPPER(ch
) ||
5432 Py_UNICODE_ISTITLE(ch
))
5433 previous_is_cased
= 1;
5435 previous_is_cased
= 0;
/* Joins the items of seq into one Unicode object, copying the separator
   between consecutive items. sep/seplen start out as a single blank; a
   non-NULL separator is converted with PyUnicode_FromObject(). Items that
   are neither unicode nor str produce a TypeError. The result buffer
   starts at res_alloc units and is doubled as needed, then shrunk to the
   used size at the end. */
5441 PyUnicode_Join(PyObject
*separator
, PyObject
*seq
)
5443 PyObject
*internal_separator
= NULL
;
5444 const Py_UNICODE blank
= ' ';
5445 const Py_UNICODE
*sep
= &blank
;
5446 Py_ssize_t seplen
= 1;
5447 PyUnicodeObject
*res
= NULL
; /* the result */
5448 Py_ssize_t res_alloc
= 100; /* # Py_UNICODE units allocated for the string in res (passed to _PyUnicode_New/_PyUnicode_Resize) */
5449 Py_ssize_t res_used
; /* # Py_UNICODE units used so far */
5450 Py_UNICODE
*res_p
; /* pointer to the next free Py_UNICODE slot in res's string area */
5451 PyObject
*fseq
; /* PySequence_Fast(seq) */
5452 Py_ssize_t seqlen
; /* len(fseq) -- number of items in sequence */
5456 fseq
= PySequence_Fast(seq
, "");
5461 /* Grrrr. A codec may be invoked to convert str objects to
5462 * Unicode, and so it's possible to call back into Python code
5463 * during PyUnicode_FromObject(), and so it's possible for a sick
5464 * codec to change the size of fseq (if seq is a list). Therefore
5465 * we have to keep refetching the size -- can't assume seqlen
5468 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5469 /* If empty sequence, return u"". */
5471 res
= _PyUnicode_New(0); /* empty sequence; return u"" */
5474 /* If singleton sequence with an exact Unicode, return that. */
5476 item
= PySequence_Fast_GET_ITEM(fseq
, 0);
5477 if (PyUnicode_CheckExact(item
)) {
5479 res
= (PyUnicodeObject
*)item
;
5484 /* At least two items to join, or one that isn't exact Unicode. */
5486 /* Set up sep and seplen -- they're needed. */
5487 if (separator
== NULL
) {
5492 internal_separator
= PyUnicode_FromObject(separator
);
5493 if (internal_separator
== NULL
)
5495 sep
= PyUnicode_AS_UNICODE(internal_separator
);
5496 seplen
= PyUnicode_GET_SIZE(internal_separator
);
5497 /* In case PyUnicode_FromObject() mutated seq. */
5498 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5503 res
= _PyUnicode_New(res_alloc
);
5506 res_p
= PyUnicode_AS_UNICODE(res
);
5509 for (i
= 0; i
< seqlen
; ++i
) {
5511 Py_ssize_t new_res_used
;
5513 item
= PySequence_Fast_GET_ITEM(fseq
, i
);
5514 /* Convert item to Unicode. */
5515 if (! PyUnicode_Check(item
) && ! PyString_Check(item
)) {
5516 PyErr_Format(PyExc_TypeError
,
5517 "sequence item %zd: expected string or Unicode,"
5519 i
, Py_TYPE(item
)->tp_name
);
5522 item
= PyUnicode_FromObject(item
);
5525 /* We own a reference to item from here on. */
5527 /* In case PyUnicode_FromObject() mutated seq. */
5528 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5530 /* Make sure we have enough space for the separator and the item. */
5531 itemlen
= PyUnicode_GET_SIZE(item
);
/* NOTE(review): overflow is detected here by testing the sign of the sum
   after the signed addition has already been performed, which is
   undefined behaviour in C. A pre-check against PY_SSIZE_T_MAX, as pad()
   does for its lengths, would be safer. */
5532 new_res_used
= res_used
+ itemlen
;
5533 if (new_res_used
< 0)
5535 if (i
< seqlen
- 1) {
5536 new_res_used
+= seplen
;
5537 if (new_res_used
< 0)
5540 if (new_res_used
> res_alloc
) {
5541 /* double allocated size until it's big enough */
5543 res_alloc
+= res_alloc
;
5546 } while (new_res_used
> res_alloc
);
5547 if (_PyUnicode_Resize(&res
, res_alloc
) < 0) {
/* the resize may have moved the buffer: recompute the write position from
   the number of units already used */
5551 res_p
= PyUnicode_AS_UNICODE(res
) + res_used
;
5554 /* Copy item, and maybe the separator. */
5555 Py_UNICODE_COPY(res_p
, PyUnicode_AS_UNICODE(item
), itemlen
);
5557 if (i
< seqlen
- 1) {
5558 Py_UNICODE_COPY(res_p
, sep
, seplen
);
5562 res_used
= new_res_used
;
5565 /* Shrink res to match the used area; this probably can't fail,
5566 * but it's cheap to check.
5568 if (_PyUnicode_Resize(&res
, res_used
) < 0)
5572 Py_XDECREF(internal_separator
);
5574 return (PyObject
*)res
;
5577 PyErr_SetString(PyExc_OverflowError
,
5578 "join() result is too long for a Python string");
5583 Py_XDECREF(internal_separator
);
5590 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
5602 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
5607 if (left
> PY_SSIZE_T_MAX
- self
->length
||
5608 right
> PY_SSIZE_T_MAX
- (left
+ self
->length
)) {
5609 PyErr_SetString(PyExc_OverflowError
, "padded string is too long");
5612 u
= _PyUnicode_New(left
+ self
->length
+ right
);
5615 Py_UNICODE_FILL(u
->str
, fill
, left
);
5616 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
5618 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
5624 #define SPLIT_APPEND(data, left, right) \
5625 str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
5628 if (PyList_Append(list, str)) { \
5636 PyObject
*split_whitespace(PyUnicodeObject
*self
,
5638 Py_ssize_t maxcount
)
5640 register Py_ssize_t i
;
5641 register Py_ssize_t j
;
5642 Py_ssize_t len
= self
->length
;
5644 register const Py_UNICODE
*buf
= self
->str
;
5646 for (i
= j
= 0; i
< len
; ) {
5648 while (i
< len
&& Py_UNICODE_ISSPACE(buf
[i
]))
5651 while (i
< len
&& !Py_UNICODE_ISSPACE(buf
[i
]))
5654 if (maxcount
-- <= 0)
5656 SPLIT_APPEND(buf
, j
, i
);
5657 while (i
< len
&& Py_UNICODE_ISSPACE(buf
[i
]))
5663 SPLIT_APPEND(buf
, j
, len
);
5672 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
5675 register Py_ssize_t i
;
5676 register Py_ssize_t j
;
5682 string
= PyUnicode_FromObject(string
);
5685 data
= PyUnicode_AS_UNICODE(string
);
5686 len
= PyUnicode_GET_SIZE(string
);
5688 list
= PyList_New(0);
5692 for (i
= j
= 0; i
< len
; ) {
5695 /* Find a line and append it */
5696 while (i
< len
&& !BLOOM_LINEBREAK(data
[i
]))
5699 /* Skip the line break reading CRLF as one line break */
5702 if (data
[i
] == '\r' && i
+ 1 < len
&&
5710 SPLIT_APPEND(data
, j
, eol
);
5714 SPLIT_APPEND(data
, j
, len
);
5727 PyObject
*split_char(PyUnicodeObject
*self
,
5730 Py_ssize_t maxcount
)
5732 register Py_ssize_t i
;
5733 register Py_ssize_t j
;
5734 Py_ssize_t len
= self
->length
;
5736 register const Py_UNICODE
*buf
= self
->str
;
5738 for (i
= j
= 0; i
< len
; ) {
5740 if (maxcount
-- <= 0)
5742 SPLIT_APPEND(buf
, j
, i
);
5748 SPLIT_APPEND(buf
, j
, len
);
5758 PyObject
*split_substring(PyUnicodeObject
*self
,
5760 PyUnicodeObject
*substring
,
5761 Py_ssize_t maxcount
)
5763 register Py_ssize_t i
;
5764 register Py_ssize_t j
;
5765 Py_ssize_t len
= self
->length
;
5766 Py_ssize_t sublen
= substring
->length
;
5769 for (i
= j
= 0; i
<= len
- sublen
; ) {
5770 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
5771 if (maxcount
-- <= 0)
5773 SPLIT_APPEND(self
->str
, j
, i
);
5779 SPLIT_APPEND(self
->str
, j
, len
);
5789 PyObject
*rsplit_whitespace(PyUnicodeObject
*self
,
5791 Py_ssize_t maxcount
)
5793 register Py_ssize_t i
;
5794 register Py_ssize_t j
;
5795 Py_ssize_t len
= self
->length
;
5797 register const Py_UNICODE
*buf
= self
->str
;
5799 for (i
= j
= len
- 1; i
>= 0; ) {
5801 while (i
>= 0 && Py_UNICODE_ISSPACE(buf
[i
]))
5804 while (i
>= 0 && !Py_UNICODE_ISSPACE(buf
[i
]))
5807 if (maxcount
-- <= 0)
5809 SPLIT_APPEND(buf
, i
+ 1, j
+ 1);
5810 while (i
>= 0 && Py_UNICODE_ISSPACE(buf
[i
]))
5816 SPLIT_APPEND(buf
, 0, j
+ 1);
5818 if (PyList_Reverse(list
) < 0)
5828 PyObject
*rsplit_char(PyUnicodeObject
*self
,
5831 Py_ssize_t maxcount
)
5833 register Py_ssize_t i
;
5834 register Py_ssize_t j
;
5835 Py_ssize_t len
= self
->length
;
5837 register const Py_UNICODE
*buf
= self
->str
;
5839 for (i
= j
= len
- 1; i
>= 0; ) {
5841 if (maxcount
-- <= 0)
5843 SPLIT_APPEND(buf
, i
+ 1, j
+ 1);
5849 SPLIT_APPEND(buf
, 0, j
+ 1);
5851 if (PyList_Reverse(list
) < 0)
5861 PyObject
*rsplit_substring(PyUnicodeObject
*self
,
5863 PyUnicodeObject
*substring
,
5864 Py_ssize_t maxcount
)
5866 register Py_ssize_t i
;
5867 register Py_ssize_t j
;
5868 Py_ssize_t len
= self
->length
;
5869 Py_ssize_t sublen
= substring
->length
;
5872 for (i
= len
- sublen
, j
= len
; i
>= 0; ) {
5873 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
5874 if (maxcount
-- <= 0)
5876 SPLIT_APPEND(self
->str
, i
+ sublen
, j
);
5883 SPLIT_APPEND(self
->str
, 0, j
);
5885 if (PyList_Reverse(list
) < 0)
5897 PyObject
*split(PyUnicodeObject
*self
,
5898 PyUnicodeObject
*substring
,
5899 Py_ssize_t maxcount
)
5904 maxcount
= PY_SSIZE_T_MAX
;
5906 list
= PyList_New(0);
5910 if (substring
== NULL
)
5911 return split_whitespace(self
,list
,maxcount
);
5913 else if (substring
->length
== 1)
5914 return split_char(self
,list
,substring
->str
[0],maxcount
);
5916 else if (substring
->length
== 0) {
5918 PyErr_SetString(PyExc_ValueError
, "empty separator");
5922 return split_substring(self
,list
,substring
,maxcount
);
5926 PyObject
*rsplit(PyUnicodeObject
*self
,
5927 PyUnicodeObject
*substring
,
5928 Py_ssize_t maxcount
)
5933 maxcount
= PY_SSIZE_T_MAX
;
5935 list
= PyList_New(0);
5939 if (substring
== NULL
)
5940 return rsplit_whitespace(self
,list
,maxcount
);
5942 else if (substring
->length
== 1)
5943 return rsplit_char(self
,list
,substring
->str
[0],maxcount
);
5945 else if (substring
->length
== 0) {
5947 PyErr_SetString(PyExc_ValueError
, "empty separator");
5951 return rsplit_substring(self
,list
,substring
,maxcount
);
5955 PyObject
*replace(PyUnicodeObject
*self
,
5956 PyUnicodeObject
*str1
,
5957 PyUnicodeObject
*str2
,
5958 Py_ssize_t maxcount
)
5963 maxcount
= PY_SSIZE_T_MAX
;
5965 if (str1
->length
== str2
->length
) {
5968 if (str1
->length
== 1) {
5969 /* replace characters */
5971 if (!findchar(self
->str
, self
->length
, str1
->str
[0]))
5973 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5976 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5979 for (i
= 0; i
< u
->length
; i
++)
5980 if (u
->str
[i
] == u1
) {
5987 self
->str
, self
->length
, str1
->str
, str1
->length
, FAST_SEARCH
5991 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5994 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5995 while (i
<= self
->length
- str1
->length
)
5996 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
5999 Py_UNICODE_COPY(u
->str
+i
, str2
->str
, str2
->length
);
6006 Py_ssize_t n
, i
, j
, e
;
6007 Py_ssize_t product
, new_size
, delta
;
6010 /* replace strings */
6011 n
= stringlib_count(self
->str
, self
->length
, str1
->str
, str1
->length
);
6016 /* new_size = self->length + n * (str2->length - str1->length)); */
6017 delta
= (str2
->length
- str1
->length
);
6019 new_size
= self
->length
;
6021 product
= n
* (str2
->length
- str1
->length
);
6022 if ((product
/ (str2
->length
- str1
->length
)) != n
) {
6023 PyErr_SetString(PyExc_OverflowError
,
6024 "replace string is too long");
6027 new_size
= self
->length
+ product
;
6029 PyErr_SetString(PyExc_OverflowError
,
6030 "replace string is too long");
6034 u
= _PyUnicode_New(new_size
);
6039 e
= self
->length
- str1
->length
;
6040 if (str1
->length
> 0) {
6042 /* look for next match */
6045 if (Py_UNICODE_MATCH(self
, j
, str1
))
6052 /* copy unchanged part [i:j] */
6053 Py_UNICODE_COPY(p
, self
->str
+i
, j
-i
);
6056 /* copy substitution string */
6057 if (str2
->length
> 0) {
6058 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
6061 i
= j
+ str1
->length
;
6063 if (i
< self
->length
)
6064 /* copy tail [i:] */
6065 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
6069 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
6073 *p
++ = self
->str
[i
++];
6075 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
6078 return (PyObject
*) u
;
6081 /* nothing to replace; return original string (when possible) */
6082 if (PyUnicode_CheckExact(self
)) {
6084 return (PyObject
*) self
;
6086 return PyUnicode_FromUnicode(self
->str
, self
->length
);
6089 /* --- Unicode Object Methods --------------------------------------------- */
6091 PyDoc_STRVAR(title__doc__
,
6092 "S.title() -> unicode\n\
6094 Return a titlecased version of S, i.e. words start with title case\n\
6095 characters, all remaining cased characters have lower case.");
6098 unicode_title(PyUnicodeObject
*self
)
6100 return fixup(self
, fixtitle
);
6103 PyDoc_STRVAR(capitalize__doc__
,
6104 "S.capitalize() -> unicode\n\
6106 Return a capitalized version of S, i.e. make the first character\n\
6110 unicode_capitalize(PyUnicodeObject
*self
)
6112 return fixup(self
, fixcapitalize
);
6116 PyDoc_STRVAR(capwords__doc__
,
6117 "S.capwords() -> unicode\n\
6119 Apply .capitalize() to all words in S and return the result with\n\
6120 normalized whitespace (all whitespace strings are replaced by ' ').");
6123 unicode_capwords(PyUnicodeObject
*self
)
6129 /* Split into words */
6130 list
= split(self
, NULL
, -1);
6134 /* Capitalize each word */
6135 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
6136 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
6140 Py_DECREF(PyList_GET_ITEM(list
, i
));
6141 PyList_SET_ITEM(list
, i
, item
);
6144 /* Join the words to form a new string */
6145 item
= PyUnicode_Join(NULL
, list
);
6149 return (PyObject
*)item
;
6153 /* Argument converter. Coerces to a single unicode character */
6156 convert_uc(PyObject
*obj
, void *addr
)
6158 Py_UNICODE
*fillcharloc
= (Py_UNICODE
*)addr
;
6162 uniobj
= PyUnicode_FromObject(obj
);
6163 if (uniobj
== NULL
) {
6164 PyErr_SetString(PyExc_TypeError
,
6165 "The fill character cannot be converted to Unicode");
6168 if (PyUnicode_GET_SIZE(uniobj
) != 1) {
6169 PyErr_SetString(PyExc_TypeError
,
6170 "The fill character must be exactly one character long");
6174 unistr
= PyUnicode_AS_UNICODE(uniobj
);
6175 *fillcharloc
= unistr
[0];
6180 PyDoc_STRVAR(center__doc__
,
6181 "S.center(width[, fillchar]) -> unicode\n\
6183 Return S centered in a Unicode string of length width. Padding is\n\
6184 done using the specified fill character (default is a space)");
/* Implements S.center(width[, fillchar]): parses width and an optional
   fill character (default ' ', converted by convert_uc) and pads self on
   both sides via pad(). */
6187 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
6189 Py_ssize_t marg
, left
;
6191 Py_UNICODE fillchar
= ' ';
6193 if (!PyArg_ParseTuple(args
, "n|O&:center", &width
, convert_uc
, &fillchar
))
/* already at least width characters long and an exact unicode object:
   self itself is the result */
6196 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6198 return (PyObject
*) self
;
/* marg is the total amount of padding. left gets half of it; when marg is
   odd, the extra fill character goes to the left only if width is odd as
   well (marg & width & 1), otherwise it ends up on the right, which
   receives marg - left. */
6201 marg
= width
- self
->length
;
6202 left
= marg
/ 2 + (marg
& width
& 1);
6204 return (PyObject
*) pad(self
, left
, marg
- left
, fillchar
);
6209 /* This code should go into some future Unicode collation support
6210 module. The basic comparison should compare ordinals on a naive
6211 basis (this is what Java does and thus JPython too). */
6213 /* speedy UTF-16 code point order comparison */
6215 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
/* Adjustment table indexed by (code unit >> 11), i.e. one entry per block
   of 0x800 code units. Entry 27 (0xD800..0xDFFF) adds 0x2000, moving those
   units up to 0xF800..0xFFFF; entries 28..31 (0xE000..0xFFFF) subtract
   0x800, moving those units down to 0xD800..0xF7FF. All other entries
   are 0. */
6217 static short utf16Fixup
[32] =
6219 0, 0, 0, 0, 0, 0, 0, 0,
6220 0, 0, 0, 0, 0, 0, 0, 0,
6221 0, 0, 0, 0, 0, 0, 0, 0,
6222 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
/* Three-way comparison of str1 and str2. Code units are remapped through
   utf16Fixup before being compared; the first difference yields -1 or 1,
   otherwise the result is decided by the lengths. */
6226 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6228 Py_ssize_t len1
, len2
;
6230 Py_UNICODE
*s1
= str1
->str
;
6231 Py_UNICODE
*s2
= str2
->str
;
6233 len1
= str1
->length
;
6234 len2
= str2
->length
;
6236 while (len1
> 0 && len2
> 0) {
/* (1<<11) * 26 == 0xD000; values at or below it need no lookup because
   their utf16Fixup entries are all 0 */
6242 if (c1
> (1<<11) * 26)
6243 c1
+= utf16Fixup
[c1
>>11];
6244 if (c2
> (1<<11) * 26)
6245 c2
+= utf16Fixup
[c2
>>11];
6246 /* now c1 and c2 are in UTF-32-compatible order */
6249 return (c1
< c2
) ? -1 : 1;
/* no difference found: -1 if len1 < len2, 0 if equal, 1 otherwise */
6254 return (len1
< len2
) ? -1 : (len1
!= len2
);
6260 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6262 register Py_ssize_t len1
, len2
;
6264 Py_UNICODE
*s1
= str1
->str
;
6265 Py_UNICODE
*s2
= str2
->str
;
6267 len1
= str1
->length
;
6268 len2
= str2
->length
;
6270 while (len1
> 0 && len2
> 0) {
6277 return (c1
< c2
) ? -1 : 1;
6282 return (len1
< len2
) ? -1 : (len1
!= len2
);
6287 int PyUnicode_Compare(PyObject
*left
,
6290 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
6293 /* Coerce the two arguments */
6294 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6297 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6301 /* Shortcut for empty or interned objects */
6308 result
= unicode_compare(u
, v
);
6320 PyObject
*PyUnicode_RichCompare(PyObject
*left
,
6326 result
= PyUnicode_Compare(left
, right
);
6327 if (result
== -1 && PyErr_Occurred())
6330 /* Convert the return value to a Boolean */
6333 result
= (result
== 0);
6336 result
= (result
!= 0);
6339 result
= (result
<= 0);
6342 result
= (result
>= 0);
6345 result
= (result
== -1);
6348 result
= (result
== 1);
6351 return PyBool_FromLong(result
);
6357 Type errors mean that PyUnicode_FromObject() could not convert
6358 one of the arguments (usually the right hand side) to Unicode,
6359 ie. we can't handle the comparison request. However, it is
6360 possible that the other object knows a comparison method, which
6361 is why we return Py_NotImplemented to give the other object a
6365 if (PyErr_ExceptionMatches(PyExc_TypeError
)) {
6367 Py_INCREF(Py_NotImplemented
);
6368 return Py_NotImplemented
;
6370 if (op
!= Py_EQ
&& op
!= Py_NE
)
6373 /* Equality comparison.
6375 This is a special case: we silence any PyExc_UnicodeDecodeError
6376 and instead turn it into a PyExc_UnicodeWarning.
6379 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError
))
6382 if (PyErr_Warn(PyExc_UnicodeWarning
,
6384 "Unicode equal comparison "
6385 "failed to convert both arguments to Unicode - "
6386 "interpreting them as being unequal" :
6387 "Unicode unequal comparison "
6388 "failed to convert both arguments to Unicode - "
6389 "interpreting them as being unequal"
6392 result
= (op
== Py_NE
);
6393 return PyBool_FromLong(result
);
6396 int PyUnicode_Contains(PyObject
*container
,
6399 PyObject
*str
, *sub
;
6402 /* Coerce the two arguments */
6403 sub
= PyUnicode_FromObject(element
);
6405 PyErr_SetString(PyExc_TypeError
,
6406 "'in <string>' requires string as left operand");
6410 str
= PyUnicode_FromObject(container
);
6416 result
= stringlib_contains_obj(str
, sub
);
6424 /* Concat to string or Unicode object giving a new Unicode object. */
6426 PyObject
*PyUnicode_Concat(PyObject
*left
,
6429 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
6431 /* Coerce the two arguments */
6432 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6435 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6440 if (v
== unicode_empty
) {
6442 return (PyObject
*)u
;
6444 if (u
== unicode_empty
) {
6446 return (PyObject
*)v
;
6449 /* Concat the two Unicode strings */
6450 w
= _PyUnicode_New(u
->length
+ v
->length
);
6453 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
6454 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
6458 return (PyObject
*)w
;
6466 PyDoc_STRVAR(count__doc__
,
6467 "S.count(sub[, start[, end]]) -> int\n\
6469 Return the number of non-overlapping occurrences of substring sub in\n\
6470 Unicode string S[start:end]. Optional arguments start and end are\n\
6471 interpreted as in slice notation.");
6474 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
6476 PyUnicodeObject
*substring
;
6477 Py_ssize_t start
= 0;
6478 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6481 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
6482 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
6485 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
6486 (PyObject
*)substring
);
6487 if (substring
== NULL
)
6490 FIX_START_END(self
);
6492 result
= PyInt_FromSsize_t(
6493 stringlib_count(self
->str
+ start
, end
- start
,
6494 substring
->str
, substring
->length
)
6497 Py_DECREF(substring
);
6502 PyDoc_STRVAR(encode__doc__
,
6503 "S.encode([encoding[,errors]]) -> string or unicode\n\
6505 Encodes S using the codec registered for encoding. encoding defaults\n\
6506 to the default encoding. errors may be given to set a different error\n\
6507 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6508 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6509 'xmlcharrefreplace' as well as any other name registered with\n\
6510 codecs.register_error that can handle UnicodeEncodeErrors.");
6513 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
6515 char *encoding
= NULL
;
6516 char *errors
= NULL
;
6519 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
6521 v
= PyUnicode_AsEncodedObject((PyObject
*)self
, encoding
, errors
);
6524 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6525 PyErr_Format(PyExc_TypeError
,
6526 "encoder did not return a string/unicode object "
6528 Py_TYPE(v
)->tp_name
);
6538 PyDoc_STRVAR(decode__doc__
,
6539 "S.decode([encoding[,errors]]) -> string or unicode\n\
6541 Decodes S using the codec registered for encoding. encoding defaults\n\
6542 to the default encoding. errors may be given to set a different error\n\
6543 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6544 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6545 as well as any other name registerd with codecs.register_error that is\n\
6546 able to handle UnicodeDecodeErrors.");
6549 unicode_decode(PyUnicodeObject
*self
, PyObject
*args
)
6551 char *encoding
= NULL
;
6552 char *errors
= NULL
;
6555 if (!PyArg_ParseTuple(args
, "|ss:decode", &encoding
, &errors
))
6557 v
= PyUnicode_AsDecodedObject((PyObject
*)self
, encoding
, errors
);
6560 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6561 PyErr_Format(PyExc_TypeError
,
6562 "decoder did not return a string/unicode object "
6564 Py_TYPE(v
)->tp_name
);
6574 PyDoc_STRVAR(expandtabs__doc__
,
6575 "S.expandtabs([tabsize]) -> unicode\n\
6577 Return a copy of S where all tab characters are expanded using spaces.\n\
6578 If tabsize is not given, a tab size of 8 characters is assumed.");
6581 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
6587 Py_ssize_t i
, j
, incr
;
6591 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
6594 /* First pass: determine size of output string */
6595 i
= 0; /* chars up to and including most recent \n or \r */
6596 j
= 0; /* chars since most recent \n or \r (use in tab calculations) */
6597 e
= self
->str
+ self
->length
; /* end of input */
6598 for (p
= self
->str
; p
< e
; p
++)
6601 incr
= tabsize
- (j
% tabsize
); /* cannot overflow */
6602 if (j
> PY_SSIZE_T_MAX
- incr
)
6608 if (j
> PY_SSIZE_T_MAX
- 1)
6611 if (*p
== '\n' || *p
== '\r') {
6612 if (i
> PY_SSIZE_T_MAX
- j
)
6619 if (i
> PY_SSIZE_T_MAX
- j
)
6622 /* Second pass: create output string and fill it */
6623 u
= _PyUnicode_New(i
+ j
);
6627 j
= 0; /* same as in first pass */
6628 q
= u
->str
; /* next output char */
6629 qe
= u
->str
+ u
->length
; /* end of output */
6631 for (p
= self
->str
; p
< e
; p
++)
6634 i
= tabsize
- (j
% tabsize
);
6648 if (*p
== '\n' || *p
== '\r')
6652 return (PyObject
*) u
;
6657 PyErr_SetString(PyExc_OverflowError
, "new string is too long");
6661 PyDoc_STRVAR(find__doc__
,
6662 "S.find(sub [,start [,end]]) -> int\n\
6664 Return the lowest index in S where substring sub is found,\n\
6665 such that sub is contained within s[start:end]. Optional\n\
6666 arguments start and end are interpreted as in slice notation.\n\
6668 Return -1 on failure.");
6671 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
6673 PyObject
*substring
;
6678 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
6681 result
= stringlib_find_slice(
6682 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6683 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6687 Py_DECREF(substring
);
6689 return PyInt_FromSsize_t(result
);
6693 unicode_getitem(PyUnicodeObject
*self
, Py_ssize_t index
)
6695 if (index
< 0 || index
>= self
->length
) {
6696 PyErr_SetString(PyExc_IndexError
, "string index out of range");
6700 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
6704 unicode_hash(PyUnicodeObject
*self
)
6706 /* Since Unicode objects compare equal to their ASCII string
6707 counterparts, they should use the individual character values
6708 as basis for their hash value. This is needed to assure that
6709 strings and Unicode objects behave in the same way as
6712 register Py_ssize_t len
;
6713 register Py_UNICODE
*p
;
6716 if (self
->hash
!= -1)
6718 len
= PyUnicode_GET_SIZE(self
);
6719 p
= PyUnicode_AS_UNICODE(self
);
6722 x
= (1000003*x
) ^ *p
++;
6723 x
^= PyUnicode_GET_SIZE(self
);
6730 PyDoc_STRVAR(index__doc__
,
6731 "S.index(sub [,start [,end]]) -> int\n\
6733 Like S.find() but raise ValueError when the substring is not found.");
6736 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
6739 PyObject
*substring
;
6743 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
6746 result
= stringlib_find_slice(
6747 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6748 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6752 Py_DECREF(substring
);
6755 PyErr_SetString(PyExc_ValueError
, "substring not found");
6759 return PyInt_FromSsize_t(result
);
6762 PyDoc_STRVAR(islower__doc__
,
6763 "S.islower() -> bool\n\
6765 Return True if all cased characters in S are lowercase and there is\n\
6766 at least one cased character in S, False otherwise.");
6769 unicode_islower(PyUnicodeObject
*self
)
6771 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6772 register const Py_UNICODE
*e
;
6775 /* Shortcut for single character strings */
6776 if (PyUnicode_GET_SIZE(self
) == 1)
6777 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p
));
6779 /* Special case for empty strings */
6780 if (PyUnicode_GET_SIZE(self
) == 0)
6781 return PyBool_FromLong(0);
6783 e
= p
+ PyUnicode_GET_SIZE(self
);
6785 for (; p
< e
; p
++) {
6786 register const Py_UNICODE ch
= *p
;
6788 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
6789 return PyBool_FromLong(0);
6790 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
6793 return PyBool_FromLong(cased
);
6796 PyDoc_STRVAR(isupper__doc__
,
6797 "S.isupper() -> bool\n\
6799 Return True if all cased characters in S are uppercase and there is\n\
6800 at least one cased character in S, False otherwise.");
6803 unicode_isupper(PyUnicodeObject
*self
)
6805 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6806 register const Py_UNICODE
*e
;
6809 /* Shortcut for single character strings */
6810 if (PyUnicode_GET_SIZE(self
) == 1)
6811 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
6813 /* Special case for empty strings */
6814 if (PyUnicode_GET_SIZE(self
) == 0)
6815 return PyBool_FromLong(0);
6817 e
= p
+ PyUnicode_GET_SIZE(self
);
6819 for (; p
< e
; p
++) {
6820 register const Py_UNICODE ch
= *p
;
6822 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
6823 return PyBool_FromLong(0);
6824 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
6827 return PyBool_FromLong(cased
);
6830 PyDoc_STRVAR(istitle__doc__
,
6831 "S.istitle() -> bool\n\
6833 Return True if S is a titlecased string and there is at least one\n\
6834 character in S, i.e. upper- and titlecase characters may only\n\
6835 follow uncased characters and lowercase characters only cased ones.\n\
6836 Return False otherwise.");
6839 unicode_istitle(PyUnicodeObject
*self
)
6841 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6842 register const Py_UNICODE
*e
;
6843 int cased
, previous_is_cased
;
6845 /* Shortcut for single character strings */
6846 if (PyUnicode_GET_SIZE(self
) == 1)
6847 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
6848 (Py_UNICODE_ISUPPER(*p
) != 0));
6850 /* Special case for empty strings */
6851 if (PyUnicode_GET_SIZE(self
) == 0)
6852 return PyBool_FromLong(0);
6854 e
= p
+ PyUnicode_GET_SIZE(self
);
6856 previous_is_cased
= 0;
6857 for (; p
< e
; p
++) {
6858 register const Py_UNICODE ch
= *p
;
6860 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
6861 if (previous_is_cased
)
6862 return PyBool_FromLong(0);
6863 previous_is_cased
= 1;
6866 else if (Py_UNICODE_ISLOWER(ch
)) {
6867 if (!previous_is_cased
)
6868 return PyBool_FromLong(0);
6869 previous_is_cased
= 1;
6873 previous_is_cased
= 0;
6875 return PyBool_FromLong(cased
);
6878 PyDoc_STRVAR(isspace__doc__
,
6879 "S.isspace() -> bool\n\
6881 Return True if all characters in S are whitespace\n\
6882 and there is at least one character in S, False otherwise.");
6885 unicode_isspace(PyUnicodeObject
*self
)
6887 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6888 register const Py_UNICODE
*e
;
6890 /* Shortcut for single character strings */
6891 if (PyUnicode_GET_SIZE(self
) == 1 &&
6892 Py_UNICODE_ISSPACE(*p
))
6893 return PyBool_FromLong(1);
6895 /* Special case for empty strings */
6896 if (PyUnicode_GET_SIZE(self
) == 0)
6897 return PyBool_FromLong(0);
6899 e
= p
+ PyUnicode_GET_SIZE(self
);
6900 for (; p
< e
; p
++) {
6901 if (!Py_UNICODE_ISSPACE(*p
))
6902 return PyBool_FromLong(0);
6904 return PyBool_FromLong(1);
6907 PyDoc_STRVAR(isalpha__doc__
,
6908 "S.isalpha() -> bool\n\
6910 Return True if all characters in S are alphabetic\n\
6911 and there is at least one character in S, False otherwise.");
6914 unicode_isalpha(PyUnicodeObject
*self
)
6916 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6917 register const Py_UNICODE
*e
;
6919 /* Shortcut for single character strings */
6920 if (PyUnicode_GET_SIZE(self
) == 1 &&
6921 Py_UNICODE_ISALPHA(*p
))
6922 return PyBool_FromLong(1);
6924 /* Special case for empty strings */
6925 if (PyUnicode_GET_SIZE(self
) == 0)
6926 return PyBool_FromLong(0);
6928 e
= p
+ PyUnicode_GET_SIZE(self
);
6929 for (; p
< e
; p
++) {
6930 if (!Py_UNICODE_ISALPHA(*p
))
6931 return PyBool_FromLong(0);
6933 return PyBool_FromLong(1);
6936 PyDoc_STRVAR(isalnum__doc__
,
6937 "S.isalnum() -> bool\n\
6939 Return True if all characters in S are alphanumeric\n\
6940 and there is at least one character in S, False otherwise.");
6943 unicode_isalnum(PyUnicodeObject
*self
)
6945 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6946 register const Py_UNICODE
*e
;
6948 /* Shortcut for single character strings */
6949 if (PyUnicode_GET_SIZE(self
) == 1 &&
6950 Py_UNICODE_ISALNUM(*p
))
6951 return PyBool_FromLong(1);
6953 /* Special case for empty strings */
6954 if (PyUnicode_GET_SIZE(self
) == 0)
6955 return PyBool_FromLong(0);
6957 e
= p
+ PyUnicode_GET_SIZE(self
);
6958 for (; p
< e
; p
++) {
6959 if (!Py_UNICODE_ISALNUM(*p
))
6960 return PyBool_FromLong(0);
6962 return PyBool_FromLong(1);
6965 PyDoc_STRVAR(isdecimal__doc__
,
6966 "S.isdecimal() -> bool\n\
6968 Return True if there are only decimal characters in S,\n\
6972 unicode_isdecimal(PyUnicodeObject
*self
)
6974 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6975 register const Py_UNICODE
*e
;
6977 /* Shortcut for single character strings */
6978 if (PyUnicode_GET_SIZE(self
) == 1 &&
6979 Py_UNICODE_ISDECIMAL(*p
))
6980 return PyBool_FromLong(1);
6982 /* Special case for empty strings */
6983 if (PyUnicode_GET_SIZE(self
) == 0)
6984 return PyBool_FromLong(0);
6986 e
= p
+ PyUnicode_GET_SIZE(self
);
6987 for (; p
< e
; p
++) {
6988 if (!Py_UNICODE_ISDECIMAL(*p
))
6989 return PyBool_FromLong(0);
6991 return PyBool_FromLong(1);
6994 PyDoc_STRVAR(isdigit__doc__
,
6995 "S.isdigit() -> bool\n\
6997 Return True if all characters in S are digits\n\
6998 and there is at least one character in S, False otherwise.");
7001 unicode_isdigit(PyUnicodeObject
*self
)
7003 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
7004 register const Py_UNICODE
*e
;
7006 /* Shortcut for single character strings */
7007 if (PyUnicode_GET_SIZE(self
) == 1 &&
7008 Py_UNICODE_ISDIGIT(*p
))
7009 return PyBool_FromLong(1);
7011 /* Special case for empty strings */
7012 if (PyUnicode_GET_SIZE(self
) == 0)
7013 return PyBool_FromLong(0);
7015 e
= p
+ PyUnicode_GET_SIZE(self
);
7016 for (; p
< e
; p
++) {
7017 if (!Py_UNICODE_ISDIGIT(*p
))
7018 return PyBool_FromLong(0);
7020 return PyBool_FromLong(1);
7023 PyDoc_STRVAR(isnumeric__doc__
,
7024 "S.isnumeric() -> bool\n\
7026 Return True if there are only numeric characters in S,\n\
7030 unicode_isnumeric(PyUnicodeObject
*self
)
7032 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
7033 register const Py_UNICODE
*e
;
7035 /* Shortcut for single character strings */
7036 if (PyUnicode_GET_SIZE(self
) == 1 &&
7037 Py_UNICODE_ISNUMERIC(*p
))
7038 return PyBool_FromLong(1);
7040 /* Special case for empty strings */
7041 if (PyUnicode_GET_SIZE(self
) == 0)
7042 return PyBool_FromLong(0);
7044 e
= p
+ PyUnicode_GET_SIZE(self
);
7045 for (; p
< e
; p
++) {
7046 if (!Py_UNICODE_ISNUMERIC(*p
))
7047 return PyBool_FromLong(0);
7049 return PyBool_FromLong(1);
7052 PyDoc_STRVAR(join__doc__
,
7053 "S.join(sequence) -> unicode\n\
7055 Return a string which is the concatenation of the strings in the\n\
7056 sequence. The separator between elements is S.");
7059 unicode_join(PyObject
*self
, PyObject
*data
)
7061 return PyUnicode_Join(self
, data
);
7065 unicode_length(PyUnicodeObject
*self
)
7067 return self
->length
;
7070 PyDoc_STRVAR(ljust__doc__
,
7071 "S.ljust(width[, fillchar]) -> int\n\
7073 Return S left justified in a Unicode string of length width. Padding is\n\
7074 done using the specified fill character (default is a space).");
7077 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
7080 Py_UNICODE fillchar
= ' ';
7082 if (!PyArg_ParseTuple(args
, "n|O&:ljust", &width
, convert_uc
, &fillchar
))
7085 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
7087 return (PyObject
*) self
;
7090 return (PyObject
*) pad(self
, 0, width
- self
->length
, fillchar
);
7093 PyDoc_STRVAR(lower__doc__
,
7094 "S.lower() -> unicode\n\
7096 Return a copy of the string S converted to lowercase.");
7099 unicode_lower(PyUnicodeObject
*self
)
7101 return fixup(self
, fixlower
);
7105 #define RIGHTSTRIP 1
7108 /* Arrays indexed by above */
7109 static const char *stripformat
[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
7111 #define STRIPNAME(i) (stripformat[i]+3)
7113 /* externally visible for str.strip(unicode) */
7115 _PyUnicode_XStrip(PyUnicodeObject
*self
, int striptype
, PyObject
*sepobj
)
7117 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
7118 Py_ssize_t len
= PyUnicode_GET_SIZE(self
);
7119 Py_UNICODE
*sep
= PyUnicode_AS_UNICODE(sepobj
);
7120 Py_ssize_t seplen
= PyUnicode_GET_SIZE(sepobj
);
7123 BLOOM_MASK sepmask
= make_bloom_mask(sep
, seplen
);
7126 if (striptype
!= RIGHTSTRIP
) {
7127 while (i
< len
&& BLOOM_MEMBER(sepmask
, s
[i
], sep
, seplen
)) {
7133 if (striptype
!= LEFTSTRIP
) {
7136 } while (j
>= i
&& BLOOM_MEMBER(sepmask
, s
[j
], sep
, seplen
));
7140 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
7142 return (PyObject
*)self
;
7145 return PyUnicode_FromUnicode(s
+i
, j
-i
);
7150 do_strip(PyUnicodeObject
*self
, int striptype
)
7152 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
7153 Py_ssize_t len
= PyUnicode_GET_SIZE(self
), i
, j
;
7156 if (striptype
!= RIGHTSTRIP
) {
7157 while (i
< len
&& Py_UNICODE_ISSPACE(s
[i
])) {
7163 if (striptype
!= LEFTSTRIP
) {
7166 } while (j
>= i
&& Py_UNICODE_ISSPACE(s
[j
]));
7170 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
7172 return (PyObject
*)self
;
7175 return PyUnicode_FromUnicode(s
+i
, j
-i
);
7180 do_argstrip(PyUnicodeObject
*self
, int striptype
, PyObject
*args
)
7182 PyObject
*sep
= NULL
;
7184 if (!PyArg_ParseTuple(args
, (char *)stripformat
[striptype
], &sep
))
7187 if (sep
!= NULL
&& sep
!= Py_None
) {
7188 if (PyUnicode_Check(sep
))
7189 return _PyUnicode_XStrip(self
, striptype
, sep
);
7190 else if (PyString_Check(sep
)) {
7192 sep
= PyUnicode_FromObject(sep
);
7195 res
= _PyUnicode_XStrip(self
, striptype
, sep
);
7200 PyErr_Format(PyExc_TypeError
,
7201 "%s arg must be None, unicode or str",
7202 STRIPNAME(striptype
));
7207 return do_strip(self
, striptype
);
7211 PyDoc_STRVAR(strip__doc__
,
7212 "S.strip([chars]) -> unicode\n\
7214 Return a copy of the string S with leading and trailing\n\
7215 whitespace removed.\n\
7216 If chars is given and not None, remove characters in chars instead.\n\
7217 If chars is a str, it will be converted to unicode before stripping");
7220 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
7222 if (PyTuple_GET_SIZE(args
) == 0)
7223 return do_strip(self
, BOTHSTRIP
); /* Common case */
7225 return do_argstrip(self
, BOTHSTRIP
, args
);
7229 PyDoc_STRVAR(lstrip__doc__
,
7230 "S.lstrip([chars]) -> unicode\n\
7232 Return a copy of the string S with leading whitespace removed.\n\
7233 If chars is given and not None, remove characters in chars instead.\n\
7234 If chars is a str, it will be converted to unicode before stripping");
7237 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
7239 if (PyTuple_GET_SIZE(args
) == 0)
7240 return do_strip(self
, LEFTSTRIP
); /* Common case */
7242 return do_argstrip(self
, LEFTSTRIP
, args
);
7246 PyDoc_STRVAR(rstrip__doc__
,
7247 "S.rstrip([chars]) -> unicode\n\
7249 Return a copy of the string S with trailing whitespace removed.\n\
7250 If chars is given and not None, remove characters in chars instead.\n\
7251 If chars is a str, it will be converted to unicode before stripping");
7254 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
7256 if (PyTuple_GET_SIZE(args
) == 0)
7257 return do_strip(self
, RIGHTSTRIP
); /* Common case */
7259 return do_argstrip(self
, RIGHTSTRIP
, args
);
7264 unicode_repeat(PyUnicodeObject
*str
, Py_ssize_t len
)
7274 if (len
== 1 && PyUnicode_CheckExact(str
)) {
7275 /* no repeat, return original string */
7277 return (PyObject
*) str
;
7280 /* ensure # of chars needed doesn't overflow int and # of bytes
7281 * needed doesn't overflow size_t
7283 nchars
= len
* str
->length
;
7284 if (len
&& nchars
/ len
!= str
->length
) {
7285 PyErr_SetString(PyExc_OverflowError
,
7286 "repeated string is too long");
7289 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
7290 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
7291 PyErr_SetString(PyExc_OverflowError
,
7292 "repeated string is too long");
7295 u
= _PyUnicode_New(nchars
);
7301 if (str
->length
== 1 && len
> 0) {
7302 Py_UNICODE_FILL(p
, str
->str
[0], len
);
7304 Py_ssize_t done
= 0; /* number of characters copied this far */
7305 if (done
< nchars
) {
7306 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
7309 while (done
< nchars
) {
7310 Py_ssize_t n
= (done
<= nchars
-done
) ? done
: nchars
-done
;
7311 Py_UNICODE_COPY(p
+done
, p
, n
);
7316 return (PyObject
*) u
;
7319 PyObject
*PyUnicode_Replace(PyObject
*obj
,
7322 Py_ssize_t maxcount
)
7329 self
= PyUnicode_FromObject(obj
);
7332 str1
= PyUnicode_FromObject(subobj
);
7337 str2
= PyUnicode_FromObject(replobj
);
7343 result
= replace((PyUnicodeObject
*)self
,
7344 (PyUnicodeObject
*)str1
,
7345 (PyUnicodeObject
*)str2
,
7353 PyDoc_STRVAR(replace__doc__
,
7354 "S.replace (old, new[, count]) -> unicode\n\
7356 Return a copy of S with all occurrences of substring\n\
7357 old replaced by new. If the optional argument count is\n\
7358 given, only the first count occurrences are replaced.");
7361 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
7363 PyUnicodeObject
*str1
;
7364 PyUnicodeObject
*str2
;
7365 Py_ssize_t maxcount
= -1;
7368 if (!PyArg_ParseTuple(args
, "OO|n:replace", &str1
, &str2
, &maxcount
))
7370 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
7373 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
7379 result
= replace(self
, str1
, str2
, maxcount
);
7387 PyObject
*unicode_repr(PyObject
*unicode
)
7389 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
7390 PyUnicode_GET_SIZE(unicode
),
7394 PyDoc_STRVAR(rfind__doc__
,
7395 "S.rfind(sub [,start [,end]]) -> int\n\
7397 Return the highest index in S where substring sub is found,\n\
7398 such that sub is contained within s[start:end]. Optional\n\
7399 arguments start and end are interpreted as in slice notation.\n\
7401 Return -1 on failure.");
7404 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
7406 PyObject
*substring
;
7411 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
7414 result
= stringlib_rfind_slice(
7415 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7416 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7420 Py_DECREF(substring
);
7422 return PyInt_FromSsize_t(result
);
7425 PyDoc_STRVAR(rindex__doc__
,
7426 "S.rindex(sub [,start [,end]]) -> int\n\
7428 Like S.rfind() but raise ValueError when the substring is not found.");
7431 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
7433 PyObject
*substring
;
7438 if (!_ParseTupleFinds(args
, &substring
, &start
, &end
))
7441 result
= stringlib_rfind_slice(
7442 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7443 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7447 Py_DECREF(substring
);
7450 PyErr_SetString(PyExc_ValueError
, "substring not found");
7453 return PyInt_FromSsize_t(result
);
7456 PyDoc_STRVAR(rjust__doc__
,
7457 "S.rjust(width[, fillchar]) -> unicode\n\
7459 Return S right justified in a Unicode string of length width. Padding is\n\
7460 done using the specified fill character (default is a space).");
7463 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
7466 Py_UNICODE fillchar
= ' ';
7468 if (!PyArg_ParseTuple(args
, "n|O&:rjust", &width
, convert_uc
, &fillchar
))
7471 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
7473 return (PyObject
*) self
;
7476 return (PyObject
*) pad(self
, width
- self
->length
, 0, fillchar
);
7480 unicode_slice(PyUnicodeObject
*self
, Py_ssize_t start
, Py_ssize_t end
)
7482 /* standard clamping */
7487 if (end
> self
->length
)
7489 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
7490 /* full slice, return original string */
7492 return (PyObject
*) self
;
7497 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
7501 PyObject
*PyUnicode_Split(PyObject
*s
,
7503 Py_ssize_t maxsplit
)
7507 s
= PyUnicode_FromObject(s
);
7511 sep
= PyUnicode_FromObject(sep
);
7518 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
7525 PyDoc_STRVAR(split__doc__
,
7526 "S.split([sep [,maxsplit]]) -> list of strings\n\
7528 Return a list of the words in S, using sep as the\n\
7529 delimiter string. If maxsplit is given, at most maxsplit\n\
7530 splits are done. If sep is not specified or is None, any\n\
7531 whitespace string is a separator and empty strings are\n\
7532 removed from the result.");
7535 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
7537 PyObject
*substring
= Py_None
;
7538 Py_ssize_t maxcount
= -1;
7540 if (!PyArg_ParseTuple(args
, "|On:split", &substring
, &maxcount
))
7543 if (substring
== Py_None
)
7544 return split(self
, NULL
, maxcount
);
7545 else if (PyUnicode_Check(substring
))
7546 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
7548 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
7552 PyUnicode_Partition(PyObject
*str_in
, PyObject
*sep_in
)
7558 str_obj
= PyUnicode_FromObject(str_in
);
7561 sep_obj
= PyUnicode_FromObject(sep_in
);
7567 out
= stringlib_partition(
7568 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7569 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
7580 PyUnicode_RPartition(PyObject
*str_in
, PyObject
*sep_in
)
7586 str_obj
= PyUnicode_FromObject(str_in
);
7589 sep_obj
= PyUnicode_FromObject(sep_in
);
7595 out
= stringlib_rpartition(
7596 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7597 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
7606 PyDoc_STRVAR(partition__doc__
,
7607 "S.partition(sep) -> (head, sep, tail)\n\
7609 Searches for the separator sep in S, and returns the part before it,\n\
7610 the separator itself, and the part after it. If the separator is not\n\
7611 found, returns S and two empty strings.");
7614 unicode_partition(PyUnicodeObject
*self
, PyObject
*separator
)
7616 return PyUnicode_Partition((PyObject
*)self
, separator
);
7619 PyDoc_STRVAR(rpartition__doc__
,
7620 "S.rpartition(sep) -> (tail, sep, head)\n\
7622 Searches for the separator sep in S, starting at the end of S, and returns\n\
7623 the part before it, the separator itself, and the part after it. If the\n\
7624 separator is not found, returns two empty strings and S.");
7627 unicode_rpartition(PyUnicodeObject
*self
, PyObject
*separator
)
7629 return PyUnicode_RPartition((PyObject
*)self
, separator
);
7632 PyObject
*PyUnicode_RSplit(PyObject
*s
,
7634 Py_ssize_t maxsplit
)
7638 s
= PyUnicode_FromObject(s
);
7642 sep
= PyUnicode_FromObject(sep
);
7649 result
= rsplit((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
7656 PyDoc_STRVAR(rsplit__doc__
,
7657 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7659 Return a list of the words in S, using sep as the\n\
7660 delimiter string, starting at the end of the string and\n\
7661 working to the front. If maxsplit is given, at most maxsplit\n\
7662 splits are done. If sep is not specified, any whitespace string\n\
7666 unicode_rsplit(PyUnicodeObject
*self
, PyObject
*args
)
7668 PyObject
*substring
= Py_None
;
7669 Py_ssize_t maxcount
= -1;
7671 if (!PyArg_ParseTuple(args
, "|On:rsplit", &substring
, &maxcount
))
7674 if (substring
== Py_None
)
7675 return rsplit(self
, NULL
, maxcount
);
7676 else if (PyUnicode_Check(substring
))
7677 return rsplit(self
, (PyUnicodeObject
*)substring
, maxcount
);
7679 return PyUnicode_RSplit((PyObject
*)self
, substring
, maxcount
);
7682 PyDoc_STRVAR(splitlines__doc__
,
7683 "S.splitlines([keepends]]) -> list of strings\n\
7685 Return a list of the lines in S, breaking at line boundaries.\n\
7686 Line breaks are not included in the resulting list unless keepends\n\
7687 is given and true.");
7690 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
7694 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
7697 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
7701 PyObject
*unicode_str(PyUnicodeObject
*self
)
7703 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
7706 PyDoc_STRVAR(swapcase__doc__
,
7707 "S.swapcase() -> unicode\n\
7709 Return a copy of S with uppercase characters converted to lowercase\n\
7713 unicode_swapcase(PyUnicodeObject
*self
)
7715 return fixup(self
, fixswapcase
);
7718 PyDoc_STRVAR(translate__doc__
,
7719 "S.translate(table) -> unicode\n\
7721 Return a copy of the string S, where all characters have been mapped\n\
7722 through the given translation table, which must be a mapping of\n\
7723 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7724 Unmapped characters are left untouched. Characters mapped to None\n\
7728 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
7730 return PyUnicode_TranslateCharmap(self
->str
,
7736 PyDoc_STRVAR(upper__doc__
,
7737 "S.upper() -> unicode\n\
7739 Return a copy of S converted to uppercase.");
7742 unicode_upper(PyUnicodeObject
*self
)
7744 return fixup(self
, fixupper
);
7747 PyDoc_STRVAR(zfill__doc__
,
7748 "S.zfill(width) -> unicode\n\
7750 Pad a numeric string x with zeros on the left, to fill a field\n\
7751 of the specified width. The string x is never truncated.");
7754 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
7760 if (!PyArg_ParseTuple(args
, "n:zfill", &width
))
7763 if (self
->length
>= width
) {
7764 if (PyUnicode_CheckExact(self
)) {
7766 return (PyObject
*) self
;
7769 return PyUnicode_FromUnicode(
7770 PyUnicode_AS_UNICODE(self
),
7771 PyUnicode_GET_SIZE(self
)
7775 fill
= width
- self
->length
;
7777 u
= pad(self
, fill
, 0, '0');
7782 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
7783 /* move sign to beginning of string */
7784 u
->str
[0] = u
->str
[fill
];
7788 return (PyObject
*) u
;
7793 free_listsize(PyUnicodeObject
*self
)
7795 return PyInt_FromLong(numfree
);
7799 PyDoc_STRVAR(startswith__doc__
,
7800 "S.startswith(prefix[, start[, end]]) -> bool\n\
7802 Return True if S starts with the specified prefix, False otherwise.\n\
7803 With optional start, test S beginning at that position.\n\
7804 With optional end, stop comparing S at that position.\n\
7805 prefix can also be a tuple of strings to try.");
7808 unicode_startswith(PyUnicodeObject
*self
,
7812 PyUnicodeObject
*substring
;
7813 Py_ssize_t start
= 0;
7814 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7817 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &subobj
,
7818 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
7820 if (PyTuple_Check(subobj
)) {
7822 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7823 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7824 PyTuple_GET_ITEM(subobj
, i
));
7825 if (substring
== NULL
)
7827 result
= tailmatch(self
, substring
, start
, end
, -1);
7828 Py_DECREF(substring
);
7833 /* nothing matched */
7836 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7837 if (substring
== NULL
)
7839 result
= tailmatch(self
, substring
, start
, end
, -1);
7840 Py_DECREF(substring
);
7841 return PyBool_FromLong(result
);
7845 PyDoc_STRVAR(endswith__doc__
,
7846 "S.endswith(suffix[, start[, end]]) -> bool\n\
7848 Return True if S ends with the specified suffix, False otherwise.\n\
7849 With optional start, test S beginning at that position.\n\
7850 With optional end, stop comparing S at that position.\n\
7851 suffix can also be a tuple of strings to try.");
7854 unicode_endswith(PyUnicodeObject
*self
,
7858 PyUnicodeObject
*substring
;
7859 Py_ssize_t start
= 0;
7860 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7863 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &subobj
,
7864 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
7866 if (PyTuple_Check(subobj
)) {
7868 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7869 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7870 PyTuple_GET_ITEM(subobj
, i
));
7871 if (substring
== NULL
)
7873 result
= tailmatch(self
, substring
, start
, end
, +1);
7874 Py_DECREF(substring
);
7881 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7882 if (substring
== NULL
)
7885 result
= tailmatch(self
, substring
, start
, end
, +1);
7886 Py_DECREF(substring
);
7887 return PyBool_FromLong(result
);
7891 /* Implements do_string_format, which is unicode because of stringlib */
7892 #include "stringlib/string_format.h"
7894 PyDoc_STRVAR(format__doc__
,
7895 "S.format(*args, **kwargs) -> unicode\n\
7900 unicode__format__(PyObject
*self
, PyObject
*args
)
7902 PyObject
*format_spec
;
7903 PyObject
*result
= NULL
;
7904 PyObject
*tmp
= NULL
;
7906 /* If 2.x, convert format_spec to the same type as value */
7907 /* This is to allow things like u''.format('') */
7908 if (!PyArg_ParseTuple(args
, "O:__format__", &format_spec
))
7910 if (!(PyBytes_Check(format_spec
) || PyUnicode_Check(format_spec
))) {
7911 PyErr_Format(PyExc_TypeError
, "__format__ arg must be str "
7912 "or unicode, not %s", Py_TYPE(format_spec
)->tp_name
);
7915 tmp
= PyObject_Unicode(format_spec
);
7920 result
= _PyUnicode_FormatAdvanced(self
,
7921 PyUnicode_AS_UNICODE(format_spec
),
7922 PyUnicode_GET_SIZE(format_spec
));
7928 PyDoc_STRVAR(p_format__doc__
,
7929 "S.__format__(format_spec) -> unicode\n\
7934 unicode__sizeof__(PyUnicodeObject
*v
)
7936 return PyInt_FromSsize_t(sizeof(PyUnicodeObject
) +
7937 sizeof(Py_UNICODE
) * (v
->length
+ 1));
7940 PyDoc_STRVAR(sizeof__doc__
,
7941 "S.__sizeof__() -> size of S in memory, in bytes\n\
7946 unicode_getnewargs(PyUnicodeObject
*v
)
7948 return Py_BuildValue("(u#)", v
->str
, v
->length
);
7952 static PyMethodDef unicode_methods
[] = {
7954 /* Order is according to common usage: often used methods should
7955 appear first, since lookup is done sequentially. */
7957 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
, encode__doc__
},
7958 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
7959 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
7960 {"rsplit", (PyCFunction
) unicode_rsplit
, METH_VARARGS
, rsplit__doc__
},
7961 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
7962 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
7963 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
7964 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
7965 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
7966 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
7967 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
7968 {"partition", (PyCFunction
) unicode_partition
, METH_O
, partition__doc__
},
7969 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
7970 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
7971 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
7972 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_VARARGS
, lstrip__doc__
},
7973 {"decode", (PyCFunction
) unicode_decode
, METH_VARARGS
, decode__doc__
},
7974 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7975 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
7976 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
7977 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
7978 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_VARARGS
, rstrip__doc__
},
7979 {"rpartition", (PyCFunction
) unicode_rpartition
, METH_O
, rpartition__doc__
},
7980 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
7981 {"strip", (PyCFunction
) unicode_strip
, METH_VARARGS
, strip__doc__
},
7982 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
7983 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
7984 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
7985 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
7986 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
7987 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
7988 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
7989 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
7990 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
7991 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
7992 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
7993 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
7994 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
7995 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
7996 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
7997 {"format", (PyCFunction
) do_string_format
, METH_VARARGS
| METH_KEYWORDS
, format__doc__
},
7998 {"__format__", (PyCFunction
) unicode__format__
, METH_VARARGS
, p_format__doc__
},
7999 {"_formatter_field_name_split", (PyCFunction
) formatter_field_name_split
, METH_NOARGS
},
8000 {"_formatter_parser", (PyCFunction
) formatter_parser
, METH_NOARGS
},
8001 {"__sizeof__", (PyCFunction
) unicode__sizeof__
, METH_NOARGS
, sizeof__doc__
},
8003 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
8007 /* This one is just used for debugging the implementation. */
8008 {"freelistsize", (PyCFunction
) free_listsize
, METH_NOARGS
},
8011 {"__getnewargs__", (PyCFunction
)unicode_getnewargs
, METH_NOARGS
},
8016 unicode_mod(PyObject
*v
, PyObject
*w
)
8018 if (!PyUnicode_Check(v
)) {
8019 Py_INCREF(Py_NotImplemented
);
8020 return Py_NotImplemented
;
8022 return PyUnicode_Format(v
, w
);
8025 static PyNumberMethods unicode_as_number
= {
8030 unicode_mod
, /*nb_remainder*/
8033 static PySequenceMethods unicode_as_sequence
= {
8034 (lenfunc
) unicode_length
, /* sq_length */
8035 PyUnicode_Concat
, /* sq_concat */
8036 (ssizeargfunc
) unicode_repeat
, /* sq_repeat */
8037 (ssizeargfunc
) unicode_getitem
, /* sq_item */
8038 (ssizessizeargfunc
) unicode_slice
, /* sq_slice */
8039 0, /* sq_ass_item */
8040 0, /* sq_ass_slice */
8041 PyUnicode_Contains
, /* sq_contains */
8045 unicode_subscript(PyUnicodeObject
* self
, PyObject
* item
)
8047 if (PyIndex_Check(item
)) {
8048 Py_ssize_t i
= PyNumber_AsSsize_t(item
, PyExc_IndexError
);
8049 if (i
== -1 && PyErr_Occurred())
8052 i
+= PyUnicode_GET_SIZE(self
);
8053 return unicode_getitem(self
, i
);
8054 } else if (PySlice_Check(item
)) {
8055 Py_ssize_t start
, stop
, step
, slicelength
, cur
, i
;
8056 Py_UNICODE
* source_buf
;
8057 Py_UNICODE
* result_buf
;
8060 if (PySlice_GetIndicesEx((PySliceObject
*)item
, PyUnicode_GET_SIZE(self
),
8061 &start
, &stop
, &step
, &slicelength
) < 0) {
8065 if (slicelength
<= 0) {
8066 return PyUnicode_FromUnicode(NULL
, 0);
8067 } else if (start
== 0 && step
== 1 && slicelength
== self
->length
&&
8068 PyUnicode_CheckExact(self
)) {
8070 return (PyObject
*)self
;
8071 } else if (step
== 1) {
8072 return PyUnicode_FromUnicode(self
->str
+ start
, slicelength
);
8074 source_buf
= PyUnicode_AS_UNICODE((PyObject
*)self
);
8075 result_buf
= (Py_UNICODE
*)PyObject_MALLOC(slicelength
*
8076 sizeof(Py_UNICODE
));
8078 if (result_buf
== NULL
)
8079 return PyErr_NoMemory();
8081 for (cur
= start
, i
= 0; i
< slicelength
; cur
+= step
, i
++) {
8082 result_buf
[i
] = source_buf
[cur
];
8085 result
= PyUnicode_FromUnicode(result_buf
, slicelength
);
8086 PyObject_FREE(result_buf
);
8090 PyErr_SetString(PyExc_TypeError
, "string indices must be integers");
8095 static PyMappingMethods unicode_as_mapping
= {
8096 (lenfunc
)unicode_length
, /* mp_length */
8097 (binaryfunc
)unicode_subscript
, /* mp_subscript */
8098 (objobjargproc
)0, /* mp_ass_subscript */
8102 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
8107 PyErr_SetString(PyExc_SystemError
,
8108 "accessing non-existent unicode segment");
8111 *ptr
= (void *) self
->str
;
8112 return PyUnicode_GET_DATA_SIZE(self
);
8116 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, Py_ssize_t index
,
8119 PyErr_SetString(PyExc_TypeError
,
8120 "cannot use unicode as modifiable buffer");
8125 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
8129 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
8134 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
8141 PyErr_SetString(PyExc_SystemError
,
8142 "accessing non-existent unicode segment");
8145 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
8148 *ptr
= (void *) PyString_AS_STRING(str
);
8149 return PyString_GET_SIZE(str
);
8152 /* Helpers for PyUnicode_Format() */
8155 getnextarg(PyObject
*args
, Py_ssize_t arglen
, Py_ssize_t
*p_argidx
)
8157 Py_ssize_t argidx
= *p_argidx
;
8158 if (argidx
< arglen
) {
8163 return PyTuple_GetItem(args
, argidx
);
8165 PyErr_SetString(PyExc_TypeError
,
8166 "not enough arguments for format string");
/* Bit flags collected while parsing the flag characters of a format
   specifier in PyUnicode_Format(). */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
8177 strtounicode(Py_UNICODE
*buffer
, const char *charbuffer
)
8179 register Py_ssize_t i
;
8180 Py_ssize_t len
= strlen(charbuffer
);
8181 for (i
= len
- 1; i
>= 0; i
--)
8182 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
8188 doubletounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, double x
)
8192 PyOS_ascii_formatd((char *)buffer
, len
, format
, x
);
8193 result
= strtounicode(buffer
, (char *)buffer
);
8194 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
8198 longtounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, long x
)
8202 PyOS_snprintf((char *)buffer
, len
, format
, x
);
8203 result
= strtounicode(buffer
, (char *)buffer
);
8204 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
8207 /* XXX To save some code duplication, formatfloat/long/int could have been
8208 shared with stringobject.c, converting from 8-bit to Unicode after the
8209 formatting is done. */
8212 formatfloat(Py_UNICODE
*buf
,
8219 /* fmt = '%#.' + `prec` + `type`
8220 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
8224 x
= PyFloat_AsDouble(v
);
8225 if (x
== -1.0 && PyErr_Occurred())
8229 if (type
== 'f' && (fabs(x
) / 1e25
) >= 1e25
)
8231 /* Worst case length calc to ensure no buffer overrun:
8235 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8236 for any double rep.)
8237 len = 1 + prec + 1 + 2 + 5 = 9 + prec
8240 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8241 len = 1 + 50 + 1 + prec = 52 + prec
8243 If prec=0 the effective precision is 1 (the leading digit is
8244 always given), therefore increase the length by one.
8247 if (((type
== 'g' || type
== 'G') &&
8248 buflen
<= (size_t)10 + (size_t)prec
) ||
8249 (type
== 'f' && buflen
<= (size_t)53 + (size_t)prec
)) {
8250 PyErr_SetString(PyExc_OverflowError
,
8251 "formatted float is too long (precision too large?)");
8254 PyOS_snprintf(fmt
, sizeof(fmt
), "%%%s.%d%c",
8255 (flags
&F_ALT
) ? "#" : "",
8257 return doubletounicode(buf
, buflen
, fmt
, x
);
8261 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
8265 PyObject
*str
; /* temporary string object. */
8266 PyUnicodeObject
*result
;
8268 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
8271 result
= _PyUnicode_New(len
);
8276 for (i
= 0; i
< len
; i
++)
8277 result
->str
[i
] = buf
[i
];
8278 result
->str
[len
] = 0;
8280 return (PyObject
*)result
;
8284 formatint(Py_UNICODE
*buf
,
8291 /* fmt = '%#.' + `prec` + 'l' + `type`
8292 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8296 char fmt
[64]; /* plenty big enough! */
8300 x
= PyInt_AsLong(v
);
8301 if (x
== -1 && PyErr_Occurred())
8303 if (x
< 0 && type
== 'u') {
8306 if (x
< 0 && (type
== 'x' || type
== 'X' || type
== 'o'))
8313 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8314 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8316 if (buflen
<= 14 || buflen
<= (size_t)3 + (size_t)prec
) {
8317 PyErr_SetString(PyExc_OverflowError
,
8318 "formatted integer is too long (precision too large?)");
8322 if ((flags
& F_ALT
) &&
8323 (type
== 'x' || type
== 'X')) {
8324 /* When converting under %#x or %#X, there are a number
8325 * of issues that cause pain:
8326 * - when 0 is being converted, the C standard leaves off
8327 * the '0x' or '0X', which is inconsistent with other
8328 * %#x/%#X conversions and inconsistent with Python's
8330 * - there are platforms that violate the standard and
8331 * convert 0 with the '0x' or '0X'
8332 * (Metrowerks, Compaq Tru64)
8333 * - there are platforms that give '0x' when converting
8334 * under %#X, but convert 0 in accordance with the
8335 * standard (OS/2 EMX)
8337 * We can achieve the desired consistency by inserting our
8338 * own '0x' or '0X' prefix, and substituting %x/%X in place
8341 * Note that this is the same approach as used in
8342 * formatint() in stringobject.c
8344 PyOS_snprintf(fmt
, sizeof(fmt
), "%s0%c%%.%dl%c",
8345 sign
, type
, prec
, type
);
8348 PyOS_snprintf(fmt
, sizeof(fmt
), "%s%%%s.%dl%c",
8349 sign
, (flags
&F_ALT
) ? "#" : "",
8353 return longtounicode(buf
, buflen
, fmt
, -x
);
8355 return longtounicode(buf
, buflen
, fmt
, x
);
8359 formatchar(Py_UNICODE
*buf
,
8363 /* presume that the buffer is at least 2 characters long */
8364 if (PyUnicode_Check(v
)) {
8365 if (PyUnicode_GET_SIZE(v
) != 1)
8367 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
8370 else if (PyString_Check(v
)) {
8371 if (PyString_GET_SIZE(v
) != 1)
8373 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
8377 /* Integer input truncated to a character */
8379 x
= PyInt_AsLong(v
);
8380 if (x
== -1 && PyErr_Occurred())
8382 #ifdef Py_UNICODE_WIDE
8383 if (x
< 0 || x
> 0x10ffff) {
8384 PyErr_SetString(PyExc_OverflowError
,
8385 "%c arg not in range(0x110000) "
8386 "(wide Python build)");
8390 if (x
< 0 || x
> 0xffff) {
8391 PyErr_SetString(PyExc_OverflowError
,
8392 "%c arg not in range(0x10000) "
8393 "(narrow Python build)");
8397 buf
[0] = (Py_UNICODE
) x
;
8403 PyErr_SetString(PyExc_TypeError
,
8404 "%c requires int or char");
8408 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8410 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
8411 chars are formatted. XXX This is a magic number. Each formatting
8412 routine does bounds checking to ensure no overflow, but a better
8413 solution may be to malloc a buffer of appropriate size for each
8414 format. For now, the current solution is sufficient.
8416 #define FORMATBUFLEN (size_t)120
8418 PyObject
*PyUnicode_Format(PyObject
*format
,
8421 Py_UNICODE
*fmt
, *res
;
8422 Py_ssize_t fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
8424 PyUnicodeObject
*result
= NULL
;
8425 PyObject
*dict
= NULL
;
8428 if (format
== NULL
|| args
== NULL
) {
8429 PyErr_BadInternalCall();
8432 uformat
= PyUnicode_FromObject(format
);
8433 if (uformat
== NULL
)
8435 fmt
= PyUnicode_AS_UNICODE(uformat
);
8436 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
8438 reslen
= rescnt
= fmtcnt
+ 100;
8439 result
= _PyUnicode_New(reslen
);
8442 res
= PyUnicode_AS_UNICODE(result
);
8444 if (PyTuple_Check(args
)) {
8445 arglen
= PyTuple_Size(args
);
8452 if (Py_TYPE(args
)->tp_as_mapping
&& !PyTuple_Check(args
) &&
8453 !PyObject_TypeCheck(args
, &PyBaseString_Type
))
8456 while (--fmtcnt
>= 0) {
8459 rescnt
= fmtcnt
+ 100;
8461 if (_PyUnicode_Resize(&result
, reslen
) < 0)
8463 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
8469 /* Got a format specifier */
8471 Py_ssize_t width
= -1;
8473 Py_UNICODE c
= '\0';
8477 PyObject
*temp
= NULL
;
8481 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
8485 Py_UNICODE
*keystart
;
8491 PyErr_SetString(PyExc_TypeError
,
8492 "format requires a mapping");
8498 /* Skip over balanced parentheses */
8499 while (pcount
> 0 && --fmtcnt
>= 0) {
8502 else if (*fmt
== '(')
8506 keylen
= fmt
- keystart
- 1;
8507 if (fmtcnt
< 0 || pcount
> 0) {
8508 PyErr_SetString(PyExc_ValueError
,
8509 "incomplete format key");
8513 /* keys are converted to strings using UTF-8 and
8514 then looked up since Python uses strings to hold
8515 variables names etc. in its namespaces and we
8516 wouldn't want to break common idioms. */
8517 key
= PyUnicode_EncodeUTF8(keystart
,
8521 key
= PyUnicode_FromUnicode(keystart
, keylen
);
8529 args
= PyObject_GetItem(dict
, key
);
8538 while (--fmtcnt
>= 0) {
8539 switch (c
= *fmt
++) {
8540 case '-': flags
|= F_LJUST
; continue;
8541 case '+': flags
|= F_SIGN
; continue;
8542 case ' ': flags
|= F_BLANK
; continue;
8543 case '#': flags
|= F_ALT
; continue;
8544 case '0': flags
|= F_ZERO
; continue;
8549 v
= getnextarg(args
, arglen
, &argidx
);
8552 if (!PyInt_Check(v
)) {
8553 PyErr_SetString(PyExc_TypeError
,
8557 width
= PyInt_AsLong(v
);
8565 else if (c
>= '0' && c
<= '9') {
8567 while (--fmtcnt
>= 0) {
8569 if (c
< '0' || c
> '9')
8571 if ((width
*10) / 10 != width
) {
8572 PyErr_SetString(PyExc_ValueError
,
8576 width
= width
*10 + (c
- '0');
8584 v
= getnextarg(args
, arglen
, &argidx
);
8587 if (!PyInt_Check(v
)) {
8588 PyErr_SetString(PyExc_TypeError
,
8592 prec
= PyInt_AsLong(v
);
8598 else if (c
>= '0' && c
<= '9') {
8600 while (--fmtcnt
>= 0) {
8601 c
= Py_CHARMASK(*fmt
++);
8602 if (c
< '0' || c
> '9')
8604 if ((prec
*10) / 10 != prec
) {
8605 PyErr_SetString(PyExc_ValueError
,
8609 prec
= prec
*10 + (c
- '0');
8614 if (c
== 'h' || c
== 'l' || c
== 'L') {
8620 PyErr_SetString(PyExc_ValueError
,
8621 "incomplete format");
8625 v
= getnextarg(args
, arglen
, &argidx
);
8635 /* presume that buffer length is at least 1 */
8642 if (PyUnicode_Check(v
) && c
== 's') {
8649 temp
= PyObject_Unicode(v
);
8651 temp
= PyObject_Repr(v
);
8654 if (PyUnicode_Check(temp
))
8655 /* nothing to do */;
8656 else if (PyString_Check(temp
)) {
8657 /* convert to string to Unicode */
8658 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
8659 PyString_GET_SIZE(temp
),
8669 PyErr_SetString(PyExc_TypeError
,
8670 "%s argument has non-string str()");
8674 pbuf
= PyUnicode_AS_UNICODE(temp
);
8675 len
= PyUnicode_GET_SIZE(temp
);
8676 if (prec
>= 0 && len
> prec
)
8689 if (PyNumber_Check(v
)) {
8690 PyObject
*iobj
=NULL
;
8692 if (PyInt_Check(v
) || (PyLong_Check(v
))) {
8697 iobj
= PyNumber_Int(v
);
8698 if (iobj
==NULL
) iobj
= PyNumber_Long(v
);
8701 if (PyInt_Check(iobj
)) {
8704 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
8705 flags
, prec
, c
, iobj
);
8711 else if (PyLong_Check(iobj
)) {
8713 temp
= formatlong(iobj
, flags
, prec
, c
);
8717 pbuf
= PyUnicode_AS_UNICODE(temp
);
8718 len
= PyUnicode_GET_SIZE(temp
);
8727 PyErr_Format(PyExc_TypeError
,
8728 "%%%c format: a number is required, "
8729 "not %.200s", (char)c
, Py_TYPE(v
)->tp_name
);
8745 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
8756 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
8762 PyErr_Format(PyExc_ValueError
,
8763 "unsupported format character '%c' (0x%x) "
8765 (31<=c
&& c
<=126) ? (char)c
: '?',
8767 (Py_ssize_t
)(fmt
- 1 -
8768 PyUnicode_AS_UNICODE(uformat
)));
8772 if (*pbuf
== '-' || *pbuf
== '+') {
8776 else if (flags
& F_SIGN
)
8778 else if (flags
& F_BLANK
)
8785 if (rescnt
- (sign
!= 0) < width
) {
8787 rescnt
= width
+ fmtcnt
+ 100;
8794 if (_PyUnicode_Resize(&result
, reslen
) < 0) {
8798 res
= PyUnicode_AS_UNICODE(result
)
8808 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8809 assert(pbuf
[0] == '0');
8810 assert(pbuf
[1] == c
);
8821 if (width
> len
&& !(flags
& F_LJUST
)) {
8825 } while (--width
> len
);
8830 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8831 assert(pbuf
[0] == '0');
8832 assert(pbuf
[1] == c
);
8837 Py_UNICODE_COPY(res
, pbuf
, len
);
8840 while (--width
>= len
) {
8844 if (dict
&& (argidx
< arglen
) && c
!= '%') {
8845 PyErr_SetString(PyExc_TypeError
,
8846 "not all arguments converted during string formatting");
8853 if (argidx
< arglen
&& !dict
) {
8854 PyErr_SetString(PyExc_TypeError
,
8855 "not all arguments converted during string formatting");
8859 if (_PyUnicode_Resize(&result
, reslen
- rescnt
) < 0)
8865 return (PyObject
*)result
;
8876 static PyBufferProcs unicode_as_buffer
= {
8877 (readbufferproc
) unicode_buffer_getreadbuf
,
8878 (writebufferproc
) unicode_buffer_getwritebuf
,
8879 (segcountproc
) unicode_buffer_getsegcount
,
8880 (charbufferproc
) unicode_buffer_getcharbuf
,
8884 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
8887 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8890 static char *kwlist
[] = {"string", "encoding", "errors", 0};
8891 char *encoding
= NULL
;
8892 char *errors
= NULL
;
8894 if (type
!= &PyUnicode_Type
)
8895 return unicode_subtype_new(type
, args
, kwds
);
8896 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
8897 kwlist
, &x
, &encoding
, &errors
))
8900 return (PyObject
*)_PyUnicode_New(0);
8901 if (encoding
== NULL
&& errors
== NULL
)
8902 return PyObject_Unicode(x
);
8904 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
8908 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8910 PyUnicodeObject
*tmp
, *pnew
;
8913 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
8914 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
8917 assert(PyUnicode_Check(tmp
));
8918 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
8923 pnew
->str
= (Py_UNICODE
*) PyObject_MALLOC(sizeof(Py_UNICODE
) * (n
+1));
8924 if (pnew
->str
== NULL
) {
8925 _Py_ForgetReference((PyObject
*)pnew
);
8928 return PyErr_NoMemory();
8930 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
8932 pnew
->hash
= tmp
->hash
;
8934 return (PyObject
*)pnew
;
8937 PyDoc_STRVAR(unicode_doc
,
8938 "unicode(string [, encoding[, errors]]) -> object\n\
8940 Create a new Unicode object from the given encoded string.\n\
8941 encoding defaults to the current default string encoding.\n\
8942 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8944 PyTypeObject PyUnicode_Type
= {
8945 PyVarObject_HEAD_INIT(&PyType_Type
, 0)
8946 "unicode", /* tp_name */
8947 sizeof(PyUnicodeObject
), /* tp_size */
8948 0, /* tp_itemsize */
8950 (destructor
)unicode_dealloc
, /* tp_dealloc */
8955 unicode_repr
, /* tp_repr */
8956 &unicode_as_number
, /* tp_as_number */
8957 &unicode_as_sequence
, /* tp_as_sequence */
8958 &unicode_as_mapping
, /* tp_as_mapping */
8959 (hashfunc
) unicode_hash
, /* tp_hash*/
8961 (reprfunc
) unicode_str
, /* tp_str */
8962 PyObject_GenericGetAttr
, /* tp_getattro */
8963 0, /* tp_setattro */
8964 &unicode_as_buffer
, /* tp_as_buffer */
8965 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_CHECKTYPES
|
8966 Py_TPFLAGS_BASETYPE
| Py_TPFLAGS_UNICODE_SUBCLASS
, /* tp_flags */
8967 unicode_doc
, /* tp_doc */
8968 0, /* tp_traverse */
8970 PyUnicode_RichCompare
, /* tp_richcompare */
8971 0, /* tp_weaklistoffset */
8973 0, /* tp_iternext */
8974 unicode_methods
, /* tp_methods */
8977 &PyBaseString_Type
, /* tp_base */
8979 0, /* tp_descr_get */
8980 0, /* tp_descr_set */
8981 0, /* tp_dictoffset */
8984 unicode_new
, /* tp_new */
8985 PyObject_Del
, /* tp_free */
8988 /* Initialize the Unicode implementation */
8990 void _PyUnicode_Init(void)
8994 /* XXX - move this array to unicodectype.c ? */
8995 Py_UNICODE linebreak
[] = {
8996 0x000A, /* LINE FEED */
8997 0x000D, /* CARRIAGE RETURN */
8998 0x001C, /* FILE SEPARATOR */
8999 0x001D, /* GROUP SEPARATOR */
9000 0x001E, /* RECORD SEPARATOR */
9001 0x0085, /* NEXT LINE */
9002 0x2028, /* LINE SEPARATOR */
9003 0x2029, /* PARAGRAPH SEPARATOR */
9006 /* Init the implementation */
9009 unicode_empty
= _PyUnicode_New(0);
9013 strcpy(unicode_default_encoding
, "ascii");
9014 for (i
= 0; i
< 256; i
++)
9015 unicode_latin1
[i
] = NULL
;
9016 if (PyType_Ready(&PyUnicode_Type
) < 0)
9017 Py_FatalError("Can't initialize 'unicode'");
9019 /* initialize the linebreak bloom filter */
9020 bloom_linebreak
= make_bloom_mask(
9021 linebreak
, sizeof(linebreak
) / sizeof(linebreak
[0])
9024 PyType_Ready(&EncodingMapType
);
9027 /* Finalize the Unicode implementation */
9030 PyUnicode_ClearFreeList(void)
9032 int freelist_size
= numfree
;
9035 for (u
= free_list
; u
!= NULL
;) {
9036 PyUnicodeObject
*v
= u
;
9037 u
= *(PyUnicodeObject
**)u
;
9039 PyObject_DEL(v
->str
);
9040 Py_XDECREF(v
->defenc
);
9045 assert(numfree
== 0);
9046 return freelist_size
;
9050 _PyUnicode_Fini(void)
9054 Py_XDECREF(unicode_empty
);
9055 unicode_empty
= NULL
;
9057 for (i
= 0; i
< 256; i
++) {
9058 if (unicode_latin1
[i
]) {
9059 Py_DECREF(unicode_latin1
[i
]);
9060 unicode_latin1
[i
] = NULL
;
9063 (void)PyUnicode_ClearFreeList();
9074 indent-tabs-mode: nil