Merged revisions 79352 via svnmerge from
[python/dscho.git] / Include / unicodeobject.h
blob08b518a8d757555e5c89423c957f1d0c2e639e16
1 #ifndef Py_UNICODEOBJECT_H
2 #define Py_UNICODEOBJECT_H
4 #include <stdarg.h>
6 /*
8 Unicode implementation based on original code by Fredrik Lundh,
9 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
10 Unicode Integration Proposal (see file Misc/unicode.txt).
12 Copyright (c) Corporation for National Research Initiatives.
15 Original header:
16 --------------------------------------------------------------------
18 * Yet another Unicode string type for Python. This type supports the
19 * 16-bit Basic Multilingual Plane (BMP) only.
21 * Written by Fredrik Lundh, January 1999.
23 * Copyright (c) 1999 by Secret Labs AB.
24 * Copyright (c) 1999 by Fredrik Lundh.
26 * fredrik@pythonware.com
27 * http://www.pythonware.com
29 * --------------------------------------------------------------------
30 * This Unicode String Type is
32 * Copyright (c) 1999 by Secret Labs AB
33 * Copyright (c) 1999 by Fredrik Lundh
35 * By obtaining, using, and/or copying this software and/or its
36 * associated documentation, you agree that you have read, understood,
37 * and will comply with the following terms and conditions:
39 * Permission to use, copy, modify, and distribute this software and its
40 * associated documentation for any purpose and without fee is hereby
41 * granted, provided that the above copyright notice appears in all
42 * copies, and that both that copyright notice and this permission notice
43 * appear in supporting documentation, and that the name of Secret Labs
44 * AB or the author not be used in advertising or publicity pertaining to
45 * distribution of the software without specific, written prior
46 * permission.
48 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
49 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
50 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
51 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
52 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
53 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
54 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
55 * -------------------------------------------------------------------- */
57 #include <ctype.h>
59 /* === Internal API ======================================================= */
61 /* --- Internal Unicode Format -------------------------------------------- */
/* Python 3.x requires unicode */
#define Py_USING_UNICODE
/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
   properly set, but the default rules below don't set it.  I'll
   sort this out some other day -- fredrik@pythonware.com */
70 #ifndef Py_UNICODE_SIZE
71 #error Must define Py_UNICODE_SIZE
72 #endif
/* Setting Py_UNICODE_WIDE enables UCS-4 storage.  Otherwise, Unicode
   strings are stored as UCS-2 (with limited support for UTF-16). */

#if Py_UNICODE_SIZE >= 4
#define Py_UNICODE_WIDE
#endif
/* Set these flags if the platform has "wchar.h", "wctype.h" and the
   wchar_t type is a 16-bit unsigned type */
/* #define HAVE_WCHAR_H */
/* #define HAVE_USABLE_WCHAR_T */

/* Defaults for various platforms.  They only apply when PY_UNICODE_TYPE
   has not already been defined before this point. */
#ifndef PY_UNICODE_TYPE

/* Windows has a usable wchar_t type (unless we're using UCS-4) */
# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
#  define HAVE_USABLE_WCHAR_T
#  define PY_UNICODE_TYPE wchar_t
# endif

# if defined(Py_UNICODE_WIDE)
#  define PY_UNICODE_TYPE Py_UCS4
# endif

#endif
/* If the compiler provides a wchar_t type we try to support it
   through the interface functions PyUnicode_FromWideChar() and
   PyUnicode_AsWideChar(). */

/* A usable wchar_t implies that <wchar.h> is available. */
#ifdef HAVE_USABLE_WCHAR_T
# ifndef HAVE_WCHAR_H
#  define HAVE_WCHAR_H
# endif
#endif

#ifdef HAVE_WCHAR_H
/* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
# ifdef _HAVE_BSDI
#  include <time.h>
# endif
#  include <wchar.h>
#endif
/*
 * Use this typedef when you need to represent a UTF-16 surrogate pair
 * as single unsigned integer.
 *
 * The first of unsigned int / unsigned long whose SIZEOF_* macro is at
 * least 4 is chosen.
 * NOTE(review): there is no #else branch, so Py_UCS4 stays undeclared
 * when neither SIZEOF_INT nor SIZEOF_LONG is defined as >= 4.
 */
#if SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#elif SIZEOF_LONG >= 4
typedef unsigned long Py_UCS4;
#endif
129 /* Py_UNICODE is the native Unicode storage format (code unit) used by
130 Python and represents a single Unicode element in the Unicode
131 type. */
133 typedef PY_UNICODE_TYPE Py_UNICODE;
/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */

/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds
   produce different external names and thus cause import errors in
   case Python interpreters and extensions with mixed compiled in
   Unicode width assumptions are combined. */

/* The width-specific prefix is selected once here; the single name list
   below then serves both build flavours, so the narrow and wide lists
   cannot drift apart.  Each public macro still expands to exactly
   PyUnicodeUCS2_<name> or PyUnicodeUCS4_<name> (and the underscore-
   prefixed ones to _PyUnicodeUCS2_<name> / _PyUnicodeUCS4_<name>). */
#ifndef Py_UNICODE_WIDE
# define _PyUnicode_MANGLE(name) PyUnicodeUCS2_##name
# define _PyUnicode_MANGLE_PRIVATE(name) _PyUnicodeUCS2_##name
#else
# define _PyUnicode_MANGLE(name) PyUnicodeUCS4_##name
# define _PyUnicode_MANGLE_PRIVATE(name) _PyUnicodeUCS4_##name
#endif

#define PyUnicode_AsASCIIString _PyUnicode_MANGLE(AsASCIIString)
#define PyUnicode_AsCharmapString _PyUnicode_MANGLE(AsCharmapString)
#define PyUnicode_AsDecodedObject _PyUnicode_MANGLE(AsDecodedObject)
#define PyUnicode_AsDecodedUnicode _PyUnicode_MANGLE(AsDecodedUnicode)
#define PyUnicode_AsEncodedObject _PyUnicode_MANGLE(AsEncodedObject)
#define PyUnicode_AsEncodedString _PyUnicode_MANGLE(AsEncodedString)
#define PyUnicode_AsEncodedUnicode _PyUnicode_MANGLE(AsEncodedUnicode)
#define PyUnicode_AsLatin1String _PyUnicode_MANGLE(AsLatin1String)
#define PyUnicode_AsRawUnicodeEscapeString _PyUnicode_MANGLE(AsRawUnicodeEscapeString)
#define PyUnicode_AsUTF32String _PyUnicode_MANGLE(AsUTF32String)
#define PyUnicode_AsUTF16String _PyUnicode_MANGLE(AsUTF16String)
#define PyUnicode_AsUTF8String _PyUnicode_MANGLE(AsUTF8String)
#define PyUnicode_AsUnicode _PyUnicode_MANGLE(AsUnicode)
#define PyUnicode_AsUnicodeEscapeString _PyUnicode_MANGLE(AsUnicodeEscapeString)
#define PyUnicode_AsWideChar _PyUnicode_MANGLE(AsWideChar)
/* The mangled symbol is spelled with a lower-case 'l' ("Freelist"). */
#define PyUnicode_ClearFreeList _PyUnicode_MANGLE(ClearFreelist)
#define PyUnicode_Compare _PyUnicode_MANGLE(Compare)
#define PyUnicode_Concat _PyUnicode_MANGLE(Concat)
#define PyUnicode_Append _PyUnicode_MANGLE(Append)
#define PyUnicode_AppendAndDel _PyUnicode_MANGLE(AppendAndDel)
#define PyUnicode_Contains _PyUnicode_MANGLE(Contains)
#define PyUnicode_Count _PyUnicode_MANGLE(Count)
#define PyUnicode_Decode _PyUnicode_MANGLE(Decode)
#define PyUnicode_DecodeASCII _PyUnicode_MANGLE(DecodeASCII)
#define PyUnicode_DecodeCharmap _PyUnicode_MANGLE(DecodeCharmap)
#define PyUnicode_DecodeLatin1 _PyUnicode_MANGLE(DecodeLatin1)
#define PyUnicode_DecodeFSDefault _PyUnicode_MANGLE(DecodeFSDefault)
#define PyUnicode_DecodeFSDefaultAndSize _PyUnicode_MANGLE(DecodeFSDefaultAndSize)
#define PyUnicode_DecodeRawUnicodeEscape _PyUnicode_MANGLE(DecodeRawUnicodeEscape)
#define PyUnicode_DecodeUTF32 _PyUnicode_MANGLE(DecodeUTF32)
#define PyUnicode_DecodeUTF32Stateful _PyUnicode_MANGLE(DecodeUTF32Stateful)
#define PyUnicode_DecodeUTF16 _PyUnicode_MANGLE(DecodeUTF16)
#define PyUnicode_DecodeUTF16Stateful _PyUnicode_MANGLE(DecodeUTF16Stateful)
#define PyUnicode_DecodeUTF8 _PyUnicode_MANGLE(DecodeUTF8)
#define PyUnicode_DecodeUTF8Stateful _PyUnicode_MANGLE(DecodeUTF8Stateful)
#define PyUnicode_DecodeUnicodeEscape _PyUnicode_MANGLE(DecodeUnicodeEscape)
#define PyUnicode_Encode _PyUnicode_MANGLE(Encode)
#define PyUnicode_EncodeASCII _PyUnicode_MANGLE(EncodeASCII)
#define PyUnicode_EncodeCharmap _PyUnicode_MANGLE(EncodeCharmap)
#define PyUnicode_EncodeDecimal _PyUnicode_MANGLE(EncodeDecimal)
#define PyUnicode_EncodeLatin1 _PyUnicode_MANGLE(EncodeLatin1)
#define PyUnicode_EncodeRawUnicodeEscape _PyUnicode_MANGLE(EncodeRawUnicodeEscape)
#define PyUnicode_EncodeUTF32 _PyUnicode_MANGLE(EncodeUTF32)
#define PyUnicode_EncodeUTF16 _PyUnicode_MANGLE(EncodeUTF16)
#define PyUnicode_EncodeUTF8 _PyUnicode_MANGLE(EncodeUTF8)
#define PyUnicode_EncodeUnicodeEscape _PyUnicode_MANGLE(EncodeUnicodeEscape)
#define PyUnicode_Find _PyUnicode_MANGLE(Find)
#define PyUnicode_Format _PyUnicode_MANGLE(Format)
#define PyUnicode_FromEncodedObject _PyUnicode_MANGLE(FromEncodedObject)
#define PyUnicode_FromFormat _PyUnicode_MANGLE(FromFormat)
#define PyUnicode_FromFormatV _PyUnicode_MANGLE(FromFormatV)
#define PyUnicode_FromObject _PyUnicode_MANGLE(FromObject)
#define PyUnicode_FromOrdinal _PyUnicode_MANGLE(FromOrdinal)
#define PyUnicode_FromString _PyUnicode_MANGLE(FromString)
#define PyUnicode_FromStringAndSize _PyUnicode_MANGLE(FromStringAndSize)
#define PyUnicode_FromUnicode _PyUnicode_MANGLE(FromUnicode)
#define PyUnicode_FromWideChar _PyUnicode_MANGLE(FromWideChar)
#define PyUnicode_FSConverter _PyUnicode_MANGLE(FSConverter)
#define PyUnicode_GetDefaultEncoding _PyUnicode_MANGLE(GetDefaultEncoding)
#define PyUnicode_GetMax _PyUnicode_MANGLE(GetMax)
#define PyUnicode_GetSize _PyUnicode_MANGLE(GetSize)
#define PyUnicode_IsIdentifier _PyUnicode_MANGLE(IsIdentifier)
#define PyUnicode_Join _PyUnicode_MANGLE(Join)
#define PyUnicode_Partition _PyUnicode_MANGLE(Partition)
#define PyUnicode_RPartition _PyUnicode_MANGLE(RPartition)
#define PyUnicode_RSplit _PyUnicode_MANGLE(RSplit)
#define PyUnicode_Replace _PyUnicode_MANGLE(Replace)
#define PyUnicode_Resize _PyUnicode_MANGLE(Resize)
#define PyUnicode_RichCompare _PyUnicode_MANGLE(RichCompare)
#define PyUnicode_SetDefaultEncoding _PyUnicode_MANGLE(SetDefaultEncoding)
#define PyUnicode_Split _PyUnicode_MANGLE(Split)
#define PyUnicode_Splitlines _PyUnicode_MANGLE(Splitlines)
#define PyUnicode_Tailmatch _PyUnicode_MANGLE(Tailmatch)
#define PyUnicode_Translate _PyUnicode_MANGLE(Translate)
#define PyUnicode_TranslateCharmap _PyUnicode_MANGLE(TranslateCharmap)
#define _PyUnicode_AsDefaultEncodedString _PyUnicode_MANGLE_PRIVATE(AsDefaultEncodedString)
#define _PyUnicode_Fini _PyUnicode_MANGLE_PRIVATE(Fini)
#define _PyUnicode_Init _PyUnicode_MANGLE_PRIVATE(Init)
#define _PyUnicode_IsAlpha _PyUnicode_MANGLE_PRIVATE(IsAlpha)
#define _PyUnicode_IsDecimalDigit _PyUnicode_MANGLE_PRIVATE(IsDecimalDigit)
#define _PyUnicode_IsDigit _PyUnicode_MANGLE_PRIVATE(IsDigit)
#define _PyUnicode_IsLinebreak _PyUnicode_MANGLE_PRIVATE(IsLinebreak)
#define _PyUnicode_IsLowercase _PyUnicode_MANGLE_PRIVATE(IsLowercase)
#define _PyUnicode_IsNumeric _PyUnicode_MANGLE_PRIVATE(IsNumeric)
#define _PyUnicode_IsPrintable _PyUnicode_MANGLE_PRIVATE(IsPrintable)
#define _PyUnicode_IsTitlecase _PyUnicode_MANGLE_PRIVATE(IsTitlecase)
#define _PyUnicode_IsXidStart _PyUnicode_MANGLE_PRIVATE(IsXidStart)
#define _PyUnicode_IsXidContinue _PyUnicode_MANGLE_PRIVATE(IsXidContinue)
#define _PyUnicode_IsUppercase _PyUnicode_MANGLE_PRIVATE(IsUppercase)
#define _PyUnicode_IsWhitespace _PyUnicode_MANGLE_PRIVATE(IsWhitespace)
#define _PyUnicode_ToDecimalDigit _PyUnicode_MANGLE_PRIVATE(ToDecimalDigit)
#define _PyUnicode_ToDigit _PyUnicode_MANGLE_PRIVATE(ToDigit)
#define _PyUnicode_ToLowercase _PyUnicode_MANGLE_PRIVATE(ToLowercase)
#define _PyUnicode_ToNumeric _PyUnicode_MANGLE_PRIVATE(ToNumeric)
#define _PyUnicode_ToTitlecase _PyUnicode_MANGLE_PRIVATE(ToTitlecase)
#define _PyUnicode_ToUppercase _PyUnicode_MANGLE_PRIVATE(ToUppercase)
/* --- Internal Unicode Operations ---------------------------------------- */

/* If you want Python to use the compiler's wctype.h functions instead
   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
   configure Python using --with-wctype-functions.  This reduces the
   interpreter's code size. */

#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)

#include <wctype.h>

#define Py_UNICODE_ISSPACE(ch) iswspace(ch)

#define Py_UNICODE_ISLOWER(ch) iswlower(ch)
#define Py_UNICODE_ISUPPER(ch) iswupper(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOLOWER(ch) towlower(ch)
#define Py_UNICODE_TOUPPER(ch) towupper(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)

#else

/* Since splitting on whitespace is an important use case, and
   whitespace in most situations is solely ASCII whitespace, we
   optimize for the common case by using a quick look-up table
   _Py_ascii_whitespace (see below) with an inlined check.

   Note that ch is expanded twice by this macro. */
#define Py_UNICODE_ISSPACE(ch) \
    ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))

#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

#endif
/* True when ch satisfies any of the ISALPHA, ISDECIMAL, ISDIGIT or
   ISNUMERIC predicates.  ch is expanded once per predicate, so avoid
   arguments with side effects. */
#define Py_UNICODE_ISALNUM(ch) \
    (Py_UNICODE_ISALPHA(ch) || \
     Py_UNICODE_ISDECIMAL(ch) || \
     Py_UNICODE_ISDIGIT(ch) || \
     Py_UNICODE_ISNUMERIC(ch))
/* Copy length Py_UNICODE code units from source to target via
   Py_MEMCPY; length counts code units, not bytes. */
#define Py_UNICODE_COPY(target, source, length) \
    Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
/* Store value into the first length code units of target.

   target, value and length are each evaluated exactly once (length used
   to be re-evaluated on every loop iteration), so arguments with side
   effects behave predictably. */
#define Py_UNICODE_FILL(target, value, length) \
    do {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
        Py_ssize_t n_ = (length);\
        for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
    } while (0)
/* Check if substring matches at given offset.  The offset must be
   valid, and the substring must not be empty.

   The first and last code units are compared before falling back to
   memcmp over the whole substring. */

#define Py_UNICODE_MATCH(string, offset, substring) \
    ((*((string)->str + (offset)) == *((substring)->str)) && \
     ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
     !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))
431 #ifdef __cplusplus
432 extern "C" {
433 #endif
435 /* --- Unicode Type ------------------------------------------------------- */
437 typedef struct {
438 PyObject_HEAD
439 Py_ssize_t length; /* Length of raw Unicode data in buffer */
440 Py_UNICODE *str; /* Raw Unicode buffer */
441 long hash; /* Hash value; -1 if not set */
442 int state; /* != 0 if interned. In this case the two
443 * references from the dictionary to this object
444 * are *not* counted in ob_refcnt. */
445 PyObject *defenc; /* (Default) Encoded version as Python
446 string, or NULL; this is used for
447 implementing the buffer protocol */
448 } PyUnicodeObject;
450 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
451 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
/* Interning states; zero means "not interned" (compare the
   PyUnicodeObject.state field, which is != 0 if interned). */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
/* PyUnicode_Check tests the type's Py_TPFLAGS_UNICODE_SUBCLASS flag, so
   it accepts subclasses; PyUnicode_CheckExact requires the type to be
   PyUnicode_Type itself. */
#define PyUnicode_Check(op) \
                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
/* Fast access macros.  Each one asserts PyUnicode_Check(op) and then
   reads the PyUnicodeObject fields directly: GET_SIZE yields the length
   in code units, GET_DATA_SIZE the length in bytes, AS_UNICODE the
   Py_UNICODE buffer and AS_DATA the same buffer as const char *. */
#define PyUnicode_GET_SIZE(op) \
    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length))
#define PyUnicode_GET_DATA_SIZE(op) \
    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)))
#define PyUnicode_AS_UNICODE(op) \
    (assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str))
#define PyUnicode_AS_DATA(op) \
    (assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str))
/* --- Constants ---------------------------------------------------------- */

/* This Unicode character will be used as replacement character during
   decoding if the errors argument is set to "replace". Note: the
   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
   Unicode 3.0. */

#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
480 /* === Public API ========================================================= */
482 /* --- Plain Py_UNICODE --------------------------------------------------- */
484 /* Create a Unicode Object from the Py_UNICODE buffer u of the given
485 size.
487 u may be NULL which causes the contents to be undefined. It is the
488 user's responsibility to fill in the needed data afterwards. Note
489 that modifying the Unicode object contents after construction is
490 only allowed if u was set to NULL.
492 The buffer is copied into the new object. */
494 PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
495 const Py_UNICODE *u, /* Unicode buffer */
496 Py_ssize_t size /* size of buffer */
499 /* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */
500 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
501 const char *u, /* char buffer */
502 Py_ssize_t size /* size of buffer */
505 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
506 Latin-1 encoded bytes */
507 PyAPI_FUNC(PyObject*) PyUnicode_FromString(
508 const char *u /* string */
511 /* Return a read-only pointer to the Unicode object's internal
512 Py_UNICODE buffer. */
514 PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
515 PyObject *unicode /* Unicode object */
518 /* Get the length of the Unicode object. */
520 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
521 PyObject *unicode /* Unicode object */
524 /* Get the maximum ordinal for a Unicode character. */
525 PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
527 /* Resize an already allocated Unicode object to the new size length.
529 *unicode is modified to point to the new (resized) object and 0
530 returned on success.
532 This API may only be called by the function which also called the
533 Unicode constructor. The refcount on the object must be 1. Otherwise,
534 an error is returned.
536 Error handling is implemented as follows: an exception is set, -1
537 is returned and *unicode left untouched.
541 PyAPI_FUNC(int) PyUnicode_Resize(
542 PyObject **unicode, /* Pointer to the Unicode object */
543 Py_ssize_t length /* New length */
546 /* Coerce obj to an Unicode object and return a reference with
547 *incremented* refcount.
549 Coercion is done in the following way:
551 1. String and other char buffer compatible objects are decoded
552 under the assumptions that they contain data using the current
553 default encoding. Decoding is done in "strict" mode.
555 2. All other objects (including Unicode objects) raise an
556 exception.
558 The API returns NULL in case of an error. The caller is responsible
559 for decref'ing the returned objects.
563 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
564 register PyObject *obj, /* Object */
565 const char *encoding, /* encoding */
566 const char *errors /* error handling */
569 /* Coerce obj to an Unicode object and return a reference with
570 *incremented* refcount.
572 Unicode objects are passed back as-is (subclasses are converted to
573 true Unicode objects), all other objects are delegated to
574 PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
575 using the default encoding as basis for decoding the object.
577 The API returns NULL in case of an error. The caller is responsible
578 for decref'ing the returned objects.
582 PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
583 register PyObject *obj /* Object */
586 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);
587 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);
589 /* Format the object based on the format_spec, as defined in PEP 3101
590 (Advanced String Formatting). */
591 PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
592 Py_UNICODE *format_spec,
593 Py_ssize_t format_spec_len);
595 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
596 PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
597 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(const char *);
598 PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
600 /* Use only if you know it's a string */
601 #define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state)
/* --- wchar_t support for platforms which support it --------------------- */

#ifdef HAVE_WCHAR_H

/* Create a Unicode Object from the wchar_t buffer w of the given
   size.

   The buffer is copied into the new object. */

/* `register` has been dropped from the parameters in this section: it
   has no effect in a prototype and is rejected by C++17 compilers. */
PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
    const wchar_t *w,           /* wchar_t buffer */
    Py_ssize_t size             /* size of buffer */
    );

/* Copies the Unicode Object contents into the wchar_t buffer w.  At
   most size wchar_t characters are copied.

   Note that the resulting wchar_t string may or may not be
   0-terminated.  It is the responsibility of the caller to make sure
   that the wchar_t string is 0-terminated in case this is required by
   the application.

   Returns the number of wchar_t characters copied (excluding a
   possibly trailing 0-termination character) or -1 in case of an
   error. */

PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
    PyUnicodeObject *unicode,   /* Unicode object */
    wchar_t *w,                 /* wchar_t buffer */
    Py_ssize_t size             /* size of buffer */
    );

#endif
637 /* --- Unicode ordinals --------------------------------------------------- */
639 /* Create a Unicode Object from the given Unicode code point ordinal.
641 The ordinal must be in range(0x10000) on narrow Python builds
642 (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
643 raised in case it is not.
647 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
649 /* --- Free-list management ----------------------------------------------- */
651 /* Clear the free list used by the Unicode implementation.
653 This can be used to release memory used for objects on the free
654 list back to the Python memory allocator.
658 PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
/* === Builtin Codecs =====================================================

   Many of these APIs take two arguments encoding and errors. These
   parameters encoding and errors have the same semantics as the ones
   of the builtin unicode() API.

   Setting encoding to NULL causes the default encoding to be used.

   Error handling is set by errors which may also be set to NULL
   meaning to use the default handling defined for the codec. Default
   error handling for all builtin codecs is "strict" (ValueErrors are
   raised).

   The codecs all use a similar interface. Only deviations from the
   generic ones are documented.

*/
678 /* --- Manage the default encoding ---------------------------------------- */
680 /* Return a Python string holding the default encoded value of the
681 Unicode object.
683 The resulting string is cached in the Unicode object for subsequent
684 usage by this function. The cached version is needed to implement
685 the character buffer interface and will live (at least) as long as
686 the Unicode object itself.
688 The refcount of the string is *not* incremented.
690 *** Exported for internal use by the interpreter only !!! ***
694 PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
695 PyObject *unicode,
696 const char *errors);
698 /* Returns a pointer to the default encoding (normally, UTF-8) of the
699 Unicode object unicode and the size of the encoded representation
700 in bytes stored in *size.
702 In case of an error, no *size is set.
704 *** This API is for interpreter INTERNAL USE ONLY and will likely
705 *** be removed or changed for Python 3.1.
707 *** If you need to access the Unicode object as UTF-8 bytes string,
708 *** please use PyUnicode_AsUTF8String() instead.
712 PyAPI_FUNC(char *) _PyUnicode_AsStringAndSize(
713 PyObject *unicode,
714 Py_ssize_t *size);
716 /* Returns a pointer to the default encoding (normally, UTf-8) of the
717 Unicode object unicode.
719 Use of this API is DEPRECATED since no size information can be
720 extracted from the returned data.
722 *** This API is for interpreter INTERNAL USE ONLY and will likely
723 *** be removed or changed for Python 3.1.
725 *** If you need to access the Unicode object as UTF-8 bytes string,
726 *** please use PyUnicode_AsUTF8String() instead.
730 PyAPI_FUNC(char *) _PyUnicode_AsString(PyObject *unicode);
732 /* Returns the currently active default encoding.
734 The default encoding is currently implemented as run-time settable
735 process global. This may change in future versions of the
736 interpreter to become a parameter which is managed on a per-thread
737 basis.
741 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
743 /* Sets the currently active default encoding.
745 Returns 0 on success, -1 in case of an error.
749 PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding(
750 const char *encoding /* Encoding name in standard form */
753 /* --- Generic Codecs ----------------------------------------------------- */
755 /* Create a Unicode object by decoding the encoded string s of the
756 given size. */
758 PyAPI_FUNC(PyObject*) PyUnicode_Decode(
759 const char *s, /* encoded string */
760 Py_ssize_t size, /* size of buffer */
761 const char *encoding, /* encoding */
762 const char *errors /* error handling */
765 /* Decode a Unicode object unicode and return the result as Python
766 object. */
768 PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
769 PyObject *unicode, /* Unicode object */
770 const char *encoding, /* encoding */
771 const char *errors /* error handling */
774 /* Decode a Unicode object unicode and return the result as Unicode
775 object. */
777 PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
778 PyObject *unicode, /* Unicode object */
779 const char *encoding, /* encoding */
780 const char *errors /* error handling */
783 /* Encodes a Py_UNICODE buffer of the given size and returns a
784 Python string object. */
786 PyAPI_FUNC(PyObject*) PyUnicode_Encode(
787 const Py_UNICODE *s, /* Unicode char buffer */
788 Py_ssize_t size, /* number of Py_UNICODE chars to encode */
789 const char *encoding, /* encoding */
790 const char *errors /* error handling */
793 /* Encodes a Unicode object and returns the result as Python
794 object. */
796 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
797 PyObject *unicode, /* Unicode object */
798 const char *encoding, /* encoding */
799 const char *errors /* error handling */
802 /* Encodes a Unicode object and returns the result as Python string
803 object. */
805 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
806 PyObject *unicode, /* Unicode object */
807 const char *encoding, /* encoding */
808 const char *errors /* error handling */
811 /* Encodes a Unicode object and returns the result as Unicode
812 object. */
814 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
815 PyObject *unicode, /* Unicode object */
816 const char *encoding, /* encoding */
817 const char *errors /* error handling */
820 /* Build an encoding map. */
822 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
823 PyObject* string /* 256 character map */
826 /* --- UTF-7 Codecs ------------------------------------------------------- */
828 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
829 const char *string, /* UTF-7 encoded string */
830 Py_ssize_t length, /* size of string */
831 const char *errors /* error handling */
834 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
835 const char *string, /* UTF-7 encoded string */
836 Py_ssize_t length, /* size of string */
837 const char *errors, /* error handling */
838 Py_ssize_t *consumed /* bytes consumed */
841 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
842 const Py_UNICODE *data, /* Unicode char buffer */
843 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
844 int base64SetO, /* Encode RFC2152 Set O characters in base64 */
845 int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
846 const char *errors /* error handling */
849 /* --- UTF-8 Codecs ------------------------------------------------------- */
851 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
852 const char *string, /* UTF-8 encoded string */
853 Py_ssize_t length, /* size of string */
854 const char *errors /* error handling */
857 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
858 const char *string, /* UTF-8 encoded string */
859 Py_ssize_t length, /* size of string */
860 const char *errors, /* error handling */
861 Py_ssize_t *consumed /* bytes consumed */
864 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
865 PyObject *unicode /* Unicode object */
868 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
869 const Py_UNICODE *data, /* Unicode char buffer */
870 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
871 const char *errors /* error handling */
874 /* --- UTF-32 Codecs ------------------------------------------------------ */
876 /* Decodes length bytes from a UTF-32 encoded buffer string and returns
877 the corresponding Unicode object.
879 errors (if non-NULL) defines the error handling. It defaults
880 to "strict".
882 If byteorder is non-NULL, the decoder starts decoding using the
883 given byte order:
885 *byteorder == -1: little endian
886 *byteorder == 0: native order
887 *byteorder == 1: big endian
889 In native mode, the first four bytes of the stream are checked for a
890 BOM mark. If found, the BOM mark is analysed, the byte order
891 adjusted and the BOM skipped. In the other modes, no BOM mark
892 interpretation is done. After completion, *byteorder is set to the
893 current byte order at the end of input data.
895 If byteorder is NULL, the codec starts in native order mode.
899 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
900 const char *string, /* UTF-32 encoded string */
901 Py_ssize_t length, /* size of string */
902 const char *errors, /* error handling */
903 int *byteorder /* pointer to byteorder to use
904 0=native;-1=LE,1=BE; updated on
905 exit */
908 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
909 const char *string, /* UTF-32 encoded string */
910 Py_ssize_t length, /* size of string */
911 const char *errors, /* error handling */
912 int *byteorder, /* pointer to byteorder to use
913 0=native;-1=LE,1=BE; updated on
914 exit */
915 Py_ssize_t *consumed /* bytes consumed */
918 /* Returns a Python string using the UTF-32 encoding in native byte
919 order. The string always starts with a BOM mark. */
921 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
922 PyObject *unicode /* Unicode object */
925 /* Returns a Python string object holding the UTF-32 encoded value of
926 the Unicode data.
928 If byteorder is not 0, output is written according to the following
929 byte order:
931 byteorder == -1: little endian
932 byteorder == 0: native byte order (writes a BOM mark)
933 byteorder == 1: big endian
935 If byteorder is 0, the output string will always start with the
936 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
937 prepended.
941 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
942 const Py_UNICODE *data, /* Unicode char buffer */
943 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
944 const char *errors, /* error handling */
945 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
948 /* --- UTF-16 Codecs ------------------------------------------------------ */
950 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
951 the corresponding Unicode object.
953 errors (if non-NULL) defines the error handling. It defaults
954 to "strict".
956 If byteorder is non-NULL, the decoder starts decoding using the
957 given byte order:
959 *byteorder == -1: little endian
960 *byteorder == 0: native order
961 *byteorder == 1: big endian
963 In native mode, the first two bytes of the stream are checked for a
964 BOM mark. If found, the BOM mark is analysed, the byte order
965 adjusted and the BOM skipped. In the other modes, no BOM mark
966 interpretation is done. After completion, *byteorder is set to the
967 current byte order at the end of input data.
969 If byteorder is NULL, the codec starts in native order mode.
973 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
974 const char *string, /* UTF-16 encoded string */
975 Py_ssize_t length, /* size of string */
976 const char *errors, /* error handling */
977 int *byteorder /* pointer to byteorder to use
978 0=native;-1=LE,1=BE; updated on
979 exit */
982 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
983 const char *string, /* UTF-16 encoded string */
984 Py_ssize_t length, /* size of string */
985 const char *errors, /* error handling */
986 int *byteorder, /* pointer to byteorder to use
987 0=native;-1=LE,1=BE; updated on
988 exit */
989 Py_ssize_t *consumed /* bytes consumed */
992 /* Returns a Python string using the UTF-16 encoding in native byte
993 order. The string always starts with a BOM mark. */
995 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
996 PyObject *unicode /* Unicode object */
999 /* Returns a Python string object holding the UTF-16 encoded value of
1000 the Unicode data.
1002 If byteorder is not 0, output is written according to the following
1003 byte order:
1005 byteorder == -1: little endian
1006 byteorder == 0: native byte order (writes a BOM mark)
1007 byteorder == 1: big endian
1009 If byteorder is 0, the output string will always start with the
1010 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
1011 prepended.
1013 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
1014 UCS-2. This trick makes it possible to add full UTF-16 capabilities
1015 at a later point without compromising the APIs.
1019 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
1020 const Py_UNICODE *data, /* Unicode char buffer */
1021 Py_ssize_t length, /* number of Py_UNICODE chars to encode */
1022 const char *errors, /* error handling */
1023 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
1026 /* --- Unicode-Escape Codecs ---------------------------------------------- */
1028 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
1029 const char *string, /* Unicode-Escape encoded string */
1030 Py_ssize_t length, /* size of string */
1031 const char *errors /* error handling */
1034 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
1035 PyObject *unicode /* Unicode object */
1038 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
1039 const Py_UNICODE *data, /* Unicode char buffer */
1040 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
1043 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
1045 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
1046 const char *string, /* Raw-Unicode-Escape encoded string */
1047 Py_ssize_t length, /* size of string */
1048 const char *errors /* error handling */
1051 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
1052 PyObject *unicode /* Unicode object */
1055 PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
1056 const Py_UNICODE *data, /* Unicode char buffer */
1057 Py_ssize_t length /* Number of Py_UNICODE chars to encode */
1060 /* --- Unicode Internal Codec ---------------------------------------------
1062 Only for internal use in _codecsmodule.c */
1064 PyObject *_PyUnicode_DecodeUnicodeInternal(
1065 const char *string,
1066 Py_ssize_t length,
1067 const char *errors
1070 /* --- Latin-1 Codecs -----------------------------------------------------
1072 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
1076 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
1077 const char *string, /* Latin-1 encoded string */
1078 Py_ssize_t length, /* size of string */
1079 const char *errors /* error handling */
1082 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
1083 PyObject *unicode /* Unicode object */
1086 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
1087 const Py_UNICODE *data, /* Unicode char buffer */
1088 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1089 const char *errors /* error handling */
1092 /* --- ASCII Codecs -------------------------------------------------------
1094 Only 7-bit ASCII data is accepted. All other codes generate errors.
1098 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
1099 const char *string, /* ASCII encoded string */
1100 Py_ssize_t length, /* size of string */
1101 const char *errors /* error handling */
1104 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
1105 PyObject *unicode /* Unicode object */
1108 PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
1109 const Py_UNICODE *data, /* Unicode char buffer */
1110 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1111 const char *errors /* error handling */
1114 /* --- Character Map Codecs -----------------------------------------------
1116 This codec uses mappings to encode and decode characters.
1118 Decoding mappings must map single string characters to single
1119 Unicode characters, integers (which are then interpreted as Unicode
1120 ordinals) or None (meaning "undefined mapping" and causing an
1121 error).
1123 Encoding mappings must map single Unicode characters to single
1124 string characters, integers (which are then interpreted as Latin-1
1125 ordinals) or None (meaning "undefined mapping" and causing an
1126 error).
1128 If a character lookup fails with a LookupError, the character is
1129 copied as-is meaning that its ordinal value will be interpreted as
1130 Unicode or Latin-1 ordinal resp. Because of this, mappings only need
1131 to contain those mappings which map characters to different code
1132 points.
1136 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1137 const char *string, /* Encoded string */
1138 Py_ssize_t length, /* size of string */
1139 PyObject *mapping, /* character mapping
1140 (char ordinal -> unicode ordinal) */
1141 const char *errors /* error handling */
1144 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1145 PyObject *unicode, /* Unicode object */
1146 PyObject *mapping /* character mapping
1147 (unicode ordinal -> char ordinal) */
1150 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1151 const Py_UNICODE *data, /* Unicode char buffer */
1152 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1153 PyObject *mapping, /* character mapping
1154 (unicode ordinal -> char ordinal) */
1155 const char *errors /* error handling */
1158 /* Translate a Py_UNICODE buffer of the given length by applying a
1159 character mapping table to it and return the resulting Unicode
1160 object.
1162 The mapping table must map Unicode ordinal integers to Unicode
1163 ordinal integers or None (causing deletion of the character).
1165 Mapping tables may be dictionaries or sequences. Unmapped character
1166 ordinals (ones which cause a LookupError) are left untouched and
1167 are copied as-is.
1171 PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1172 const Py_UNICODE *data, /* Unicode char buffer */
1173 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1174 PyObject *table, /* Translate table */
1175 const char *errors /* error handling */
1178 #ifdef MS_WIN32
1180 /* --- MBCS codecs for Windows -------------------------------------------- */
1182 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1183 const char *string, /* MBCS encoded string */
1184 Py_ssize_t length, /* size of string */
1185 const char *errors /* error handling */
1188 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1189 const char *string, /* MBCS encoded string */
1190 Py_ssize_t length, /* size of string */
1191 const char *errors, /* error handling */
1192 Py_ssize_t *consumed /* bytes consumed */
1195 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1196 PyObject *unicode /* Unicode object */
1199 PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1200 const Py_UNICODE *data, /* Unicode char buffer */
1201 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1202 const char *errors /* error handling */
1205 #endif /* MS_WIN32 */
1207 /* --- Decimal Encoder ---------------------------------------------------- */
1209 /* Takes a Unicode string holding a decimal value and writes it into
1210 an output buffer using standard ASCII digit codes.
1212 The output buffer has to provide at least length+1 bytes of storage
1213 area. The output string is 0-terminated.
1215 The encoder converts whitespace to ' ', decimal characters to their
1216 corresponding ASCII digit and all other Latin-1 characters except
1217 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1218 are treated as errors. This includes embedded NULL bytes.
1220 Error handling is defined by the errors argument:
1222 NULL or "strict": raise a ValueError
1223 "ignore": ignore the wrong characters (these are not copied to the
1224 output buffer)
1225 "replace": replaces illegal characters with '?'
1227 Returns 0 on success, -1 on failure.
1231 PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1232 Py_UNICODE *s, /* Unicode buffer */
1233 Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
1234 char *output, /* Output buffer; must have size >= length+1 */
1235 const char *errors /* error handling */
1238 /* --- File system encoding ---------------------------------------------- */
1240 /* ParseTuple converter which converts a Unicode object into the file
1241 system encoding, using the PEP 383 error handler; bytes objects are
1242 output as-is. */
1244 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
1246 /* Decode a null-terminated string using Py_FileSystemDefaultEncoding.
1248 The encoding is used directly if it is supported by one of the built-in
1249 codecs (i.e., UTF-8, UTF-16, UTF-32, Latin-1 or MBCS); otherwise the function
1250 falls back to UTF-8 and replaces invalid characters with '?'.
1252 The function is intended to be used for paths and file names only
1253 during bootstrapping process where the codecs are not set up.
1256 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
1257 const char *s /* encoded string */
1260 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
1261 const char *s, /* encoded string */
1262 Py_ssize_t size /* size */
1265 /* --- Methods & Slots ----------------------------------------------------
1267 These are capable of handling Unicode objects and strings on input
1268 (we refer to them as strings in the descriptions) and return
1269 Unicode objects or integers as appropriate. */
1271 /* Concat two strings giving a new Unicode string. */
1273 PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1274 PyObject *left, /* Left string */
1275 PyObject *right /* Right string */
1278 /* Concat two strings and put the result in *pleft
1279 (sets *pleft to NULL on error) */
1281 PyAPI_FUNC(void) PyUnicode_Append(
1282 PyObject **pleft, /* Pointer to left string */
1283 PyObject *right /* Right string */
1286 /* Concat two strings, put the result in *pleft and drop the right object
1287 (sets *pleft to NULL on error) */
1289 PyAPI_FUNC(void) PyUnicode_AppendAndDel(
1290 PyObject **pleft, /* Pointer to left string */
1291 PyObject *right /* Right string */
1294 /* Split a string giving a list of Unicode strings.
1296 If sep is NULL, splitting will be done at all whitespace
1297 substrings. Otherwise, splits occur at the given separator.
1299 At most maxsplit splits will be done. If negative, no limit is set.
1301 Separators are not included in the resulting list.
1305 PyAPI_FUNC(PyObject*) PyUnicode_Split(
1306 PyObject *s, /* String to split */
1307 PyObject *sep, /* String separator */
1308 Py_ssize_t maxsplit /* Maxsplit count */
1311 /* Ditto, but split at line breaks.
1313 CRLF is considered to be one line break. Line breaks are not
1314 included in the resulting list. */
1316 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1317 PyObject *s, /* String to split */
1318 int keepends /* If true, line end markers are included */
1321 /* Partition a string using a given separator. */
1323 PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1324 PyObject *s, /* String to partition */
1325 PyObject *sep /* String separator */
1328 /* Partition a string using a given separator, searching from the end of the
1329 string. */
1331 PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1332 PyObject *s, /* String to partition */
1333 PyObject *sep /* String separator */
1336 /* Split a string giving a list of Unicode strings.
1338 If sep is NULL, splitting will be done at all whitespace
1339 substrings. Otherwise, splits occur at the given separator.
1341 At most maxsplit splits will be done; if maxsplit is negative, no
1342 limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1343 the end of the string.
1345 Separators are not included in the resulting list.
1349 PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1350 PyObject *s, /* String to split */
1351 PyObject *sep, /* String separator */
1352 Py_ssize_t maxsplit /* Maxsplit count */
1355 /* Translate a string by applying a character mapping table to it and
1356 return the resulting Unicode object.
1358 The mapping table must map Unicode ordinal integers to Unicode
1359 ordinal integers or None (causing deletion of the character).
1361 Mapping tables may be dictionaries or sequences. Unmapped character
1362 ordinals (ones which cause a LookupError) are left untouched and
1363 are copied as-is.
1367 PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1368 PyObject *str, /* String */
1369 PyObject *table, /* Translate table */
1370 const char *errors /* error handling */
1373 /* Join a sequence of strings using the given separator and return
1374 the resulting Unicode string. */
1376 PyAPI_FUNC(PyObject*) PyUnicode_Join(
1377 PyObject *separator, /* Separator string */
1378 PyObject *seq /* Sequence object */
1381 /* Return 1 if substr matches str[start:end] at the given tail end, 0
1382 otherwise. */
1384 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1385 PyObject *str, /* String */
1386 PyObject *substr, /* Prefix or Suffix string */
1387 Py_ssize_t start, /* Start index */
1388 Py_ssize_t end, /* Stop index */
1389 int direction /* Tail end: -1 prefix, +1 suffix */
1392 /* Return the first position of substr in str[start:end] using the
1393 given search direction or -1 if not found. -2 is returned in case
1394 an error occurred and an exception is set. */
1396 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1397 PyObject *str, /* String */
1398 PyObject *substr, /* Substring to find */
1399 Py_ssize_t start, /* Start index */
1400 Py_ssize_t end, /* Stop index */
1401 int direction /* Find direction: +1 forward, -1 backward */
1404 /* Count the number of occurrences of substr in str[start:end]. */
1406 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1407 PyObject *str, /* String */
1408 PyObject *substr, /* Substring to count */
1409 Py_ssize_t start, /* Start index */
1410 Py_ssize_t end /* Stop index */
1413 /* Replace at most maxcount occurrences of substr in str with replstr
1414 and return the resulting Unicode object. */
1416 PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1417 PyObject *str, /* String */
1418 PyObject *substr, /* Substring to find */
1419 PyObject *replstr, /* Substring to replace */
1420 Py_ssize_t maxcount /* Max. number of replacements to apply;
1421 -1 = all */
1424 /* Compare two strings and return -1, 0, 1 for less than, equal,
1425 greater than resp. */
1427 PyAPI_FUNC(int) PyUnicode_Compare(
1428 PyObject *left, /* Left string */
1429 PyObject *right /* Right string */
1432 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
1433 PyObject *left,
1434 const char *right
1437 /* Rich compare two strings and return one of the following:
1439 - NULL in case an exception was raised
1440 - Py_True or Py_False for successful comparisons
1441 - Py_NotImplemented in case the type combination is unknown
1443 Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1444 case the conversion of the arguments to Unicode fails with a
1445 UnicodeDecodeError.
1447 Possible values for op:
1449 Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1453 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1454 PyObject *left, /* Left string */
1455 PyObject *right, /* Right string */
1456 int op /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1459 /* Apply an argument tuple or dictionary to a format string and return
1460 the resulting Unicode string. */
1462 PyAPI_FUNC(PyObject *) PyUnicode_Format(
1463 PyObject *format, /* Format string */
1464 PyObject *args /* Argument tuple or dictionary */
1467 /* Checks whether element is contained in container and return 1/0
1468 accordingly.
1470 element has to coerce to a one-element Unicode string. -1 is
1471 returned in case of an error. */
1473 PyAPI_FUNC(int) PyUnicode_Contains(
1474 PyObject *container, /* Container string */
1475 PyObject *element /* Element string */
1478 /* Checks whether argument is a valid identifier. */
1480 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1482 /* Externally visible for str.strip(unicode) */
1483 PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1484 PyUnicodeObject *self,
1485 int striptype,
1486 PyObject *sepobj
1489 /* Using the current locale, insert the thousands grouping
1490 into the string pointed to by buffer. For the argument descriptions,
1491 see Objects/stringlib/localeutil.h */
1493 PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buffer,
1494 Py_ssize_t n_buffer,
1495 Py_UNICODE *digits,
1496 Py_ssize_t n_digits,
1497 Py_ssize_t min_width);
1499 /* Using explicit passed-in values, insert the thousands grouping
1500 into the string pointed to by buffer. For the argument descriptions,
1501 see Objects/stringlib/localeutil.h */
1502 PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(Py_UNICODE *buffer,
1503 Py_ssize_t n_buffer,
1504 Py_UNICODE *digits,
1505 Py_ssize_t n_digits,
1506 Py_ssize_t min_width,
1507 const char *grouping,
1508 const char *thousands_sep);
1509 /* === Characters Type APIs =============================================== */
1511 /* Helper array used by Py_UNICODE_ISSPACE(). */
1513 PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1515 /* These should not be used directly. Use the Py_UNICODE_IS* and
1516 Py_UNICODE_TO* macros instead.
1518 These APIs are implemented in Objects/unicodectype.c.
1522 PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1523 Py_UNICODE ch /* Unicode character */
1526 PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1527 Py_UNICODE ch /* Unicode character */
1530 PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1531 Py_UNICODE ch /* Unicode character */
1534 PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1535 Py_UNICODE ch /* Unicode character */
1538 PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1539 Py_UNICODE ch /* Unicode character */
1542 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1543 const Py_UNICODE ch /* Unicode character */
1546 PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1547 const Py_UNICODE ch /* Unicode character */
1550 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase(
1551 Py_UNICODE ch /* Unicode character */
1554 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase(
1555 Py_UNICODE ch /* Unicode character */
1558 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase(
1559 Py_UNICODE ch /* Unicode character */
1562 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1563 Py_UNICODE ch /* Unicode character */
1566 PyAPI_FUNC(int) _PyUnicode_ToDigit(
1567 Py_UNICODE ch /* Unicode character */
1570 PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1571 Py_UNICODE ch /* Unicode character */
1574 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1575 Py_UNICODE ch /* Unicode character */
1578 PyAPI_FUNC(int) _PyUnicode_IsDigit(
1579 Py_UNICODE ch /* Unicode character */
1582 PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1583 Py_UNICODE ch /* Unicode character */
1586 PyAPI_FUNC(int) _PyUnicode_IsPrintable(
1587 Py_UNICODE ch /* Unicode character */
1590 PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1591 Py_UNICODE ch /* Unicode character */
1594 PyAPI_FUNC(size_t) Py_UNICODE_strlen(const Py_UNICODE *u);
1596 PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strcpy(
1597 Py_UNICODE *s1, const Py_UNICODE *s2);
1599 PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strncpy(
1600 Py_UNICODE *s1, const Py_UNICODE *s2, size_t n);
1602 PyAPI_FUNC(int) Py_UNICODE_strcmp(
1603 const Py_UNICODE *s1, const Py_UNICODE *s2);
1605 PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strchr(
1606 const Py_UNICODE *s, Py_UNICODE c
1609 #ifdef __cplusplus
1611 #endif
1612 #endif /* !Py_UNICODEOBJECT_H */