Include/unicodeobject.h

   1 #ifndef Py_UNICODEOBJECT_H
   2 #define Py_UNICODEOBJECT_H
   3
   4 #include <stdarg.h>
   5
   6 /*
   7
   8 Unicode implementation based on original code by Fredrik Lundh,
   9 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
  10 Unicode Integration Proposal (see file Misc/unicode.txt).
  11
  12 Copyright (c) Corporation for National Research Initiatives.
  13
  14
  15  Original header:
  16  --------------------------------------------------------------------
  17
  18  * Yet another Unicode string type for Python.  This type supports the
  19  * 16-bit Basic Multilingual Plane (BMP) only.
  20  *
  21  * Written by Fredrik Lundh, January 1999.
  22  *
  23  * Copyright (c) 1999 by Secret Labs AB.
  24  * Copyright (c) 1999 by Fredrik Lundh.
  25  *
  26  * fredrik@pythonware.com
  27  * http://www.pythonware.com
  28  *
  29  * --------------------------------------------------------------------
  30  * This Unicode String Type is
  31  *
  32  * Copyright (c) 1999 by Secret Labs AB
  33  * Copyright (c) 1999 by Fredrik Lundh
  34  *
  35  * By obtaining, using, and/or copying this software and/or its
  36  * associated documentation, you agree that you have read, understood,
  37  * and will comply with the following terms and conditions:
  38  *
  39  * Permission to use, copy, modify, and distribute this software and its
  40  * associated documentation for any purpose and without fee is hereby
  41  * granted, provided that the above copyright notice appears in all
  42  * copies, and that both that copyright notice and this permission notice
  43  * appear in supporting documentation, and that the name of Secret Labs
  44  * AB or the author not be used in advertising or publicity pertaining to
  45  * distribution of the software without specific, written prior
  46  * permission.
  47  *
  48  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  49  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  50  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  51  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  52  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  53  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  54  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  55  * -------------------------------------------------------------------- */
  56
  57 #include <ctype.h>
  58
  59 /* === Internal API ======================================================= */
  60
  61 /* --- Internal Unicode Format -------------------------------------------- */
  62
  63 #ifndef Py_USING_UNICODE
  64
  65 #define PyUnicode_Check(op)                 0  /* Py_USING_UNICODE is not defined: constant 0, op is not evaluated */
  66 #define PyUnicode_CheckExact(op)            0  /* likewise constant 0 */
  67
  68 #else
  69
  70 /* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
  71    properly set, but the default rules below don't set it.  I'll
  72    sort this out some other day -- fredrik@pythonware.com */
  73
  74 #ifndef Py_UNICODE_SIZE
  75 #error Must define Py_UNICODE_SIZE
  76 #endif
  77
  78 /* Setting Py_UNICODE_WIDE enables UCS-4 storage.  Otherwise, Unicode
  79    strings are stored as UCS-2 (with limited support for UTF-16) */
  80
  81 #if Py_UNICODE_SIZE >= 4
  82 #define Py_UNICODE_WIDE
  83 #endif
  84
  85 /* Set these flags if the platform has "wchar.h", "wctype.h" and the
  86    wchar_t type is a 16-bit unsigned type */
  87 /* #define HAVE_WCHAR_H */
  88 /* #define HAVE_USABLE_WCHAR_T */
  89
  90 /* Defaults for various platforms */
  91 #ifndef PY_UNICODE_TYPE  /* only pick a default when the build has not chosen one already */
  92
  93 /* Windows has a usable wchar_t type (unless we're using UCS-4) */
  94 # if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
  95 #  define HAVE_USABLE_WCHAR_T
  96 #  define PY_UNICODE_TYPE wchar_t
  97 # endif
  98
  99 # if defined(Py_UNICODE_WIDE)
 100 #  define PY_UNICODE_TYPE Py_UCS4  /* Py_UCS4 is typedef'd further down; this macro is only expanded at the Py_UNICODE typedef */
 101 # endif
 102
 103 #endif  /* NOTE(review): with Py_UNICODE_SIZE == 2 and no MS_WIN32 neither branch fires, so PY_UNICODE_TYPE presumably has to come from the build configuration -- confirm */
 104
 105 /* If the compiler provides a wchar_t type we try to support it
 106    through the interface functions PyUnicode_FromWideChar() and
 107    PyUnicode_AsWideChar(). */
 108
 109 #ifdef HAVE_USABLE_WCHAR_T
 110 # ifndef HAVE_WCHAR_H
 111 #  define HAVE_WCHAR_H
 112 # endif
 113 #endif
 114
 115 #ifdef HAVE_WCHAR_H
 116 /* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
 117 # ifdef _HAVE_BSDI
 118 #  include <time.h>
 119 # endif
 120 #  include <wchar.h>
 121 #endif
 122
 123 /*
 124  * Use this typedef when you need to represent a UTF-16 surrogate pair
 125  * as a single unsigned integer.
 126  */
 127 #if SIZEOF_INT >= 4  /* prefer unsigned int when SIZEOF_INT is 4 or more */
 128 typedef unsigned int Py_UCS4;
 129 #elif SIZEOF_LONG >= 4  /* otherwise fall back to unsigned long */
 130 typedef unsigned long Py_UCS4;
 131 #endif  /* NOTE(review): no #else -- if neither test holds (e.g. SIZEOF_INT/SIZEOF_LONG undefined here) Py_UCS4 stays undeclared; confirm they are always defined before this header */
 132
 133 typedef PY_UNICODE_TYPE Py_UNICODE;
 134
 135 /* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
 136
 137 /* Unicode API names are mangled to ensure that UCS-2 and UCS-4 builds
 138    produce different external names and thus cause import errors when
 139    a Python interpreter and an extension that were compiled with
 140    different Unicode widths are combined. */
 141
 142 #ifndef Py_UNICODE_WIDE
 143
 144 # define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
 145 # define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
 146 # define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
 147 # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
 148 # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
 149 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
 150 # define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
 151 # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
 152 # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
 153 # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
 154 # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
 155 # define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
 156 # define PyUnicode_Compare PyUnicodeUCS2_Compare
 157 # define PyUnicode_Concat PyUnicodeUCS2_Concat
 158 # define PyUnicode_Contains PyUnicodeUCS2_Contains
 159 # define PyUnicode_Count PyUnicodeUCS2_Count
 160 # define PyUnicode_Decode PyUnicodeUCS2_Decode
 161 # define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
 162 # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
 163 # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
 164 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
 165 # define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
 166 # define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
 167 # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
 168 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
 169 # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
 170 # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
 171 # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
 172 # define PyUnicode_Encode PyUnicodeUCS2_Encode
 173 # define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
 174 # define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
 175 # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
 176 # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
 177 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
 178 # define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
 179 # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
 180 # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
 181 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
 182 # define PyUnicode_Find PyUnicodeUCS2_Find
 183 # define PyUnicode_Format PyUnicodeUCS2_Format
 184 # define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
 185 # define PyUnicode_FromObject PyUnicodeUCS2_FromObject
 186 # define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
 187 # define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
 188 # define PyUnicode_FromString PyUnicodeUCS2_FromString
 189 # define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
 190 # define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV
 191 # define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat
 192 # define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
 193 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
 194 # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
 195 # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
 196 # define PyUnicode_Join PyUnicodeUCS2_Join
 197 # define PyUnicode_Partition PyUnicodeUCS2_Partition
 198 # define PyUnicode_RPartition PyUnicodeUCS2_RPartition
 199 # define PyUnicode_RSplit PyUnicodeUCS2_RSplit
 200 # define PyUnicode_Replace PyUnicodeUCS2_Replace
 201 # define PyUnicode_Resize PyUnicodeUCS2_Resize
 202 # define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
 203 # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
 204 # define PyUnicode_Split PyUnicodeUCS2_Split
 205 # define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
 206 # define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
 207 # define PyUnicode_Translate PyUnicodeUCS2_Translate
 208 # define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
 209 # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
 210 # define _PyUnicode_Fini _PyUnicodeUCS2_Fini
 211 # define _PyUnicode_Init _PyUnicodeUCS2_Init
 212 # define PyUnicode_ClearFreeList PyUnicodeUCS2_ClearFreelist
 213 # define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha
 214 # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit
 215 # define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit
 216 # define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
 217 # define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
 218 # define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
 219 # define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
 220 # define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
 221 # define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
 222 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
 223 # define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit
 224 # define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase
 225 # define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric
 226 # define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase
 227 # define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase
 228
 229 #else
 230
 231 # define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
 232 # define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
 233 # define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
 234 # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
 235 # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
 236 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
 237 # define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
 238 # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
 239 # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
 240 # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
 241 # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
 242 # define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
 243 # define PyUnicode_Compare PyUnicodeUCS4_Compare
 244 # define PyUnicode_Concat PyUnicodeUCS4_Concat
 245 # define PyUnicode_Contains PyUnicodeUCS4_Contains
 246 # define PyUnicode_Count PyUnicodeUCS4_Count
 247 # define PyUnicode_Decode PyUnicodeUCS4_Decode
 248 # define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
 249 # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
 250 # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
 251 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
 252 # define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
 253 # define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
 254 # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
 255 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
 256 # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
 257 # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
 258 # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
 259 # define PyUnicode_Encode PyUnicodeUCS4_Encode
 260 # define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
 261 # define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
 262 # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
 263 # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
 264 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
 265 # define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
 266 # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
 267 # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
 268 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
 269 # define PyUnicode_Find PyUnicodeUCS4_Find
 270 # define PyUnicode_Format PyUnicodeUCS4_Format
 271 # define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
 272 # define PyUnicode_FromObject PyUnicodeUCS4_FromObject
 273 # define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
 274 # define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
 275 # define PyUnicode_FromString PyUnicodeUCS4_FromString
 276 # define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
 277 # define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV
 278 # define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat
 279 # define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
 280 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
 281 # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
 282 # define PyUnicode_GetSize PyUnicodeUCS4_GetSize
 283 # define PyUnicode_Join PyUnicodeUCS4_Join
 284 # define PyUnicode_Partition PyUnicodeUCS4_Partition
 285 # define PyUnicode_RPartition PyUnicodeUCS4_RPartition
 286 # define PyUnicode_RSplit PyUnicodeUCS4_RSplit
 287 # define PyUnicode_Replace PyUnicodeUCS4_Replace
 288 # define PyUnicode_Resize PyUnicodeUCS4_Resize
 289 # define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
 290 # define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
 291 # define PyUnicode_Split PyUnicodeUCS4_Split
 292 # define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
 293 # define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
 294 # define PyUnicode_Translate PyUnicodeUCS4_Translate
 295 # define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
 296 # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
 297 # define _PyUnicode_Fini _PyUnicodeUCS4_Fini
 298 # define _PyUnicode_Init _PyUnicodeUCS4_Init
 299 # define PyUnicode_ClearFreeList PyUnicodeUCS4_ClearFreelist /* wide build: UCS4 symbol, matching every other name in this branch */
 300 # define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha
 301 # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit
 302 # define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit
 303 # define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
 304 # define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
 305 # define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
 306 # define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
 307 # define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
 308 # define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
 309 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
 310 # define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit
 311 # define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase
 312 # define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric
 313 # define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase
 314 # define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase
 315
 316
 317 #endif
 318
 319 /* --- Internal Unicode Operations ---------------------------------------- */
 320
 321 /* If you want Python to use the compiler's wctype.h functions instead
 322    of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
 323    configure Python using --with-wctype-functions.  This reduces the
 324    interpreter's code size. */
 325
 326 #if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
 327
 328 #include <wctype.h>
 329
 330 #define Py_UNICODE_ISSPACE(ch) iswspace(ch)
 331
 332 #define Py_UNICODE_ISLOWER(ch) iswlower(ch)
 333 #define Py_UNICODE_ISUPPER(ch) iswupper(ch)
 334 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
 335 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
 336
 337 #define Py_UNICODE_TOLOWER(ch) towlower(ch)
 338 #define Py_UNICODE_TOUPPER(ch) towupper(ch)
 339 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
 340
 341 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
 342 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
 343 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
 344
 345 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
 346 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
 347 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
 348
 349 #define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
 350
 351 #else
 352
 353 /* Since splitting on whitespace is an important use case, and whitespace
 354    in most situations is solely ASCII whitespace, we optimize for the common
 355    case by using a quick look-up table with an inlined check.
 356  */
 357 extern const unsigned char _Py_ascii_whitespace[];
 358
 359 #define Py_UNICODE_ISSPACE(ch) \
 360         ((ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))
 361
 362 #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
 363 #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
 364 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
 365 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
 366
 367 #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
 368 #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
 369 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
 370
 371 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
 372 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
 373 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
 374
 375 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
 376 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
 377 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
 378
 379 #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
 380
 381 #endif
 382
 383 #define Py_UNICODE_ISALNUM(ch) /* alphabetic, decimal, digit or numeric; ch is expanded once per test */ \
 384        (Py_UNICODE_ISALPHA(ch) || \
 385         Py_UNICODE_ISDECIMAL(ch) || \
 386         Py_UNICODE_ISDIGIT(ch) || \
 387         Py_UNICODE_ISNUMERIC(ch))
 388
 389 #define Py_UNICODE_COPY(target, source, length)                         \
 390         Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
 391
 392 #define Py_UNICODE_FILL(target, value, length) do /* target[0 .. length-1] = value */ \
 393     {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
 394         Py_ssize_t len_ = (length); /* evaluate length once, like target and value */ \
 395         for (i_ = 0; i_ < len_; i_++) t_[i_] = v_; } while (0)
 396
 397 /* Check whether substring matches string at the given offset.  The offset
 398    must be valid, and the substring must not be empty.
        Both arguments are dereferenced through ->str and ->length.  The first
        and then the last character are compared before memcmp() is reached
        (presumably as a cheap pre-filter).  string, offset and substring are
        each expanded several times, so they must be free of side effects. */
 399 #define Py_UNICODE_MATCH(string, offset, substring) \
 400     ((*((string)->str + (offset)) == *((substring)->str)) && \
 401     ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
 402      !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))
 403
 404 #ifdef __cplusplus
 405 extern "C" {
 406 #endif
 407
 408 PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
 409
 410 /* --- Unicode Type ------------------------------------------------------- */
 411
 412 typedef struct {                /* Instance layout of a Unicode object */
 413     PyObject_HEAD
 414     Py_ssize_t length;          /* Length of the raw buffer, counted in
                                        Py_UNICODE elements (not bytes) */
 415     Py_UNICODE *str;            /* Raw Unicode buffer */
 416     long hash;                  /* Hash value; -1 if not set */
 417     PyObject *defenc;           /* (Default) Encoded version as Python
 418                                    string, or NULL; this is used for
 419                                    implementing the buffer protocol */
 420 } PyUnicodeObject;
 421
 422 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
 423
 424 #define PyUnicode_Check(op) /* tests the Py_TPFLAGS_UNICODE_SUBCLASS flag on op's type, so subclasses presumably pass too */ \
 425                  PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
 426 #define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type) /* true only when op's type is exactly PyUnicode_Type */
 427
 428 /* Fast access macros: op is cast to PyUnicodeObject * without any type check */
 429 #define PyUnicode_GET_SIZE(op) /* length in Py_UNICODE elements */ \
 430         (((PyUnicodeObject *)(op))->length)
 431 #define PyUnicode_GET_DATA_SIZE(op) /* length in bytes */ \
 432         (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
 433 #define PyUnicode_AS_UNICODE(op) /* the object's own Py_UNICODE buffer, not a copy */ \
 434         (((PyUnicodeObject *)(op))->str)
 435 #define PyUnicode_AS_DATA(op) /* the same buffer viewed as const char * */ \
 436         ((const char *)((PyUnicodeObject *)(op))->str)
 437
 438 /* --- Constants ---------------------------------------------------------- */
 439
 440 /* This Unicode character will be used as replacement character during
 441    decoding if the errors argument is set to "replace". Note: the
 442    Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
 443    Unicode 3.0. */
 444
 445 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
 446
 447 /* === Public API ========================================================= */
 448
 449 /* --- Plain Py_UNICODE --------------------------------------------------- */
 450
 451 /* Create a Unicode Object from the Py_UNICODE buffer u of the given
 452    size.
 453
 454    u may be NULL which causes the contents to be undefined. It is the
 455    user's responsibility to fill in the needed data afterwards. Note
 456    that modifying the Unicode object contents after construction is
 457    only allowed if u was set to NULL.
 458
 459    The buffer is copied into the new object. */
 460
 461 PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
 462     const Py_UNICODE *u,        /* Unicode buffer */
 463     Py_ssize_t size             /* size of buffer */
 464     );
 465
 466 /* Similar to PyUnicode_FromUnicode(), but u points to Latin-1 encoded bytes */
 467 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
 468     const char *u,        /* char buffer */
 469     Py_ssize_t size       /* size of buffer */
 470     );
 471
 472 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
 473    Latin-1 encoded bytes */
 474 PyAPI_FUNC(PyObject*) PyUnicode_FromString(
 475     const char *u        /* string */
 476     );
 477
 478 /* Return a read-only pointer to the Unicode object's internal
 479    Py_UNICODE buffer. */
 480
 481 PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
 482     PyObject *unicode           /* Unicode object */
 483     );
 484
 485 /* Get the length of the Unicode object. */
 486
 487 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
 488     PyObject *unicode           /* Unicode object */
 489     );
 490
 491 /* Get the maximum ordinal for a Unicode character. */
 492 PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
 493
 494 /* Resize an already allocated Unicode object to the new size length.
 495
 496    *unicode is modified to point to the new (resized) object and 0
 497    returned on success.
 498
 499    This API may only be called by the function which also called the
 500    Unicode constructor. The refcount on the object must be 1. Otherwise,
 501    an error is returned.
 502
 503    Error handling is implemented as follows: an exception is set, -1
 504    is returned and *unicode left untouched.
 505
 506 */
 507
 508 PyAPI_FUNC(int) PyUnicode_Resize(
 509     PyObject **unicode,         /* Pointer to the Unicode object */
 510     Py_ssize_t length           /* New length */
 511     );
 512
 513 /* Coerce obj to a Unicode object and return a reference with
 514    *incremented* refcount.
 515
 516    Coercion is done in the following way:
 517
 518    1. String and other char buffer compatible objects are decoded
 519       under the assumptions that they contain data using the current
 520       default encoding. Decoding is done in "strict" mode.
 521
 522    2. All other objects (including Unicode objects) raise an
 523       exception.
 524
 525    The API returns NULL in case of an error. The caller is responsible
 526    for decref'ing the returned objects.
 527
 528 */
 529
 530 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
 531     register PyObject *obj,     /* Object */
 532     const char *encoding,       /* encoding */
 533     const char *errors          /* error handling */
 534     );
 535
 536 /* Coerce obj to a Unicode object and return a reference with
 537    *incremented* refcount.
 538
 539    Unicode objects are passed back as-is (subclasses are converted to
 540    true Unicode objects), all other objects are delegated to
 541    PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
 542    using the default encoding as basis for decoding the object.
 543
 544    The API returns NULL in case of an error. The caller is responsible
 545    for decref'ing the returned objects.
 546
 547 */
 548
 549 PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
 550     register PyObject *obj      /* Object */
 551     );
 552
 553 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(const char*, va_list);
 554 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(const char*, ...);
 555
 556 /* --- wchar_t support for platforms which support it --------------------- */
 557
 558 #ifdef HAVE_WCHAR_H
 559
 560 /* Create a Unicode Object from the wchar_t buffer w of the given
 561    size.
 562
 563    The buffer is copied into the new object. */
 564
 565 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
 566     register const wchar_t *w,  /* wchar_t buffer */
 567     Py_ssize_t size             /* size of buffer */
 568     );
 569
 570 /* Copies the Unicode Object contents into the wchar_t buffer w.  At
 571    most size wchar_t characters are copied.
 572
 573    Note that the resulting wchar_t string may or may not be
 574    0-terminated.  It is the responsibility of the caller to make sure
 575    that the wchar_t string is 0-terminated in case this is required by
 576    the application.
 577
 578    Returns the number of wchar_t characters copied (excluding a
 579    possibly trailing 0-termination character) or -1 in case of an
 580    error. */
 581
 582 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
 583     PyUnicodeObject *unicode,   /* Unicode object */
 584     register wchar_t *w,        /* wchar_t buffer */
 585     Py_ssize_t size             /* size of buffer */
 586     );
 587
 588 #endif
 589
 590 /* --- Unicode ordinals --------------------------------------------------- */
 591
 592 /* Create a Unicode Object from the given Unicode code point ordinal.
 593
 594    The ordinal must be in range(0x10000) on narrow Python builds
 595    (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
 596    raised in case it is not.
 597
 598 */
 599
 600 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
 601
 602 /* === Builtin Codecs =====================================================
 603
 604    Many of these APIs take two arguments encoding and errors. These
 605    parameters encoding and errors have the same semantics as the ones
 606    of the builtin unicode() API.
 607
 608    Setting encoding to NULL causes the default encoding to be used.
 609
 610    Error handling is set by errors which may also be set to NULL
 611    meaning to use the default handling defined for the codec. Default
 612    error handling for all builtin codecs is "strict" (ValueErrors are
 613    raised).
 614
 615    The codecs all use a similar interface. Only deviations from the
 616    generic ones are documented.
 617
 618 */
 619
 620 /* --- Manage the default encoding ---------------------------------------- */
 621
 622 /* Return a Python string holding the default encoded value of the
 623    Unicode object.
 624
 625    The resulting string is cached in the Unicode object for subsequent
 626    usage by this function. The cached version is needed to implement
 627    the character buffer interface and will live (at least) as long as
 628    the Unicode object itself.
 629
 630    The refcount of the string is *not* incremented.
 631
 632    *** Exported for internal use by the interpreter only !!! ***
 633
 634 */
 635
 636 PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
 637     PyObject *, const char *);
 638
 639 /* Returns the currently active default encoding.
 640
 641    The default encoding is currently implemented as run-time settable
 642    process global.  This may change in future versions of the
 643    interpreter to become a parameter which is managed on a per-thread
 644    basis.
 645
 646  */
 647
 648 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
 649
 650 /* Sets the currently active default encoding.
 651
 652    Returns 0 on success, -1 in case of an error.
 653
 654  */
 655
 656 PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding(
 657     const char *encoding        /* Encoding name in standard form */
 658     );
 659
 660 /* --- Generic Codecs ----------------------------------------------------- */
 661
 662 /* Create a Unicode object by decoding the encoded string s of the
 663    given size. */
 664
 665 PyAPI_FUNC(PyObject*) PyUnicode_Decode(
 666     const char *s,              /* encoded string */
 667     Py_ssize_t size,            /* size of buffer */
 668     const char *encoding,       /* encoding */
 669     const char *errors          /* error handling */
 670     );
 671
 672 /* Encodes a Py_UNICODE buffer of the given size and returns a
 673    Python string object. */
 674
 675 PyAPI_FUNC(PyObject*) PyUnicode_Encode(
 676     const Py_UNICODE *s,        /* Unicode char buffer */
 677     Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
 678     const char *encoding,       /* encoding */
 679     const char *errors          /* error handling */
 680     );
 681
 682 /* Encodes a Unicode object and returns the result as Python
 683    object. */
 684
 685 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
 686     PyObject *unicode,          /* Unicode object */
 687     const char *encoding,       /* encoding */
 688     const char *errors          /* error handling */
 689     );
 690
 691 /* Encodes a Unicode object and returns the result as Python string
 692    object. */
 693
 694 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
 695     PyObject *unicode,          /* Unicode object */
 696     const char *encoding,       /* encoding */
 697     const char *errors          /* error handling */
 698     );
 699
 700 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
 701     PyObject* string            /* 256 character map */
 702    );
 703
 704
 705 /* --- UTF-7 Codecs ------------------------------------------------------- */
 706
 707 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
 708     const char *string,         /* UTF-7 encoded string */
 709     Py_ssize_t length,          /* size of string */
 710     const char *errors          /* error handling */
 711     );
 712
 713 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
 714     const char *string,         /* UTF-7 encoded string */
 715     Py_ssize_t length,          /* size of string */
 716     const char *errors,         /* error handling */
 717     Py_ssize_t *consumed        /* bytes consumed */
 718     );
 719
 720 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
 721     const Py_UNICODE *data,     /* Unicode char buffer */
 722     Py_ssize_t length,                  /* number of Py_UNICODE chars to encode */
 723     int encodeSetO,             /* force the encoder to encode characters in
 724                                    Set O, as described in RFC2152 */
 725     int encodeWhiteSpace,       /* force the encoder to encode space, tab,
 726                                    carriage return and linefeed characters */
 727     const char *errors          /* error handling */
 728     );
 729
 730 /* --- UTF-8 Codecs ------------------------------------------------------- */
 731
 732 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
 733     const char *string,         /* UTF-8 encoded string */
 734     Py_ssize_t length,          /* size of string */
 735     const char *errors          /* error handling */
 736     );
 737
 738 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
 739     const char *string,         /* UTF-8 encoded string */
 740     Py_ssize_t length,          /* size of string */
 741     const char *errors,         /* error handling */
 742     Py_ssize_t *consumed                /* bytes consumed */
 743     );
 744
 745 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
 746     PyObject *unicode           /* Unicode object */
 747     );
 748
 749 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
 750     const Py_UNICODE *data,     /* Unicode char buffer */
 751     Py_ssize_t length,                  /* number of Py_UNICODE chars to encode */
 752     const char *errors          /* error handling */
 753     );
 754
 755 /* --- UTF-32 Codecs ------------------------------------------------------ */
 756
 757 /* Decodes length bytes from a UTF-32 encoded buffer string and returns
 758    the corresponding Unicode object.
 759
 760    errors (if non-NULL) defines the error handling. It defaults
 761    to "strict".
 762
 763    If byteorder is non-NULL, the decoder starts decoding using the
 764    given byte order:
 765
 766         *byteorder == -1: little endian
 767         *byteorder == 0:  native order
 768         *byteorder == 1:  big endian
 769
 770    In native mode, the first four bytes of the stream are checked for a
 771    BOM mark. If found, the BOM mark is analysed, the byte order
 772    adjusted and the BOM skipped.  In the other modes, no BOM mark
 773    interpretation is done. After completion, *byteorder is set to the
 774    current byte order at the end of input data.
 775
 776    If byteorder is NULL, the codec starts in native order mode.
 777
 778 */
 779
 780 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
 781     const char *string,         /* UTF-32 encoded string */
 782     Py_ssize_t length,          /* size of string */
 783     const char *errors,         /* error handling */
 784     int *byteorder              /* pointer to byteorder to use
 785                                    0=native;-1=LE,1=BE; updated on
 786                                    exit */
 787     );
 788
 789 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
 790     const char *string,         /* UTF-32 encoded string */
 791     Py_ssize_t length,          /* size of string */
 792     const char *errors,         /* error handling */
 793     int *byteorder,             /* pointer to byteorder to use
 794                                    0=native;-1=LE,1=BE; updated on
 795                                    exit */
 796     Py_ssize_t *consumed        /* bytes consumed */
 797     );
 798
 799 /* Returns a Python string using the UTF-32 encoding in native byte
 800    order. The string always starts with a BOM mark.  */
 801
 802 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
 803     PyObject *unicode           /* Unicode object */
 804     );
 805
 806 /* Returns a Python string object holding the UTF-32 encoded value of
 807    the Unicode data.
 808
 809    If byteorder is not 0, output is written according to the following
 810    byte order:
 811
 812    byteorder == -1: little endian
 813    byteorder == 0:  native byte order (writes a BOM mark)
 814    byteorder == 1:  big endian
 815
 816    If byteorder is 0, the output string will always start with the
 817    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
 818    prepended.
 819
 820 */
 821
 822 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
 823     const Py_UNICODE *data,     /* Unicode char buffer */
 824     Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
 825     const char *errors,         /* error handling */
 826     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
 827     );
 828
 829 /* --- UTF-16 Codecs ------------------------------------------------------ */
 830
 831 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
 832    the corresponding Unicode object.
 833
 834    errors (if non-NULL) defines the error handling. It defaults
 835    to "strict".
 836
 837    If byteorder is non-NULL, the decoder starts decoding using the
 838    given byte order:
 839
 840         *byteorder == -1: little endian
 841         *byteorder == 0:  native order
 842         *byteorder == 1:  big endian
 843
 844    In native mode, the first two bytes of the stream are checked for a
 845    BOM mark. If found, the BOM mark is analysed, the byte order
 846    adjusted and the BOM skipped.  In the other modes, no BOM mark
 847    interpretation is done. After completion, *byteorder is set to the
 848    current byte order at the end of input data.
 849
 850    If byteorder is NULL, the codec starts in native order mode.
 851
 852 */
 853
 854 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
 855     const char *string,         /* UTF-16 encoded string */
 856     Py_ssize_t length,          /* size of string */
 857     const char *errors,         /* error handling */
 858     int *byteorder              /* pointer to byteorder to use
 859                                    0=native;-1=LE,1=BE; updated on
 860                                    exit */
 861     );
 862
 863 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
 864     const char *string,         /* UTF-16 encoded string */
 865     Py_ssize_t length,          /* size of string */
 866     const char *errors,         /* error handling */
 867     int *byteorder,             /* pointer to byteorder to use
 868                                    0=native;-1=LE,1=BE; updated on
 869                                    exit */
 870     Py_ssize_t *consumed                /* bytes consumed */
 871     );
 872
 873 /* Returns a Python string using the UTF-16 encoding in native byte
 874    order. The string always starts with a BOM mark.  */
 875
 876 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
 877     PyObject *unicode           /* Unicode object */
 878     );
 879
 880 /* Returns a Python string object holding the UTF-16 encoded value of
 881    the Unicode data.
 882
 883    If byteorder is not 0, output is written according to the following
 884    byte order:
 885
 886    byteorder == -1: little endian
 887    byteorder == 0:  native byte order (writes a BOM mark)
 888    byteorder == 1:  big endian
 889
 890    If byteorder is 0, the output string will always start with the
 891    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
 892    prepended.
 893
 894    Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
 895    UCS-2. This trick makes it possible to add full UTF-16 capabilities
 896    at a later point without compromising the APIs.
 897
 898 */
 899
 900 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
 901     const Py_UNICODE *data,     /* Unicode char buffer */
 902     Py_ssize_t length,                  /* number of Py_UNICODE chars to encode */
 903     const char *errors,         /* error handling */
 904     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
 905     );
 906
 907 /* --- Unicode-Escape Codecs ---------------------------------------------- */
 908
 909 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
 910     const char *string,         /* Unicode-Escape encoded string */
 911     Py_ssize_t length,          /* size of string */
 912     const char *errors          /* error handling */
 913     );
 914
 915 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
 916     PyObject *unicode           /* Unicode object */
 917     );
 918
 919 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
 920     const Py_UNICODE *data,     /* Unicode char buffer */
 921     Py_ssize_t length                   /* Number of Py_UNICODE chars to encode */
 922     );
 923
 924 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
 925
 926 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
 927     const char *string,         /* Raw-Unicode-Escape encoded string */
 928     Py_ssize_t length,          /* size of string */
 929     const char *errors          /* error handling */
 930     );
 931
 932 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
 933     PyObject *unicode           /* Unicode object */
 934     );
 935
 936 PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
 937     const Py_UNICODE *data,     /* Unicode char buffer */
 938     Py_ssize_t length                   /* Number of Py_UNICODE chars to encode */
 939     );
 940
 941 /* --- Unicode Internal Codec ---------------------------------------------
 942
 943     Only for internal use in _codecsmodule.c */
 944
 945 PyObject *_PyUnicode_DecodeUnicodeInternal(
 946     const char *string,
 947     Py_ssize_t length,
 948     const char *errors
 949     );
 950
 951 /* --- Latin-1 Codecs -----------------------------------------------------
 952
 953    Note: Latin-1 corresponds to the first 256 Unicode ordinals.
 954
 955 */
 956
 957 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
 958     const char *string,         /* Latin-1 encoded string */
 959     Py_ssize_t length,          /* size of string */
 960     const char *errors          /* error handling */
 961     );
 962
 963 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
 964     PyObject *unicode           /* Unicode object */
 965     );
 966
 967 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
 968     const Py_UNICODE *data,     /* Unicode char buffer */
 969     Py_ssize_t length,                  /* Number of Py_UNICODE chars to encode */
 970     const char *errors          /* error handling */
 971     );
 972
 973 /* --- ASCII Codecs -------------------------------------------------------
 974
 975    Only 7-bit ASCII data is accepted. All other codes generate errors.
 976
 977 */
 978
 979 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
 980     const char *string,         /* ASCII encoded string */
 981     Py_ssize_t length,          /* size of string */
 982     const char *errors          /* error handling */
 983     );
 984
 985 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
 986     PyObject *unicode           /* Unicode object */
 987     );
 988
 989 PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
 990     const Py_UNICODE *data,     /* Unicode char buffer */
 991     Py_ssize_t length,                  /* Number of Py_UNICODE chars to encode */
 992     const char *errors          /* error handling */
 993     );
 994
 995 /* --- Character Map Codecs -----------------------------------------------
 996
 997    This codec uses mappings to encode and decode characters.
 998
 999    Decoding mappings must map single string characters to single
1000    Unicode characters, integers (which are then interpreted as Unicode
1001    ordinals) or None (meaning "undefined mapping" and causing an
1002    error).
1003
1004    Encoding mappings must map single Unicode characters to single
1005    string characters, integers (which are then interpreted as Latin-1
1006    ordinals) or None (meaning "undefined mapping" and causing an
1007    error).
1008
1009    If a character lookup fails with a LookupError, the character is
1010    copied as-is meaning that its ordinal value will be interpreted as
1011    Unicode or Latin-1 ordinal resp. Because of this mappings only need
1012    to contain those mappings which map characters to different code
1013    points.
1014
1015 */
1016
1017 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
1018     const char *string,         /* Encoded string */
1019     Py_ssize_t length,          /* size of string */
1020     PyObject *mapping,          /* character mapping
1021                                    (char ordinal -> unicode ordinal) */
1022     const char *errors          /* error handling */
1023     );
1024
1025 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
1026     PyObject *unicode,          /* Unicode object */
1027     PyObject *mapping           /* character mapping
1028                                    (unicode ordinal -> char ordinal) */
1029     );
1030
1031 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
1032     const Py_UNICODE *data,     /* Unicode char buffer */
1033     Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
1034     PyObject *mapping,          /* character mapping
1035                                    (unicode ordinal -> char ordinal) */
1036     const char *errors          /* error handling */
1037     );
1038
1039 /* Translate a Py_UNICODE buffer of the given length by applying a
1040    character mapping table to it and return the resulting Unicode
1041    object.
1042
1043    The mapping table must map Unicode ordinal integers to Unicode
1044    ordinal integers or None (causing deletion of the character).
1045
1046    Mapping tables may be dictionaries or sequences. Unmapped character
1047    ordinals (ones which cause a LookupError) are left untouched and
1048    are copied as-is.
1049
1050 */
1051
1052 PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1053     const Py_UNICODE *data,     /* Unicode char buffer */
1054     Py_ssize_t length,                  /* Number of Py_UNICODE chars to translate */
1055     PyObject *table,            /* Translate table */
1056     const char *errors          /* error handling */
1057     );
1058
1059 #ifdef MS_WIN32
1060
1061 /* --- MBCS codecs for Windows -------------------------------------------- */
1062
1063 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1064     const char *string,         /* MBCS encoded string */
1065     Py_ssize_t length,              /* size of string */
1066     const char *errors          /* error handling */
1067     );
1068
1069 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1070     const char *string,         /* MBCS encoded string */
1071     Py_ssize_t length,          /* size of string */
1072     const char *errors,         /* error handling */
1073     Py_ssize_t *consumed        /* bytes consumed */
1074     );
1075
1076 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1077     PyObject *unicode           /* Unicode object */
1078     );
1079
1080 PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1081     const Py_UNICODE *data,     /* Unicode char buffer */
1082     Py_ssize_t length,              /* Number of Py_UNICODE chars to encode */
1083     const char *errors          /* error handling */
1084     );
1085
1086 #endif /* MS_WIN32 */
1087
1088 /* --- Decimal Encoder ---------------------------------------------------- */
1089
1090 /* Takes a Unicode string holding a decimal value and writes it into
1091    an output buffer using standard ASCII digit codes.
1092
1093    The output buffer has to provide at least length+1 bytes of storage
1094    area. The output string is 0-terminated.
1095
1096    The encoder converts whitespace to ' ', decimal characters to their
1097    corresponding ASCII digit and all other Latin-1 characters except
1098    \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1099    are treated as errors. This includes embedded NUL characters.
1100
1101    Error handling is defined by the errors argument:
1102
1103       NULL or "strict": raise a ValueError
1104       "ignore": ignore the wrong characters (these are not copied to the
1105                 output buffer)
1106       "replace": replaces illegal characters with '?'
1107
1108    Returns 0 on success, -1 on failure.
1109
1110 */
1111
1112 PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1113     Py_UNICODE *s,              /* Unicode buffer */
1114     Py_ssize_t length,                  /* Number of Py_UNICODE chars to encode */
1115     char *output,               /* Output buffer; must have size >= length+1 */
1116     const char *errors          /* error handling */
1117     );
1118
1119 /* --- Methods & Slots ----------------------------------------------------
1120
1121    These are capable of handling Unicode objects and strings on input
1122    (we refer to them as strings in the descriptions) and return
1123    Unicode objects or integers as appropriate. */
1124
1125 /* Concat two strings giving a new Unicode string. */
1126
1127 PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1128     PyObject *left,             /* Left string */
1129     PyObject *right             /* Right string */
1130     );
1131
1132 /* Split a string giving a list of Unicode strings.
1133
1134    If sep is NULL, splitting will be done at all whitespace
1135    substrings. Otherwise, splits occur at the given separator.
1136
1137    At most maxsplit splits will be done. If negative, no limit is set.
1138
1139    Separators are not included in the resulting list.
1140
1141 */
1142
1143 PyAPI_FUNC(PyObject*) PyUnicode_Split(
1144     PyObject *s,                /* String to split */
1145     PyObject *sep,              /* String separator */
1146     Py_ssize_t maxsplit         /* Maxsplit count */
1147     );
1148
1149 /* Ditto, but split at line breaks.
1150
1151    CRLF is considered to be one line break. Line breaks are not
1152    included in the resulting list unless keepends is true. */
1153
1154 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1155     PyObject *s,                /* String to split */
1156     int keepends                /* If true, line end markers are included */
1157     );
1158
1159 /* Partition a string using a given separator. */
1160
1161 PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1162     PyObject *s,                /* String to partition */
1163     PyObject *sep               /* String separator */
1164     );
1165
1166 /* Partition a string using a given separator, searching from the end of the
1167    string. */
1168
1169 PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1170     PyObject *s,                /* String to partition */
1171     PyObject *sep               /* String separator */
1172     );
1173
1174 /* Split a string giving a list of Unicode strings.
1175
1176    If sep is NULL, splitting will be done at all whitespace
1177    substrings. Otherwise, splits occur at the given separator.
1178
1179    At most maxsplit splits will be done. But unlike PyUnicode_Split
1180    PyUnicode_RSplit splits from the end of the string. If negative,
1181    no limit is set.
1182
1183    Separators are not included in the resulting list.
1184
1185 */
1186
1187 PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1188     PyObject *s,                /* String to split */
1189     PyObject *sep,              /* String separator */
1190     Py_ssize_t maxsplit         /* Maxsplit count */
1191     );
1192
1193 /* Translate a string by applying a character mapping table to it and
1194    return the resulting Unicode object.
1195
1196    The mapping table must map Unicode ordinal integers to Unicode
1197    ordinal integers or None (causing deletion of the character).
1198
1199    Mapping tables may be dictionaries or sequences. Unmapped character
1200    ordinals (ones which cause a LookupError) are left untouched and
1201    are copied as-is.
1202
1203 */
1204
1205 PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1206     PyObject *str,              /* String */
1207     PyObject *table,            /* Translate table */
1208     const char *errors          /* error handling */
1209     );
1210
1211 /* Join a sequence of strings using the given separator and return
1212    the resulting Unicode string. */
1213
1214 PyAPI_FUNC(PyObject*) PyUnicode_Join(
1215     PyObject *separator,        /* Separator string */
1216     PyObject *seq               /* Sequence object */
1217     );
1218
1219 /* Return 1 if substr matches str[start:end] at the given tail end, 0
1220    otherwise. */
1221
1222 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1223     PyObject *str,              /* String */
1224     PyObject *substr,           /* Prefix or Suffix string */
1225     Py_ssize_t start,           /* Start index */
1226     Py_ssize_t end,             /* Stop index */
1227     int direction               /* Tail end: -1 prefix, +1 suffix */
1228     );
1229
1230 /* Return the first position of substr in str[start:end] using the
1231    given search direction or -1 if not found. -2 is returned in case
1232    an error occurred and an exception is set. */
1233
1234 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1235     PyObject *str,              /* String */
1236     PyObject *substr,           /* Substring to find */
1237     Py_ssize_t start,           /* Start index */
1238     Py_ssize_t end,             /* Stop index */
1239     int direction               /* Find direction: +1 forward, -1 backward */
1240     );
1241
1242 /* Count the number of occurrences of substr in str[start:end]. */
1243
1244 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1245     PyObject *str,              /* String */
1246     PyObject *substr,           /* Substring to count */
1247     Py_ssize_t start,           /* Start index */
1248     Py_ssize_t end              /* Stop index */
1249     );
1250
1251 /* Replace at most maxcount occurrences of substr in str with replstr
1252    and return the resulting Unicode object. */
1253
1254 PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1255     PyObject *str,              /* String */
1256     PyObject *substr,           /* Substring to find */
1257     PyObject *replstr,          /* Substring to replace */
1258     Py_ssize_t maxcount         /* Max. number of replacements to apply;
1259                                    -1 = all */
1260     );
1261
1262 /* Compare two strings and return -1, 0, 1 for less than, equal,
1263    greater than resp. */
1264
1265 PyAPI_FUNC(int) PyUnicode_Compare(
1266     PyObject *left,             /* Left string */
1267     PyObject *right             /* Right string */
1268     );
1269
1270 /* Rich compare two strings and return one of the following:
1271
1272    - NULL in case an exception was raised
1273    - Py_True or Py_False for successful comparisons
1274    - Py_NotImplemented in case the type combination is unknown
1275
1276    Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1277    case the conversion of the arguments to Unicode fails with a
1278    UnicodeDecodeError.
1279
1280    Possible values for op:
1281
1282      Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1283
1284 */
1285
1286 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1287     PyObject *left,             /* Left string */
1288     PyObject *right,            /* Right string */
1289     int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1290     );
1291
1292 /* Apply an argument tuple or dictionary to a format string and return
1293    the resulting Unicode string. */
1294
1295 PyAPI_FUNC(PyObject *) PyUnicode_Format(
1296     PyObject *format,           /* Format string */
1297     PyObject *args              /* Argument tuple or dictionary */
1298     );
1299
1300 /* Checks whether element is contained in container and return 1/0
1301    accordingly.
1302
1303    element has to coerce to a one-element Unicode string. -1 is
1304    returned in case of an error. */
1305
1306 PyAPI_FUNC(int) PyUnicode_Contains(
1307     PyObject *container,        /* Container string */
1308     PyObject *element           /* Element string */
1309     );
1310
1311 /* Externally visible for str.strip(unicode) */
1312 PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1313     PyUnicodeObject *self,
1314     int striptype,
1315     PyObject *sepobj
1316     );
1317
1318 /* === Characters Type APIs =============================================== */
1319
1320 /* These should not be used directly. Use the Py_UNICODE_IS* and
1321    Py_UNICODE_TO* macros instead.
1322
1323    These APIs are implemented in Objects/unicodectype.c.
1324
1325 */
1326
1327 PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1328     Py_UNICODE ch       /* Unicode character */
1329     );
1330
1331 PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1332     Py_UNICODE ch       /* Unicode character */
1333     );
1334
1335 PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1336     Py_UNICODE ch       /* Unicode character */
1337     );
1338
1339 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1340     const Py_UNICODE ch         /* Unicode character */
1341     );
1342
1343 PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1344     const Py_UNICODE ch         /* Unicode character */
1345     );
1346
1347 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase(
1348     Py_UNICODE ch       /* Unicode character */
1349     );
1350
1351 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase(
1352     Py_UNICODE ch       /* Unicode character */
1353     );
1354
1355 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase(
1356     Py_UNICODE ch       /* Unicode character */
1357     );
1358
1359 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1360     Py_UNICODE ch       /* Unicode character */
1361     );
1362
1363 PyAPI_FUNC(int) _PyUnicode_ToDigit(
1364     Py_UNICODE ch       /* Unicode character */
1365     );
1366
1367 PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1368     Py_UNICODE ch       /* Unicode character */
1369     );
1370
1371 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1372     Py_UNICODE ch       /* Unicode character */
1373     );
1374
1375 PyAPI_FUNC(int) _PyUnicode_IsDigit(
1376     Py_UNICODE ch       /* Unicode character */
1377     );
1378
1379 PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1380     Py_UNICODE ch       /* Unicode character */
1381     );
1382
1383 PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1384     Py_UNICODE ch       /* Unicode character */
1385     );
1386
1387 #ifdef __cplusplus
1388 }
1389 #endif
1390 #endif /* Py_USING_UNICODE */
1391 #endif /* !Py_UNICODEOBJECT_H */