Include/unicodeobject.h

   1 #ifndef Py_UNICODEOBJECT_H
   2 #define Py_UNICODEOBJECT_H
   3
   4 /*
   5
   6 Unicode implementation based on original code by Fredrik Lundh,
   7 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
   8 Unicode Integration Proposal (see file Misc/unicode.txt).
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12
  13  Original header:
  14  --------------------------------------------------------------------
  15
  16  * Yet another Unicode string type for Python.  This type supports the
  17  * 16-bit Basic Multilingual Plane (BMP) only.
  18  *
  19  * Written by Fredrik Lundh, January 1999.
  20  *
  21  * Copyright (c) 1999 by Secret Labs AB.
  22  * Copyright (c) 1999 by Fredrik Lundh.
  23  *
  24  * fredrik@pythonware.com
  25  * http://www.pythonware.com
  26  *
  27  * --------------------------------------------------------------------
  28  * This Unicode String Type is
  29  *
  30  * Copyright (c) 1999 by Secret Labs AB
  31  * Copyright (c) 1999 by Fredrik Lundh
  32  *
  33  * By obtaining, using, and/or copying this software and/or its
  34  * associated documentation, you agree that you have read, understood,
  35  * and will comply with the following terms and conditions:
  36  *
  37  * Permission to use, copy, modify, and distribute this software and its
  38  * associated documentation for any purpose and without fee is hereby
  39  * granted, provided that the above copyright notice appears in all
  40  * copies, and that both that copyright notice and this permission notice
  41  * appear in supporting documentation, and that the name of Secret Labs
  42  * AB or the author not be used in advertising or publicity pertaining to
  43  * distribution of the software without specific, written prior
  44  * permission.
  45  *
  46  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  47  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  48  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  49  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  50  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  51  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  52  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  53  * -------------------------------------------------------------------- */
  54
  55 #include <ctype.h>
  56
  57 /* === Internal API ======================================================= */
  58
  59 /* --- Internal Unicode Format -------------------------------------------- */
  60
  61 #ifndef Py_USING_UNICODE
  62
  63 #define PyUnicode_Check(op)                 0
  64 #define PyUnicode_CheckExact(op)            0
  65
  66 #else
  67
  68 /* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
  69    properly set, but the default rules below don't set it.  I'll
  70    sort this out some other day -- fredrik@pythonware.com */
  71
  72 #ifndef Py_UNICODE_SIZE
  73 #error Must define Py_UNICODE_SIZE
  74 #endif
  75
  76 /* Setting Py_UNICODE_WIDE enables UCS-4 storage.  Otherwise, Unicode
  77    strings are stored as UCS-2 (with limited support for UTF-16) */
  78
  79 #if Py_UNICODE_SIZE >= 4
  80 #define Py_UNICODE_WIDE
  81 #endif
  82
  83 /* Set these flags if the platform has "wchar.h", "wctype.h" and the
  84    wchar_t type is a 16-bit unsigned type */
  85 /* #define HAVE_WCHAR_H */
  86 /* #define HAVE_USABLE_WCHAR_T */
  87
  88 /* Defaults for various platforms: choose the C type behind Py_UNICODE
        unless PY_UNICODE_TYPE was already defined before this point. */
  89 #ifndef PY_UNICODE_TYPE
  90
  91 /* Windows has a usable wchar_t type (unless we're using UCS-4) */
  92 # if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
  93 #  define HAVE_USABLE_WCHAR_T
  94 #  define PY_UNICODE_TYPE wchar_t
  95 # endif
  96
     /* Wide builds use Py_UCS4 as the storage type.  The Py_UCS4 typedef
        only appears further down; that is fine because this macro is not
        expanded until the Py_UNICODE typedef, which follows it. */
  97 # if defined(Py_UNICODE_WIDE)
  98 #  define PY_UNICODE_TYPE Py_UCS4
  99 # endif
 100
     /* NOTE(review): a narrow build without MS_WIN32 gets no default from
        this block; presumably the build configuration defines
        PY_UNICODE_TYPE in that case -- confirm. */
 101 #endif /* !PY_UNICODE_TYPE */
 102
 103 /* If the compiler provides a wchar_t type we try to support it
 104    through the interface functions PyUnicode_FromWideChar() and
 105    PyUnicode_AsWideChar(). */
 106
 107 #ifdef HAVE_USABLE_WCHAR_T /* a usable wchar_t forces HAVE_WCHAR_H on */
 108 # ifndef HAVE_WCHAR_H
 109 #  define HAVE_WCHAR_H
 110 # endif
 111 #endif /* HAVE_USABLE_WCHAR_T */
 112
 113 #ifdef HAVE_WCHAR_H
 114 /* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
 115 # ifdef _HAVE_BSDI
 116 #  include <time.h> /* pulled in before <wchar.h> on BSDI only */
 117 # endif
 118 #  include <wchar.h>
 119 #endif /* HAVE_WCHAR_H */
 120
 121 /*
 122  * Use this typedef when you need to represent a UTF-16 surrogate pair
 123  * as a single unsigned integer.
      *
      * Py_UCS4 is unsigned int when SIZEOF_INT >= 4, otherwise unsigned
      * long when SIZEOF_LONG >= 4.
      *
      * NOTE(review): there is no #else branch, so Py_UCS4 stays undefined
      * (without a diagnostic here) if neither size test holds.
 124  */
 125 #if SIZEOF_INT >= 4
 126 typedef unsigned int Py_UCS4;
 127 #elif SIZEOF_LONG >= 4
 128 typedef unsigned long Py_UCS4;
 129 #endif
 130
     /* Storage unit of a Unicode buffer; the concrete type is whatever
        PY_UNICODE_TYPE expands to at this point. */
 131 typedef PY_UNICODE_TYPE Py_UNICODE;
 132
 133 /* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
 134
 135 /* Unicode API names are mangled to ensure that UCS-2 and UCS-4 builds
 136    produce different external names.  Combining an interpreter and an
 137    extension that were compiled with different Unicode width
 138    assumptions thus causes import errors.

        Without Py_UNICODE_WIDE every name listed below maps to its
        PyUnicodeUCS2_* counterpart; with it, to the PyUnicodeUCS4_* one.

        NOTE(review): some functions declared later in this header (e.g.
        PyUnicode_DecodeUTF7, PyUnicode_EncodeUTF7 and
        PyUnicode_BuildEncodingMap) have no entry in either table --
        confirm whether that is intentional. */
 139
 140 #ifndef Py_UNICODE_WIDE /* narrow (UCS-2) build */
 141
 142 # define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
 143 # define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
 144 # define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
 145 # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
 146 # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
 147 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
 148 # define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
 149 # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
 150 # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
 151 # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
 152 # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
 153 # define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
 154 # define PyUnicode_Compare PyUnicodeUCS2_Compare
 155 # define PyUnicode_Concat PyUnicodeUCS2_Concat
 156 # define PyUnicode_Contains PyUnicodeUCS2_Contains
 157 # define PyUnicode_Count PyUnicodeUCS2_Count
 158 # define PyUnicode_Decode PyUnicodeUCS2_Decode
 159 # define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
 160 # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
 161 # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
 162 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
 163 # define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
 164 # define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
 165 # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
 166 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
 167 # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
 168 # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
 169 # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
 170 # define PyUnicode_Encode PyUnicodeUCS2_Encode
 171 # define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
 172 # define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
 173 # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
 174 # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
 175 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
 176 # define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
 177 # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
 178 # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
 179 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
 180 # define PyUnicode_Find PyUnicodeUCS2_Find
 181 # define PyUnicode_Format PyUnicodeUCS2_Format
 182 # define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
 183 # define PyUnicode_FromObject PyUnicodeUCS2_FromObject
 184 # define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
 185 # define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
 186 # define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
 187 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
 188 # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
 189 # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
 190 # define PyUnicode_Join PyUnicodeUCS2_Join
 191 # define PyUnicode_Partition PyUnicodeUCS2_Partition
 192 # define PyUnicode_RPartition PyUnicodeUCS2_RPartition
 193 # define PyUnicode_RSplit PyUnicodeUCS2_RSplit
 194 # define PyUnicode_Replace PyUnicodeUCS2_Replace
 195 # define PyUnicode_Resize PyUnicodeUCS2_Resize
 196 # define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
 197 # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
 198 # define PyUnicode_Split PyUnicodeUCS2_Split
 199 # define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
 200 # define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
 201 # define PyUnicode_Translate PyUnicodeUCS2_Translate
 202 # define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
 203 # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
 204 # define _PyUnicode_Fini _PyUnicodeUCS2_Fini
 205 # define _PyUnicode_Init _PyUnicodeUCS2_Init
 206 # define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha
 207 # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit
 208 # define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit
 209 # define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
 210 # define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
 211 # define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
 212 # define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
 213 # define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
 214 # define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
 215 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
 216 # define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit
 217 # define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase
 218 # define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric
 219 # define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase
 220 # define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase
 221
 222 #else /* wide (UCS-4) build */
 223
 224 # define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
 225 # define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
 226 # define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
 227 # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
 228 # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
 229 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
 230 # define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
 231 # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
 232 # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
 233 # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
 234 # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
 235 # define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
 236 # define PyUnicode_Compare PyUnicodeUCS4_Compare
 237 # define PyUnicode_Concat PyUnicodeUCS4_Concat
 238 # define PyUnicode_Contains PyUnicodeUCS4_Contains
 239 # define PyUnicode_Count PyUnicodeUCS4_Count
 240 # define PyUnicode_Decode PyUnicodeUCS4_Decode
 241 # define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
 242 # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
 243 # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
 244 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
 245 # define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
 246 # define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
 247 # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
 248 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
 249 # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
 250 # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
 251 # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
 252 # define PyUnicode_Encode PyUnicodeUCS4_Encode
 253 # define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
 254 # define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
 255 # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
 256 # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
 257 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
 258 # define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
 259 # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
 260 # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
 261 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
 262 # define PyUnicode_Find PyUnicodeUCS4_Find
 263 # define PyUnicode_Format PyUnicodeUCS4_Format
 264 # define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
 265 # define PyUnicode_FromObject PyUnicodeUCS4_FromObject
 266 # define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
 267 # define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
 268 # define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
 269 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
 270 # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
 271 # define PyUnicode_GetSize PyUnicodeUCS4_GetSize
 272 # define PyUnicode_Join PyUnicodeUCS4_Join
 273 # define PyUnicode_Partition PyUnicodeUCS4_Partition
 274 # define PyUnicode_RPartition PyUnicodeUCS4_RPartition
 275 # define PyUnicode_RSplit PyUnicodeUCS4_RSplit
 276 # define PyUnicode_Replace PyUnicodeUCS4_Replace
 277 # define PyUnicode_Resize PyUnicodeUCS4_Resize
 278 # define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
 279 # define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
 280 # define PyUnicode_Split PyUnicodeUCS4_Split
 281 # define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
 282 # define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
 283 # define PyUnicode_Translate PyUnicodeUCS4_Translate
 284 # define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
 285 # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
 286 # define _PyUnicode_Fini _PyUnicodeUCS4_Fini
 287 # define _PyUnicode_Init _PyUnicodeUCS4_Init
 288 # define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha
 289 # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit
 290 # define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit
 291 # define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
 292 # define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
 293 # define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
 294 # define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
 295 # define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
 296 # define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
 297 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
 298 # define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit
 299 # define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase
 300 # define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric
 301 # define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase
 302 # define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase
 303
 304
 305 #endif /* Py_UNICODE_WIDE */
 306
 307 /* --- Internal Unicode Operations ---------------------------------------- */
 308
 309 /* If you want Python to use the compiler's wctype.h functions instead
 310    of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
 311    configure Python using --with-wctype-functions.  This reduces the
 312    interpreter's code size.

        The wctype.h variants are only selected when HAVE_USABLE_WCHAR_T is
        defined as well.  Even then only the ISSPACE, ISLOWER, ISUPPER,
        TOLOWER, TOUPPER and ISALPHA macros switch to wctype.h; the title
        case, line break and decimal/digit/numeric macros expand to the
        _PyUnicode_* functions in both branches. */
 313
 314 #if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
 315
 316 #include <wctype.h>
 317
 318 #define Py_UNICODE_ISSPACE(ch) iswspace(ch)
 319
 320 #define Py_UNICODE_ISLOWER(ch) iswlower(ch)
 321 #define Py_UNICODE_ISUPPER(ch) iswupper(ch)
 322 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
 323 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
 324
 325 #define Py_UNICODE_TOLOWER(ch) towlower(ch)
 326 #define Py_UNICODE_TOUPPER(ch) towupper(ch)
 327 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
 328
 329 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
 330 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
 331 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
 332
 333 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
 334 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
 335 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
 336
 337 #define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
 338
 339 #else /* use the _PyUnicode_* functions for everything */
 340
 341 #define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
 342
 343 #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
 344 #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
 345 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
 346 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
 347
 348 #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
 349 #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
 350 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
 351
 352 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
 353 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
 354 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
 355
 356 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
 357 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
 358 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
 359
 360 #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
 361
 362 #endif /* HAVE_USABLE_WCHAR_T && WANT_WCTYPE_FUNCTIONS */
 363
     /* True if ch satisfies any of the ISALPHA, ISDECIMAL, ISDIGIT or
        ISNUMERIC tests.  The argument appears in each of the four tests,
        so it may be evaluated up to four times: pass an expression
        without side effects. */
 364 #define Py_UNICODE_ISALNUM(ch) \
 365        (Py_UNICODE_ISALPHA(ch) || \
 366         Py_UNICODE_ISDECIMAL(ch) || \
 367         Py_UNICODE_ISDIGIT(ch) || \
 368         Py_UNICODE_ISNUMERIC(ch))
 369
     /* Copy length Py_UNICODE elements (not bytes) from source to target
        using Py_MEMCPY.
        NOTE(review): presumably the two buffers must not overlap, as with
        memcpy() -- confirm against the definition of Py_MEMCPY. */
 370 #define Py_UNICODE_COPY(target, source, length)                         \
 371         Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
 372
     /* Store value into the first length elements of target.

        target and value are evaluated once (they are copied into the
        locals t_ and v_), but length is re-evaluated on every loop
        iteration, so it should be an expression without side effects.
        The do { ... } while (0) wrapper makes the expansion usable as a
        single statement. */
 373 #define Py_UNICODE_FILL(target, value, length) do\
 374     {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
 375         for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\
 376     } while (0)
 377
 378 /* Check whether substring matches string at the given offset.  The
 379    offset must be valid, and the substring must not be empty.

        The first and the last character of the substring are compared
        before memcmp() is run over the whole substring.  Both string and
        substring are accessed through ->str and ->length, and every
        argument is evaluated more than once, so pass expressions without
        side effects. */
 380 #define Py_UNICODE_MATCH(string, offset, substring) \
 381     ((*((string)->str + (offset)) == *((substring)->str)) && \
 382     ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
 383      !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))
 384
 385 #ifdef __cplusplus
 386 extern "C" {
 387 #endif
 388
 389 /* --- Unicode Type ------------------------------------------------------- */
 390
     /* Instance layout of a Unicode object.  The fast access macros
        (PyUnicode_GET_SIZE, PyUnicode_AS_UNICODE, ...) read the length and
        str fields directly. */
 391 typedef struct {
 392     PyObject_HEAD
 393     Py_ssize_t length;          /* Length of raw Unicode data in buffer,
                                       counted in Py_UNICODE units */
 394     Py_UNICODE *str;            /* Raw Unicode buffer */
 395     long hash;                  /* Hash value; -1 if not set */
 396     PyObject *defenc;           /* (Default) Encoded version as Python
 397                                    string, or NULL; this is used for
 398                                    implementing the buffer protocol */
 399 } PyUnicodeObject;
 400
 401 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
 402
     /* Type tests.  PyUnicode_Check() tests the
        Py_TPFLAGS_UNICODE_SUBCLASS flag on the type of op;
        PyUnicode_CheckExact() is true only if the type of op is exactly
        PyUnicode_Type.  When Py_USING_UNICODE is not defined both macros
        are defined as the constant 0 instead. */
 403 #define PyUnicode_Check(op) \
 404                  PyType_FastSubclass(Py_Type(op), Py_TPFLAGS_UNICODE_SUBCLASS)
 405 #define PyUnicode_CheckExact(op) (Py_Type(op) == &PyUnicode_Type)
 406
 407 /* Fast access macros.  Each one casts op to PyUnicodeObject * without
        any type check, so op must already be known to be a Unicode
        object.

        PyUnicode_GET_SIZE       length in Py_UNICODE units
        PyUnicode_GET_DATA_SIZE  the same length in bytes
        PyUnicode_AS_UNICODE     the raw Py_UNICODE buffer
        PyUnicode_AS_DATA        the same buffer viewed as const char * */
 408 #define PyUnicode_GET_SIZE(op) \
 409         (((PyUnicodeObject *)(op))->length)
 410 #define PyUnicode_GET_DATA_SIZE(op) \
 411         (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
 412 #define PyUnicode_AS_UNICODE(op) \
 413         (((PyUnicodeObject *)(op))->str)
 414 #define PyUnicode_AS_DATA(op) \
 415         ((const char *)((PyUnicodeObject *)(op))->str)
 416
 417 /* --- Constants ---------------------------------------------------------- */
 418
 419 /* This Unicode character will be used as replacement character during
 420    decoding if the errors argument is set to "replace". Note: the
 421    Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
 422    Unicode 3.0. */
 423
 424 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
 425
 426 /* === Public API ========================================================= */
 427
 428 /* --- Plain Py_UNICODE --------------------------------------------------- */
 429
 430 /* Create a Unicode Object from the Py_UNICODE buffer u of the given
 431    size.
 432
 433    u may be NULL which causes the contents to be undefined. It is the
 434    user's responsibility to fill in the needed data afterwards. Note
 435    that modifying the Unicode object contents after construction is
 436    only allowed if u was set to NULL.
 437
 438    The buffer is copied into the new object. */
 439
 440 PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
 441     const Py_UNICODE *u,        /* Unicode buffer */
 442     Py_ssize_t size             /* size of buffer */
 443     );
 444
 445 /* Return a read-only pointer to the Unicode object's internal
 446    Py_UNICODE buffer. */
 447
 448 PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
 449     PyObject *unicode           /* Unicode object */
 450     );
 451
 452 /* Get the length of the Unicode object. */
 453
 454 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
 455     PyObject *unicode           /* Unicode object */
 456     );
 457
 458 /* Get the maximum ordinal for a Unicode character. */
 459 PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
 460
 461 /* Resize an already allocated Unicode object to the given new length.
 462
 463    *unicode is modified to point to the new (resized) object and 0
 464    returned on success.
 465
 466    This API may only be called by the function which also called the
 467    Unicode constructor. The refcount on the object must be 1. Otherwise,
 468    an error is returned.
 469
 470    Error handling is implemented as follows: an exception is set, -1
 471    is returned and *unicode left untouched.
 472
 473 */
 474
 475 PyAPI_FUNC(int) PyUnicode_Resize(
 476     PyObject **unicode,         /* Pointer to the Unicode object */
 477     Py_ssize_t length           /* New length */
 478     );
 479
 480 /* Coerce obj to a Unicode object and return a reference with
 481    *incremented* refcount.
 482
 483    Coercion is done in the following way:
 484
 485    1. String and other char buffer compatible objects are decoded
 486       under the assumptions that they contain data using the current
 487       default encoding. Decoding is done in "strict" mode.
 488
 489    2. All other objects (including Unicode objects) raise an
 490       exception.
 491
 492    The API returns NULL in case of an error. The caller is responsible
 493    for decref'ing the returned objects.
 494
 495 */
 496
 497 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
 498     register PyObject *obj,     /* Object */
 499     const char *encoding,       /* encoding */
 500     const char *errors          /* error handling */
 501     );
 502
 503 /* Coerce obj to a Unicode object and return a reference with
 504    *incremented* refcount.
 505
 506    Unicode objects are passed back as-is (subclasses are converted to
 507    true Unicode objects), all other objects are delegated to
 508    PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
 509    using the default encoding as basis for decoding the object.
 510
 511    The API returns NULL in case of an error. The caller is responsible
 512    for decref'ing the returned objects.
 513
 514 */
 515
 516 PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
 517     register PyObject *obj      /* Object */
 518     );
 519
 520 /* --- wchar_t support for platforms which support it --------------------- */
 521
 522 #ifdef HAVE_WCHAR_H
 523
 524 /* Create a Unicode Object from the wchar_t buffer w of the given
 525    size.
 526
 527    The buffer is copied into the new object. */
 528
 529 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
 530     register const wchar_t *w,  /* wchar_t buffer */
 531     Py_ssize_t size             /* size of buffer */
 532     );
 533
 534 /* Copies the Unicode Object contents into the wchar_t buffer w.  At
 535    most size wchar_t characters are copied.
 536
 537    Note that the resulting wchar_t string may or may not be
 538    0-terminated.  It is the responsibility of the caller to make sure
 539    that the wchar_t string is 0-terminated in case this is required by
 540    the application.
 541
 542    Returns the number of wchar_t characters copied (excluding a
 543    possibly trailing 0-termination character) or -1 in case of an
 544    error. */
 545
 546 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
 547     PyUnicodeObject *unicode,   /* Unicode object */
 548     register wchar_t *w,        /* wchar_t buffer */
 549     Py_ssize_t size             /* size of buffer */
 550     );
 551
 552 #endif
 553
 554 /* --- Unicode ordinals --------------------------------------------------- */
 555
 556 /* Create a Unicode Object from the given Unicode code point ordinal.
 557
 558    The ordinal must be in range(0x10000) on narrow Python builds
 559    (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
 560    raised in case it is not.
 561
 562 */
 563
 564 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
 565
 566 /* === Builtin Codecs =====================================================
 567
 568    Many of these APIs take two arguments encoding and errors. These
 569    parameters encoding and errors have the same semantics as the ones
 570    of the builtin unicode() API.
 571
 572    Setting encoding to NULL causes the default encoding to be used.
 573
 574    Error handling is set by errors which may also be set to NULL
 575    meaning to use the default handling defined for the codec. Default
 576    error handling for all builtin codecs is "strict" (ValueErrors are
 577    raised).
 578
 579    The codecs all use a similar interface. Only deviations from the
 580    generic ones are documented.
 581
 582 */
 583
 584 /* --- Manage the default encoding ---------------------------------------- */
 585
 586 /* Return a Python string holding the default encoded value of the
 587    Unicode object.
 588
 589    The resulting string is cached in the Unicode object for subsequent
 590    usage by this function. The cached version is needed to implement
 591    the character buffer interface and will live (at least) as long as
 592    the Unicode object itself.
 593
 594    The refcount of the string is *not* incremented.
 595
 596    *** Exported for internal use by the interpreter only !!! ***
 597
 598 */
 599
 600 PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
 601     PyObject *, const char *);
 602
 603 /* Returns the currently active default encoding.
 604
 605    The default encoding is currently implemented as a run-time settable
 606    process global.  This may change in future versions of the
 607    interpreter to become a parameter which is managed on a per-thread
 608    basis.
 609
 610  */
 611
 612 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
 613
 614 /* Sets the currently active default encoding.
 615
 616    Returns 0 on success, -1 in case of an error.
 617
 618  */
 619
 620 PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding(
 621     const char *encoding        /* Encoding name in standard form */
 622     );
 623
 624 /* --- Generic Codecs ----------------------------------------------------- */
 625
 626 /* Create a Unicode object by decoding the encoded string s of the
 627    given size. */
 628
 629 PyAPI_FUNC(PyObject*) PyUnicode_Decode(
 630     const char *s,              /* encoded string */
 631     Py_ssize_t size,            /* size of buffer */
 632     const char *encoding,       /* encoding */
 633     const char *errors          /* error handling */
 634     );
 635
 636 /* Encodes a Py_UNICODE buffer of the given size and returns a
 637    Python string object. */
 638
 639 PyAPI_FUNC(PyObject*) PyUnicode_Encode(
 640     const Py_UNICODE *s,        /* Unicode char buffer */
 641     Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
 642     const char *encoding,       /* encoding */
 643     const char *errors          /* error handling */
 644     );
 645
 646 /* Encodes a Unicode object and returns the result as a Python
 647    object. */
 648
 649 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
 650     PyObject *unicode,          /* Unicode object */
 651     const char *encoding,       /* encoding */
 652     const char *errors          /* error handling */
 653     );
 654
 655 /* Encodes a Unicode object and returns the result as a Python string
 656    object. */
 657
 658 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
 659     PyObject *unicode,          /* Unicode object */
 660     const char *encoding,       /* encoding */
 661     const char *errors          /* error handling */
 662     );
 663
 664 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
 665     PyObject* string            /* 256 character map */
 666    );
 667
 668
 669 /* --- UTF-7 Codecs ------------------------------------------------------- */
 670
 671 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
 672     const char *string,         /* UTF-7 encoded string */
 673     Py_ssize_t length,          /* size of string */
 674     const char *errors          /* error handling */
 675     );
 676
 677 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
 678     const Py_UNICODE *data,     /* Unicode char buffer */
 679     Py_ssize_t length,                  /* number of Py_UNICODE chars to encode */
 680     int encodeSetO,             /* force the encoder to encode characters in
 681                                    Set O, as described in RFC2152 */
 682     int encodeWhiteSpace,       /* force the encoder to encode space, tab,
 683                                    carriage return and linefeed characters */
 684     const char *errors          /* error handling */
 685     );
 686
 687 /* --- UTF-8 Codecs ------------------------------------------------------- */
 688
 689 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
 690     const char *string,         /* UTF-8 encoded string */
 691     Py_ssize_t length,          /* size of string */
 692     const char *errors          /* error handling */
 693     );
 694
 695 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
 696     const char *string,         /* UTF-8 encoded string */
 697     Py_ssize_t length,          /* size of string */
 698     const char *errors,         /* error handling */
 699     Py_ssize_t *consumed                /* bytes consumed */
 700     );
 701
 702 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
 703     PyObject *unicode           /* Unicode object */
 704     );
 705
 706 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
 707     const Py_UNICODE *data,     /* Unicode char buffer */
 708     Py_ssize_t length,                  /* number of Py_UNICODE chars to encode */
 709     const char *errors          /* error handling */
 710     );
 711
 712 /* --- UTF-32 Codecs ------------------------------------------------------ */
 713
 714 /* Decodes length bytes from a UTF-32 encoded buffer string and returns
 715    the corresponding Unicode object.
 716
 717    errors (if non-NULL) defines the error handling. It defaults
 718    to "strict".
 719
 720    If byteorder is non-NULL, the decoder starts decoding using the
 721    given byte order:
 722
 723         *byteorder == -1: little endian
 724         *byteorder == 0:  native order
 725         *byteorder == 1:  big endian
 726
 727    In native mode, the first four bytes of the stream are checked for a
 728    BOM mark. If found, the BOM mark is analysed, the byte order
 729    adjusted and the BOM skipped.  In the other modes, no BOM mark
 730    interpretation is done. After completion, *byteorder is set to the
 731    current byte order at the end of input data.
 732
 733    If byteorder is NULL, the codec starts in native order mode.
 734
 735 */
 736
 737 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
 738     const char *string,         /* UTF-32 encoded string */
 739     Py_ssize_t length,          /* size of string */
 740     const char *errors,         /* error handling */
 741     int *byteorder              /* pointer to byteorder to use
 742                                    0=native;-1=LE,1=BE; updated on
 743                                    exit */
 744     );
 745
 746 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
 747     const char *string,         /* UTF-32 encoded string */
 748     Py_ssize_t length,          /* size of string */
 749     const char *errors,         /* error handling */
 750     int *byteorder,             /* pointer to byteorder to use
 751                                    0=native;-1=LE,1=BE; updated on
 752                                    exit */
 753     Py_ssize_t *consumed        /* bytes consumed */
 754     );
 755
 756 /* Returns a Python string using the UTF-32 encoding in native byte
 757    order. The string always starts with a BOM mark.  */
 758
 759 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
 760     PyObject *unicode           /* Unicode object */
 761     );
 762
 763 /* Returns a Python string object holding the UTF-32 encoded value of
 764    the Unicode data.
 765
 766    The output is written using the byte order selected by the
 767    byteorder argument:
 768
 769    byteorder == -1: little endian
 770    byteorder == 0:  native byte order (writes a BOM mark)
 771    byteorder == 1:  big endian
 772
 773    If byteorder is 0, the output string will always start with the
 774    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
 775    prepended.
 776
 777 */
 778
 779 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF32(
 780     const Py_UNICODE *data,     /* Unicode char buffer */
 781     Py_ssize_t length,          /* number of Py_UNICODE chars to encode */
 782     const char *errors,         /* error handling */
 783     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
 784     );
 785
 786 /* --- UTF-16 Codecs ------------------------------------------------------ */
 787
 788 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
 789    the corresponding Unicode object.
 790
 791    errors (if non-NULL) defines the error handling. It defaults
 792    to "strict".
 793
 794    If byteorder is non-NULL, the decoder starts decoding using the
 795    given byte order:
 796
 797         *byteorder == -1: little endian
 798         *byteorder == 0:  native order
 799         *byteorder == 1:  big endian
 800
 801    In native mode, the first two bytes of the stream are checked for a
 802    BOM mark. If found, the BOM mark is analysed, the byte order
 803    adjusted and the BOM skipped.  In the other modes, no BOM mark
 804    interpretation is done. After completion, *byteorder is set to the
 805    current byte order at the end of input data.
 806
 807    If byteorder is NULL, the codec starts in native order mode.
 808
 809 */
 810
 811 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
 812     const char *string,         /* UTF-16 encoded string */
 813     Py_ssize_t length,          /* size of string */
 814     const char *errors,         /* error handling */
 815     int *byteorder              /* pointer to byteorder to use
 816                                    0=native;-1=LE,1=BE; updated on
 817                                    exit */
 818     );
 819
 820 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
 821     const char *string,         /* UTF-16 encoded string */
 822     Py_ssize_t length,          /* size of string */
 823     const char *errors,         /* error handling */
 824     int *byteorder,             /* pointer to byteorder to use
 825                                    0=native;-1=LE,1=BE; updated on
 826                                    exit */
 827     Py_ssize_t *consumed                /* bytes consumed */
 828     );
 829
 830 /* Returns a Python string using the UTF-16 encoding in native byte
 831    order. The string always starts with a BOM mark.  */
 832
 833 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
 834     PyObject *unicode           /* Unicode object */
 835     );
 836
 837 /* Returns a Python string object holding the UTF-16 encoded value of
 838    the Unicode data.
 839
 840    The output is written using the byte order selected by the
 841    byteorder argument:
 842
 843    byteorder == -1: little endian
 844    byteorder == 0:  native byte order (writes a BOM mark)
 845    byteorder == 1:  big endian
 846
 847    If byteorder is 0, the output string will always start with the
 848    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
 849    prepended.
 850
 851    Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
 852    UCS-2. This trick makes it possible to add full UTF-16 capabilities
 853    at a later point without compromising the APIs.
 854
 855 */
 856
 857 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
 858     const Py_UNICODE *data,     /* Unicode char buffer */
 859     Py_ssize_t length,                  /* number of Py_UNICODE chars to encode */
 860     const char *errors,         /* error handling */
 861     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
 862     );
 863
 864 /* --- Unicode-Escape Codecs ---------------------------------------------- */
 865
 866 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
 867     const char *string,         /* Unicode-Escape encoded string */
 868     Py_ssize_t length,          /* size of string */
 869     const char *errors          /* error handling */
 870     );
 871
 872 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
 873     PyObject *unicode           /* Unicode object */
 874     );
 875
 876 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
 877     const Py_UNICODE *data,     /* Unicode char buffer */
 878     Py_ssize_t length                   /* Number of Py_UNICODE chars to encode */
 879     );
 880
 881 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
 882
 883 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
 884     const char *string,         /* Raw-Unicode-Escape encoded string */
 885     Py_ssize_t length,          /* size of string */
 886     const char *errors          /* error handling */
 887     );
 888
 889 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
 890     PyObject *unicode           /* Unicode object */
 891     );
 892
 893 PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
 894     const Py_UNICODE *data,     /* Unicode char buffer */
 895     Py_ssize_t length                   /* Number of Py_UNICODE chars to encode */
 896     );
 897
 898 /* --- Unicode Internal Codec ---------------------------------------------
 899
 900     Only for internal use in _codecsmodule.c; declared without PyAPI_FUNC. */
 901
 902 PyObject *_PyUnicode_DecodeUnicodeInternal(
 903     const char *string,         /* encoded input string */
 904     Py_ssize_t length,          /* size of string */
 905     const char *errors          /* error handling */
 906     );
 907
 908 /* --- Latin-1 Codecs -----------------------------------------------------
 909
 910    Note: Latin-1 corresponds to the first 256 Unicode ordinals.
 911
 912 */
 913
 914 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
 915     const char *string,         /* Latin-1 encoded string */
 916     Py_ssize_t length,          /* size of string */
 917     const char *errors          /* error handling */
 918     );
 919
 920 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
 921     PyObject *unicode           /* Unicode object */
 922     );
 923
 924 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
 925     const Py_UNICODE *data,     /* Unicode char buffer */
 926     Py_ssize_t length,                  /* Number of Py_UNICODE chars to encode */
 927     const char *errors          /* error handling */
 928     );
 929
 930 /* --- ASCII Codecs -------------------------------------------------------
 931
 932    Only 7-bit ASCII data is accepted. All other codes generate errors.
 933
 934 */
 935
 936 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
 937     const char *string,         /* ASCII encoded string */
 938     Py_ssize_t length,          /* size of string */
 939     const char *errors          /* error handling */
 940     );
 941
 942 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
 943     PyObject *unicode           /* Unicode object */
 944     );
 945
 946 PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
 947     const Py_UNICODE *data,     /* Unicode char buffer */
 948     Py_ssize_t length,                  /* Number of Py_UNICODE chars to encode */
 949     const char *errors          /* error handling */
 950     );
 951
 952 /* --- Character Map Codecs -----------------------------------------------
 953
 954    This codec uses mappings to encode and decode characters.
 955
 956    Decoding mappings must map single string characters to single
 957    Unicode characters, integers (which are then interpreted as Unicode
 958    ordinals) or None (meaning "undefined mapping" and causing an
 959    error).
 960
 961    Encoding mappings must map single Unicode characters to single
 962    string characters, integers (which are then interpreted as Latin-1
 963    ordinals) or None (meaning "undefined mapping" and causing an
 964    error).
 965
 966    If a character lookup fails with a LookupError, the character is
 967    copied as-is meaning that its ordinal value will be interpreted as
 968    Unicode or Latin-1 ordinal resp. Because of this, mappings only need
 969    to contain those mappings which map characters to different code
 970    points.
 971
 972 */
 973
 974 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
 975     const char *string,         /* Encoded string */
 976     Py_ssize_t length,          /* size of string */
 977     PyObject *mapping,          /* character mapping
 978                                    (char ordinal -> unicode ordinal) */
 979     const char *errors          /* error handling */
 980     );
 981
 982 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
 983     PyObject *unicode,          /* Unicode object */
 984     PyObject *mapping           /* character mapping
 985                                    (unicode ordinal -> char ordinal) */
 986     );
 987
 988 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
 989     const Py_UNICODE *data,     /* Unicode char buffer */
 990     Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
 991     PyObject *mapping,          /* character mapping
 992                                    (unicode ordinal -> char ordinal) */
 993     const char *errors          /* error handling */
 994     );
 995
 996 /* Translate a Py_UNICODE buffer of the given length by applying a
 997    character mapping table to it and return the resulting Unicode
 998    object.
 999
1000    The mapping table must map Unicode ordinal integers to Unicode
1001    ordinal integers or None (causing deletion of the character).
1002
1003    Mapping tables may be dictionaries or sequences. Unmapped character
1004    ordinals (ones which cause a LookupError) are left untouched and
1005    are copied as-is.
1006
1007 */
1008
1009 PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
1010     const Py_UNICODE *data,     /* Unicode char buffer */
1011     Py_ssize_t length,                  /* Number of Py_UNICODE chars to translate */
1012     PyObject *table,            /* Translation table (dictionary or sequence) */
1013     const char *errors          /* error handling */
1014     );
1015
1016 #ifdef MS_WIN32
1017
1018 /* --- MBCS codecs for Windows -------------------------------------------- */
1019
1020 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
1021     const char *string,         /* MBCS encoded string */
1022     Py_ssize_t length,              /* size of string */
1023     const char *errors          /* error handling */
1024     );
1025
1026 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
1027     const char *string,         /* MBCS encoded string */
1028     Py_ssize_t length,          /* size of string */
1029     const char *errors,         /* error handling */
1030     Py_ssize_t *consumed        /* bytes consumed */
1031     );
1032
1033 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
1034     PyObject *unicode           /* Unicode object */
1035     );
1036
1037 PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
1038     const Py_UNICODE *data,     /* Unicode char buffer */
1039     Py_ssize_t length,              /* Number of Py_UNICODE chars to encode */
1040     const char *errors          /* error handling */
1041     );
1042
1043 #endif /* MS_WIN32 */
1044
1045 /* --- Decimal Encoder ---------------------------------------------------- */
1046
1047 /* Takes a Unicode string holding a decimal value and writes it into
1048    an output buffer using standard ASCII digit codes.
1049
1050    The output buffer has to provide at least length+1 bytes of storage
1051    area. The output string is 0-terminated.
1052
1053    The encoder converts whitespace to ' ', decimal characters to their
1054    corresponding ASCII digit and all other Latin-1 characters except
1055    \0 as-is. Characters outside this range (Unicode ordinals 1-255)
1056    are treated as errors. This includes embedded NUL characters.
1057
1058    Error handling is defined by the errors argument:
1059
1060       NULL or "strict": raise a ValueError
1061       "ignore": ignore the wrong characters (these are not copied to the
1062                 output buffer)
1063       "replace": replaces illegal characters with '?'
1064
1065    Returns 0 on success, -1 on failure.
1066
1067 */
1068
1069 PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
1070     Py_UNICODE *s,              /* Unicode buffer */
1071     Py_ssize_t length,                  /* Number of Py_UNICODE chars to encode */
1072     char *output,               /* Output buffer; must have size >= length+1 */
1073     const char *errors          /* error handling */
1074     );
1075
1076 /* --- Methods & Slots ----------------------------------------------------
1077
1078    These are capable of handling Unicode objects and strings on input
1079    (we refer to them as strings in the descriptions) and return
1080    Unicode objects or integers as appropriate. */
1081
1082 /* Concat two strings giving a new Unicode string. */
1083
1084 PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1085     PyObject *left,             /* Left string */
1086     PyObject *right             /* Right string */
1087     );
1088
1089 /* Split a string giving a list of Unicode strings.
1090
1091    If sep is NULL, splitting will be done at all whitespace
1092    substrings. Otherwise, splits occur at the given separator.
1093
1094    At most maxsplit splits will be done. If negative, no limit is set.
1095
1096    Separators are not included in the resulting list.
1097
1098 */
1099
1100 PyAPI_FUNC(PyObject*) PyUnicode_Split(
1101     PyObject *s,                /* String to split */
1102     PyObject *sep,              /* String separator */
1103     Py_ssize_t maxsplit         /* Maxsplit count */
1104     );
1105
1106 /* Split a string at line breaks, giving a list of Unicode strings.
1107
1108    CRLF is considered to be one line break. Line breaks are not
1109    included in the resulting list unless keepends is true. */
1110
1111 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1112     PyObject *s,                /* String to split */
1113     int keepends                /* If true, line end markers are included */
1114     );
1115
1116 /* Partition a string using a given separator. */
1117
1118 PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1119     PyObject *s,                /* String to partition */
1120     PyObject *sep               /* String separator */
1121     );
1122
1123 /* Partition a string using a given separator, searching from the end of the
1124    string. */
1125
1126 PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1127     PyObject *s,                /* String to partition */
1128     PyObject *sep               /* String separator */
1129     );
1130
1131 /* Split a string giving a list of Unicode strings.
1132
1133    If sep is NULL, splitting will be done at all whitespace
1134    substrings. Otherwise, splits occur at the given separator.
1135
1136    At most maxsplit splits will be done; if maxsplit is negative, no
1137    limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1138    the end of the string.
1139
1140    Separators are not included in the resulting list.
1141
1142 */
1143
1144 PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1145     PyObject *s,                /* String to split */
1146     PyObject *sep,              /* String separator */
1147     Py_ssize_t maxsplit         /* Maxsplit count */
1148     );
1149
1150 /* Translate a string by applying a character mapping table to it and
1151    return the resulting Unicode object.
1152
1153    The mapping table must map Unicode ordinal integers to Unicode
1154    ordinal integers or None (causing deletion of the character).
1155
1156    Mapping tables may be dictionaries or sequences. Unmapped character
1157    ordinals (ones which cause a LookupError) are left untouched and
1158    are copied as-is.
1159
1160 */
1161
1162 PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1163     PyObject *str,              /* String */
1164     PyObject *table,            /* Translate table */
1165     const char *errors          /* error handling */
1166     );
1167
1168 /* Join a sequence of strings using the given separator and return
1169    the resulting Unicode string. */
1170
1171 PyAPI_FUNC(PyObject*) PyUnicode_Join(
1172     PyObject *separator,        /* Separator string */
1173     PyObject *seq               /* Sequence object */
1174     );
1175
1176 /* Return 1 if substr matches str[start:end] at the given tail end, 0
1177    otherwise. */
1178
1179 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1180     PyObject *str,              /* String */
1181     PyObject *substr,           /* Prefix or Suffix string */
1182     Py_ssize_t start,           /* Start index */
1183     Py_ssize_t end,             /* Stop index */
1184     int direction               /* Tail end: -1 prefix, +1 suffix */
1185     );
1186
1187 /* Return the first position of substr in str[start:end] using the
1188    given search direction or -1 if not found. -2 is returned in case
1189    an error occurred and an exception is set. */
1190
1191 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1192     PyObject *str,              /* String */
1193     PyObject *substr,           /* Substring to find */
1194     Py_ssize_t start,           /* Start index */
1195     Py_ssize_t end,             /* Stop index */
1196     int direction               /* Find direction: +1 forward, -1 backward */
1197     );
1198
1199 /* Count the number of occurrences of substr in str[start:end]. */
1200
1201 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1202     PyObject *str,              /* String */
1203     PyObject *substr,           /* Substring to count */
1204     Py_ssize_t start,           /* Start index */
1205     Py_ssize_t end              /* Stop index */
1206     );
1207
1208 /* Replace at most maxcount occurrences of substr in str with replstr
1209    and return the resulting Unicode object. */
1210
1211 PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1212     PyObject *str,              /* String to search in */
1213     PyObject *substr,           /* Substring to find */
1214     PyObject *replstr,          /* Replacement string */
1215     Py_ssize_t maxcount         /* Max. number of replacements to apply;
1216                                    -1 = all */
1217     );
1218
1219 /* Compare two strings and return -1, 0, 1 for less than, equal,
1220    greater than resp. */
1221
1222 PyAPI_FUNC(int) PyUnicode_Compare(
1223     PyObject *left,             /* Left string */
1224     PyObject *right             /* Right string */
1225     );
1226
1227 /* Rich compare two strings and return one of the following:
1228
1229    - NULL in case an exception was raised
1230    - Py_True or Py_False for successful comparisons
1231    - Py_NotImplemented in case the type combination is unknown
1232
1233    Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1234    case the conversion of the arguments to Unicode fails with a
1235    UnicodeDecodeError.
1236
1237    Possible values for op:
1238
1239      Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1240
1241 */
1242
1243 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1244     PyObject *left,             /* Left string */
1245     PyObject *right,            /* Right string */
1246     int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1247     );
1248
1249 /* Apply an argument tuple or dictionary to a format string and return
1250    the resulting Unicode string. */
1251
1252 PyAPI_FUNC(PyObject *) PyUnicode_Format(
1253     PyObject *format,           /* Format string */
1254     PyObject *args              /* Argument tuple or dictionary */
1255     );
1256
1257 /* Checks whether element is contained in container and returns 1/0
1258    accordingly.
1259
1260    element has to coerce to a one-element Unicode string. -1 is
1261    returned in case of an error. */
1262
1263 PyAPI_FUNC(int) PyUnicode_Contains(
1264     PyObject *container,        /* Container string */
1265     PyObject *element           /* Element string */
1266     );
1267
1268 /* Externally visible for str.strip(unicode) */
1269 PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1270     PyUnicodeObject *self,
1271     int striptype,
1272     PyObject *sepobj
1273     );
1274
1275 /* === Characters Type APIs =============================================== */
1276
1277 /* These should not be used directly. Use the Py_UNICODE_IS* and
1278    Py_UNICODE_TO* macros instead.
1279
1280    These APIs are implemented in Objects/unicodectype.c.
1281
1282 */
1283
1284 PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1285     Py_UNICODE ch       /* Unicode character */
1286     );
1287
1288 PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1289     Py_UNICODE ch       /* Unicode character */
1290     );
1291
1292 PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1293     Py_UNICODE ch       /* Unicode character */
1294     );
1295
1296 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1297     Py_UNICODE ch       /* Unicode character (passed by value) */
1298     );
1299
1300 PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1301     Py_UNICODE ch       /* Unicode character (passed by value) */
1302     );
1303
1304 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase(
1305     Py_UNICODE ch       /* Unicode character */
1306     );
1307
1308 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase(
1309     Py_UNICODE ch       /* Unicode character */
1310     );
1311
1312 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase(
1313     Py_UNICODE ch       /* Unicode character */
1314     );
1315
1316 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1317     Py_UNICODE ch       /* Unicode character */
1318     );
1319
1320 PyAPI_FUNC(int) _PyUnicode_ToDigit(
1321     Py_UNICODE ch       /* Unicode character */
1322     );
1323
1324 PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1325     Py_UNICODE ch       /* Unicode character */
1326     );
1327
1328 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1329     Py_UNICODE ch       /* Unicode character */
1330     );
1331
1332 PyAPI_FUNC(int) _PyUnicode_IsDigit(
1333     Py_UNICODE ch       /* Unicode character */
1334     );
1335
1336 PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1337     Py_UNICODE ch       /* Unicode character */
1338     );
1339
1340 PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1341     Py_UNICODE ch       /* Unicode character */
1342     );
1343
1344 #ifdef __cplusplus
1345 }
1346 #endif
1347 #endif /* Py_USING_UNICODE */
1348 #endif /* !Py_UNICODEOBJECT_H */