Include/unicodeobject.h

   1 #ifndef Py_UNICODEOBJECT_H
   2 #define Py_UNICODEOBJECT_H
   3
   4 /*
   5
   6 Unicode implementation based on original code by Fredrik Lundh,
   7 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
   8 Unicode Integration Proposal (see file Misc/unicode.txt).
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12
  13  Original header:
  14  --------------------------------------------------------------------
  15
  16  * Yet another Unicode string type for Python.  This type supports the
  17  * 16-bit Basic Multilingual Plane (BMP) only.
  18  *
  19  * Written by Fredrik Lundh, January 1999.
  20  *
  21  * Copyright (c) 1999 by Secret Labs AB.
  22  * Copyright (c) 1999 by Fredrik Lundh.
  23  *
  24  * fredrik@pythonware.com
  25  * http://www.pythonware.com
  26  *
  27  * --------------------------------------------------------------------
  28  * This Unicode String Type is
  29  *
  30  * Copyright (c) 1999 by Secret Labs AB
  31  * Copyright (c) 1999 by Fredrik Lundh
  32  *
  33  * By obtaining, using, and/or copying this software and/or its
  34  * associated documentation, you agree that you have read, understood,
  35  * and will comply with the following terms and conditions:
  36  *
  37  * Permission to use, copy, modify, and distribute this software and its
  38  * associated documentation for any purpose and without fee is hereby
  39  * granted, provided that the above copyright notice appears in all
  40  * copies, and that both that copyright notice and this permission notice
  41  * appear in supporting documentation, and that the name of Secret Labs
  42  * AB or the author not be used in advertising or publicity pertaining to
  43  * distribution of the software without specific, written prior
  44  * permission.
  45  *
  46  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  47  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  48  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  49  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  50  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  51  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  52  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  53  * -------------------------------------------------------------------- */
  54
  55 #include <ctype.h>
  56
  57 /* === Internal API ======================================================= */
  58
  59 /* --- Internal Unicode Format -------------------------------------------- */
  60
  61 #ifndef Py_USING_UNICODE
  62
  63 #define PyUnicode_Check(op)                 0
  64 #define PyUnicode_CheckExact(op)            0
  65
  66 #else
  67
  68 /* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
  69    properly set, but the default rules below don't set it.  I'll
  70    sort this out some other day -- fredrik@pythonware.com */
  71
  72 #ifndef Py_UNICODE_SIZE
  73 #error Must define Py_UNICODE_SIZE
  74 #endif
  75
  76 /* Setting Py_UNICODE_WIDE enables UCS-4 storage.  Otherwise, Unicode
  77    strings are stored as UCS-2 (with limited support for UTF-16) */
  78
  79 #if Py_UNICODE_SIZE >= 4
  80 #define Py_UNICODE_WIDE
  81 #endif
  82
  83 /* Set these flags if the platform has "wchar.h", "wctype.h" and the
  84    wchar_t type is a 16-bit unsigned type */
  85 /* #define HAVE_WCHAR_H */
  86 /* #define HAVE_USABLE_WCHAR_T */
  87
  88 /* Defaults for various platforms */
  89 #ifndef PY_UNICODE_TYPE
  90
  91 /* Windows has a usable wchar_t type (unless we're using UCS-4) */
  92 # if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
  93 #  define HAVE_USABLE_WCHAR_T
  94 #  define PY_UNICODE_TYPE wchar_t
  95 # endif
  96
  97 # if defined(Py_UNICODE_WIDE)
  98 #  define PY_UNICODE_TYPE Py_UCS4
  99 # endif
 100
 101 #endif
 102
 103 /* If the compiler provides a wchar_t type we try to support it
 104    through the interface functions PyUnicode_FromWideChar() and
 105    PyUnicode_AsWideChar(). */
 106
 107 #ifdef HAVE_USABLE_WCHAR_T
 108 # ifndef HAVE_WCHAR_H
 109 #  define HAVE_WCHAR_H
 110 # endif
 111 #endif
 112
 113 #ifdef HAVE_WCHAR_H
 114 /* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
 115 # ifdef _HAVE_BSDI
 116 #  include <time.h>
 117 # endif
 118 #  include <wchar.h>
 119 #endif
 120
 121 /*
 122  * Use this typedef when you need to represent a UTF-16 surrogate pair
 123  * as a single unsigned integer.
 124  */
 125 #if SIZEOF_INT >= 4
 126 typedef unsigned int Py_UCS4;
 127 #elif SIZEOF_LONG >= 4
 128 typedef unsigned long Py_UCS4;
 129 #endif
 130
 131 typedef PY_UNICODE_TYPE Py_UNICODE;
 132
 133 /* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
 134
 135 /* Unicode API names are mangled to ensure that UCS-2 and UCS-4 builds
 136    produce different external names, so that combining a Python
 137    interpreter and an extension compiled with different Unicode width
 138    assumptions causes an import error. */
 139
 140 #ifndef Py_UNICODE_WIDE
 141
 142 # define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
 143 # define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
 144 # define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
 145 # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
 146 # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
 147 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
 148 # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
 149 # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
 150 # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
 151 # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
 152 # define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
 153 # define PyUnicode_Compare PyUnicodeUCS2_Compare
 154 # define PyUnicode_Concat PyUnicodeUCS2_Concat
 155 # define PyUnicode_Contains PyUnicodeUCS2_Contains
 156 # define PyUnicode_Count PyUnicodeUCS2_Count
 157 # define PyUnicode_Decode PyUnicodeUCS2_Decode
 158 # define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
 159 # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
 160 # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
 161 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
 162 # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
 163 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
 164 # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
 165 # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
 166 # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
 167 # define PyUnicode_Encode PyUnicodeUCS2_Encode
 168 # define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
 169 # define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
 170 # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
 171 # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
 172 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
 173 # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
 174 # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
 175 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
 176 # define PyUnicode_Find PyUnicodeUCS2_Find
 177 # define PyUnicode_Format PyUnicodeUCS2_Format
 178 # define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
 179 # define PyUnicode_FromObject PyUnicodeUCS2_FromObject
 180 # define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
 181 # define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
 182 # define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
 183 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
 184 # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
 185 # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
 186 # define PyUnicode_Join PyUnicodeUCS2_Join
 187 # define PyUnicode_Partition PyUnicodeUCS2_Partition
 188 # define PyUnicode_RPartition PyUnicodeUCS2_RPartition
 189 # define PyUnicode_RSplit PyUnicodeUCS2_RSplit
 190 # define PyUnicode_Replace PyUnicodeUCS2_Replace
 191 # define PyUnicode_Resize PyUnicodeUCS2_Resize
 192 # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
 193 # define PyUnicode_Split PyUnicodeUCS2_Split
 194 # define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
 195 # define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
 196 # define PyUnicode_Translate PyUnicodeUCS2_Translate
 197 # define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
 198 # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
 199 # define _PyUnicode_Fini _PyUnicodeUCS2_Fini
 200 # define _PyUnicode_Init _PyUnicodeUCS2_Init
 201 # define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha
 202 # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit
 203 # define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit
 204 # define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
 205 # define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
 206 # define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
 207 # define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
 208 # define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
 209 # define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
 210 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
 211 # define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit
 212 # define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase
 213 # define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric
 214 # define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase
 215 # define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase
 216
 217 #else
 218
 219 # define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
 220 # define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
 221 # define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
 222 # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
 223 # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
 224 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
 225 # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
 226 # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
 227 # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
 228 # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
 229 # define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
 230 # define PyUnicode_Compare PyUnicodeUCS4_Compare
 231 # define PyUnicode_Concat PyUnicodeUCS4_Concat
 232 # define PyUnicode_Contains PyUnicodeUCS4_Contains
 233 # define PyUnicode_Count PyUnicodeUCS4_Count
 234 # define PyUnicode_Decode PyUnicodeUCS4_Decode
 235 # define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
 236 # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
 237 # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
 238 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
 239 # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
 240 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
 241 # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
 242 # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
 243 # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
 244 # define PyUnicode_Encode PyUnicodeUCS4_Encode
 245 # define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
 246 # define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
 247 # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
 248 # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
 249 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
 250 # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
 251 # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
 252 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
 253 # define PyUnicode_Find PyUnicodeUCS4_Find
 254 # define PyUnicode_Format PyUnicodeUCS4_Format
 255 # define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
 256 # define PyUnicode_FromObject PyUnicodeUCS4_FromObject
 257 # define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
 258 # define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
 259 # define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
 260 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
 261 # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
 262 # define PyUnicode_GetSize PyUnicodeUCS4_GetSize
 263 # define PyUnicode_Join PyUnicodeUCS4_Join
 264 # define PyUnicode_Partition PyUnicodeUCS4_Partition
 265 # define PyUnicode_RPartition PyUnicodeUCS4_RPartition
 266 # define PyUnicode_RSplit PyUnicodeUCS4_RSplit
 267 # define PyUnicode_Replace PyUnicodeUCS4_Replace
 268 # define PyUnicode_Resize PyUnicodeUCS4_Resize
 269 # define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
 270 # define PyUnicode_Split PyUnicodeUCS4_Split
 271 # define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
 272 # define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
 273 # define PyUnicode_Translate PyUnicodeUCS4_Translate
 274 # define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
 275 # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
 276 # define _PyUnicode_Fini _PyUnicodeUCS4_Fini
 277 # define _PyUnicode_Init _PyUnicodeUCS4_Init
 278 # define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha
 279 # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit
 280 # define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit
 281 # define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
 282 # define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
 283 # define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
 284 # define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
 285 # define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
 286 # define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
 287 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
 288 # define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit
 289 # define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase
 290 # define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric
 291 # define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase
 292 # define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase
 293
 294
 295 #endif
 296
 297 /* --- Internal Unicode Operations ---------------------------------------- */
 298
 299 /* If you want Python to use the compiler's wctype.h functions instead
 300    of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
 301    configure Python using --with-wctype-functions.  This reduces the
 302    interpreter's code size. */
 303
 304 #if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
 305
 306 #include <wctype.h>
 307
 308 #define Py_UNICODE_ISSPACE(ch) iswspace(ch)
 309
 310 #define Py_UNICODE_ISLOWER(ch) iswlower(ch)
 311 #define Py_UNICODE_ISUPPER(ch) iswupper(ch)
 312 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
 313 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
 314
 315 #define Py_UNICODE_TOLOWER(ch) towlower(ch)
 316 #define Py_UNICODE_TOUPPER(ch) towupper(ch)
 317 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
 318
 319 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
 320 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
 321 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
 322
 323 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
 324 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
 325 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
 326
 327 #define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
 328
 329 #else
 330
 331 #define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
 332
 333 #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
 334 #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
 335 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
 336 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
 337
 338 #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
 339 #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
 340 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
 341
 342 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
 343 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
 344 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
 345
 346 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
 347 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
 348 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
 349
 350 #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
 351
 352 #endif
 353
 354 #define Py_UNICODE_ISALNUM(ch) \
 355        (Py_UNICODE_ISALPHA(ch) || \
 356         Py_UNICODE_ISDECIMAL(ch) || \
 357         Py_UNICODE_ISDIGIT(ch) || \
 358         Py_UNICODE_ISNUMERIC(ch))
 359
 360 #define Py_UNICODE_COPY(target, source, length)                         \
 361         Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
 362
 363 #define Py_UNICODE_FILL(target, value, length) do\
 364     {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
 365         for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\
 366     } while (0)
 367
 368 /* check if substring matches at given offset.  the offset must be
 369    valid, and the substring must not be empty */
 370 #define Py_UNICODE_MATCH(string, offset, substring) \
 371     ((*((string)->str + (offset)) == *((substring)->str)) && \
 372     ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
 373      !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))
 374
 375 #ifdef __cplusplus
 376 extern "C" {
 377 #endif
 378
 379 /* --- Unicode Type ------------------------------------------------------- */
 380
 381 typedef struct {
 382     PyObject_HEAD
 383     Py_ssize_t length;          /* Length of raw Unicode data in buffer */
 384     Py_UNICODE *str;            /* Raw Unicode buffer */
 385     long hash;                  /* Hash value; -1 if not set */
 386     PyObject *defenc;           /* (Default) Encoded version as Python
 387                                    string, or NULL; this is used for
 388                                    implementing the buffer protocol */
 389 } PyUnicodeObject;
 390
 391 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
 392
 393 #define PyUnicode_Check(op) PyObject_TypeCheck(op, &PyUnicode_Type)
 394 #define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type)
 395
 396 /* Fast access macros */
 397 #define PyUnicode_GET_SIZE(op) \
 398         (((PyUnicodeObject *)(op))->length)
 399 #define PyUnicode_GET_DATA_SIZE(op) \
 400         (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
 401 #define PyUnicode_AS_UNICODE(op) \
 402         (((PyUnicodeObject *)(op))->str)
 403 #define PyUnicode_AS_DATA(op) \
 404         ((const char *)((PyUnicodeObject *)(op))->str)
 405
 406 /* --- Constants ---------------------------------------------------------- */
 407
 408 /* This Unicode character will be used as replacement character during
 409    decoding if the errors argument is set to "replace". Note: the
 410    Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
 411    Unicode 3.0. */
 412
 413 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
 414
 415 /* === Public API ========================================================= */
 416
 417 /* --- Plain Py_UNICODE --------------------------------------------------- */
 418
 419 /* Create a Unicode Object from the Py_UNICODE buffer u of the given
 420    size.
 421
 422    u may be NULL which causes the contents to be undefined. It is the
 423    user's responsibility to fill in the needed data afterwards. Note
 424    that modifying the Unicode object contents after construction is
 425    only allowed if u was set to NULL.
 426
 427    The buffer is copied into the new object. */
 428
 429 PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
 430     const Py_UNICODE *u,        /* Unicode buffer */
 431     Py_ssize_t size             /* size of buffer */
 432     );
 433
 434 /* Return a read-only pointer to the Unicode object's internal
 435    Py_UNICODE buffer. */
 436
 437 PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
 438     PyObject *unicode           /* Unicode object */
 439     );
 440
 441 /* Get the length of the Unicode object. */
 442
 443 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
 444     PyObject *unicode           /* Unicode object */
 445     );
 446
 447 /* Get the maximum ordinal for a Unicode character. */
 448 PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
 449
 450 /* Resize an already allocated Unicode object to the new size length.
 451
 452    *unicode is modified to point to the new (resized) object and 0
 453    returned on success.
 454
 455    This API may only be called by the function which also called the
 456    Unicode constructor. The refcount on the object must be 1. Otherwise,
 457    an error is returned.
 458
 459    Error handling is implemented as follows: an exception is set, -1
 460    is returned and *unicode left untouched.
 461
 462 */
 463
 464 PyAPI_FUNC(int) PyUnicode_Resize(
 465     PyObject **unicode,         /* Pointer to the Unicode object */
 466     Py_ssize_t length           /* New length */
 467     );
 468
 469 /* Coerce obj to an Unicode object and return a reference with
 470    *incremented* refcount.
 471
 472    Coercion is done in the following way:
 473
 474    1. String and other char buffer compatible objects are decoded
 475       under the assumptions that they contain data using the current
 476       default encoding. Decoding is done in "strict" mode.
 477
 478    2. All other objects (including Unicode objects) raise an
 479       exception.
 480
 481    The API returns NULL in case of an error. The caller is responsible
 482    for decref'ing the returned objects.
 483
 484 */
 485
 486 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
 487     register PyObject *obj,     /* Object */
 488     const char *encoding,       /* encoding */
 489     const char *errors          /* error handling */
 490     );
 491
 492 /* Coerce obj to an Unicode object and return a reference with
 493    *incremented* refcount.
 494
 495    Unicode objects are passed back as-is (subclasses are converted to
 496    true Unicode objects), all other objects are delegated to
 497    PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
 498    using the default encoding as basis for decoding the object.
 499
 500    The API returns NULL in case of an error. The caller is responsible
 501    for decref'ing the returned objects.
 502
 503 */
 504
 505 PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
 506     register PyObject *obj      /* Object */
 507     );
 508
 509 /* --- wchar_t support for platforms which support it --------------------- */
 510
 511 #ifdef HAVE_WCHAR_H
 512
 513 /* Create a Unicode Object from the whcar_t buffer w of the given
 514    size.
 515
 516    The buffer is copied into the new object. */
 517
 518 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
 519     register const wchar_t *w,  /* wchar_t buffer */
 520     Py_ssize_t size             /* size of buffer */
 521     );
 522
 523 /* Copies the Unicode Object contents into the wchar_t buffer w.  At
 524    most size wchar_t characters are copied.
 525
 526    Note that the resulting wchar_t string may or may not be
 527    0-terminated.  It is the responsibility of the caller to make sure
 528    that the wchar_t string is 0-terminated in case this is required by
 529    the application.
 530
 531    Returns the number of wchar_t characters copied (excluding a
 532    possibly trailing 0-termination character) or -1 in case of an
 533    error. */
 534
 535 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
 536     PyUnicodeObject *unicode,   /* Unicode object */
 537     register wchar_t *w,        /* wchar_t buffer */
 538     Py_ssize_t size             /* size of buffer */
 539     );
 540
 541 #endif
 542
 543 /* --- Unicode ordinals --------------------------------------------------- */
 544
 545 /* Create a Unicode Object from the given Unicode code point ordinal.
 546
 547    The ordinal must be in range(0x10000) on narrow Python builds
 548    (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
 549    raised in case it is not.
 550
 551 */
 552
 553 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
 554
 555 /* === Builtin Codecs =====================================================
 556
 557    Many of these APIs take two arguments encoding and errors. These
 558    parameters encoding and errors have the same semantics as the ones
 559    of the builtin unicode() API.
 560
 561    Setting encoding to NULL causes the default encoding to be used.
 562
 563    Error handling is set by errors which may also be set to NULL
 564    meaning to use the default handling defined for the codec. Default
 565    error handling for all builtin codecs is "strict" (ValueErrors are
 566    raised).
 567
 568    The codecs all use a similar interface. Only deviations from the
 569    generic ones are documented.
 570
 571 */
 572
 573 /* --- Manage the default encoding ---------------------------------------- */
 574
 575 /* Return a Python string holding the default encoded value of the
 576    Unicode object.
 577
 578    The resulting string is cached in the Unicode object for subsequent
 579    usage by this function. The cached version is needed to implement
 580    the character buffer interface and will live (at least) as long as
 581    the Unicode object itself.
 582
 583    The refcount of the string is *not* incremented.
 584
 585    *** Exported for internal use by the interpreter only !!! ***
 586
 587 */
 588
 589 PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
 590     PyObject *, const char *);
 591
 592 /* Returns the currently active default encoding.
 593
 594    The default encoding is currently implemented as run-time settable
 595    process global.  This may change in future versions of the
 596    interpreter to become a parameter which is managed on a per-thread
 597    basis.
 598
 599  */
 600
 601 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
 602
 603 /* Sets the currently active default encoding.
 604
 605    Returns 0 on success, -1 in case of an error.
 606
 607  */
 608
 609 PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding(
 610     const char *encoding        /* Encoding name in standard form */
 611     );
 612
 613 /* --- Generic Codecs ----------------------------------------------------- */
 614
 615 /* Create a Unicode object by decoding the encoded string s of the
 616    given size. */
 617
 618 PyAPI_FUNC(PyObject*) PyUnicode_Decode(
 619     const char *s,              /* encoded string */
 620     Py_ssize_t size,            /* size of buffer */
 621     const char *encoding,       /* encoding */
 622     const char *errors          /* error handling */
 623     );
 624
 625 /* Encodes a Py_UNICODE buffer of the given size and returns a
 626    Python string object. */
 627
 628 PyAPI_FUNC(PyObject*) PyUnicode_Encode(
 629     const Py_UNICODE *s,        /* Unicode char buffer */
 630     Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
 631     const char *encoding,       /* encoding */
 632     const char *errors          /* error handling */
 633     );
 634
 635 /* Encodes a Unicode object and returns the result as Python
 636    object. */
 637
 638 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
 639     PyObject *unicode,          /* Unicode object */
 640     const char *encoding,       /* encoding */
 641     const char *errors          /* error handling */
 642     );
 643
 644 /* Encodes a Unicode object and returns the result as Python string
 645    object. */
 646
 647 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
 648     PyObject *unicode,          /* Unicode object */
 649     const char *encoding,       /* encoding */
 650     const char *errors          /* error handling */
 651     );
 652
 653 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
 654     PyObject* string            /* 256 character map */
 655    );
 656
 657
 658 /* --- UTF-7 Codecs ------------------------------------------------------- */
 659
 660 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
 661     const char *string,         /* UTF-7 encoded string */
 662     Py_ssize_t length,          /* size of string */
 663     const char *errors          /* error handling */
 664     );
 665
 666 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
 667     const Py_UNICODE *data,     /* Unicode char buffer */
 668     Py_ssize_t length,                  /* number of Py_UNICODE chars to encode */
 669     int encodeSetO,             /* force the encoder to encode characters in
 670                                    Set O, as described in RFC2152 */
 671     int encodeWhiteSpace,       /* force the encoder to encode space, tab,
 672                                    carriage return and linefeed characters */
 673     const char *errors          /* error handling */
 674     );
 675
 676 /* --- UTF-8 Codecs ------------------------------------------------------- */
 677
 678 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
 679     const char *string,         /* UTF-8 encoded string */
 680     Py_ssize_t length,          /* size of string */
 681     const char *errors          /* error handling */
 682     );
 683
 684 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
 685     const char *string,         /* UTF-8 encoded string */
 686     Py_ssize_t length,          /* size of string */
 687     const char *errors,         /* error handling */
 688     Py_ssize_t *consumed                /* bytes consumed */
 689     );
 690
 691 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
 692     PyObject *unicode           /* Unicode object */
 693     );
 694
 695 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
 696     const Py_UNICODE *data,     /* Unicode char buffer */
 697     Py_ssize_t length,                  /* number of Py_UNICODE chars to encode */
 698     const char *errors          /* error handling */
 699     );
 700
 701 /* --- UTF-16 Codecs ------------------------------------------------------ */
 702
 703 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
 704    the corresponding Unicode object.
 705
 706    errors (if non-NULL) defines the error handling. It defaults
 707    to "strict".
 708
 709    If byteorder is non-NULL, the decoder starts decoding using the
 710    given byte order:
 711
 712         *byteorder == -1: little endian
 713         *byteorder == 0:  native order
 714         *byteorder == 1:  big endian
 715
 716    In native mode, the first two bytes of the stream are checked for a
 717    BOM mark. If found, the BOM mark is analysed, the byte order
 718    adjusted and the BOM skipped.  In the other modes, no BOM mark
 719    interpretation is done. After completion, *byteorder is set to the
 720    current byte order at the end of input data.
 721
 722    If byteorder is NULL, the codec starts in native order mode.
 723
 724 */
 725
 726 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
 727     const char *string,         /* UTF-16 encoded string */
 728     Py_ssize_t length,          /* size of string in bytes */
 729     const char *errors,         /* error handling; NULL means "strict" */
 730     int *byteorder              /* pointer to byteorder to use
 731                                    0=native;-1=LE,1=BE; updated on
 732                                    exit; NULL starts in native order */
 733     );
 734
 735 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
 736     const char *string,         /* UTF-16 encoded string */
 737     Py_ssize_t length,          /* size of string in bytes */
 738     const char *errors,         /* error handling */
 739     int *byteorder,             /* pointer to byteorder to use
 740                                    0=native;-1=LE,1=BE; updated on
 741                                    exit */
 742     Py_ssize_t *consumed                /* bytes consumed (presumably an out parameter; confirm) */
 743     );
 744
 745 /* Returns a Python string holding the UTF-16 encoding of the Unicode
 746    object, in native byte order. The string always starts with a BOM.  */
 747
 748 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
 749     PyObject *unicode           /* Unicode object */
 750     );
 751
 752 /* Returns a Python string object holding the UTF-16 encoded value of
 753    the Unicode data.
 754
 755    The byteorder argument selects the byte order in which the output
 756    is written:
 757
 758    byteorder == -1: little endian
 759    byteorder == 0:  native byte order (writes a BOM mark)
 760    byteorder == 1:  big endian
 761
 762    If byteorder is 0, the output string will always start with the
 763    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
 764    prepended.
 765
 766    Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
 767    UCS-2. This trick makes it possible to add full UTF-16 capabilities
 768    at a later point without compromising the APIs.
 769
 770 */
 771
 772 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
 773     const Py_UNICODE *data,     /* Unicode char buffer */
 774     Py_ssize_t length,                  /* number of Py_UNICODE chars in data to encode */
 775     const char *errors,         /* error handling */
 776     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
 777     );
 778
 779 /* --- Unicode-Escape Codecs ---------------------------------------------- */
 780
 781 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
 782     const char *string,         /* Unicode-Escape encoded string */
 783     Py_ssize_t length,          /* size of string in bytes */
 784     const char *errors          /* error handling */
 785     );
 786
 787 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
 788     PyObject *unicode           /* Unicode object */
 789     );
 790
 791 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
 792     const Py_UNICODE *data,     /* Unicode char buffer */
 793     Py_ssize_t length                   /* Number of Py_UNICODE chars in data to encode */
 794     );
 795
 796 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
 797
 798 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
 799     const char *string,         /* Raw-Unicode-Escape encoded string */
 800     Py_ssize_t length,          /* size of string in bytes */
 801     const char *errors          /* error handling */
 802     );
 803
 804 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
 805     PyObject *unicode           /* Unicode object */
 806     );
 807
 808 PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
 809     const Py_UNICODE *data,     /* Unicode char buffer */
 810     Py_ssize_t length                   /* Number of Py_UNICODE chars in data to encode */
 811     );
 812
 813 /* --- Unicode Internal Codec ---------------------------------------------
 814
 815     Only for internal use in _codecsmodule.c */
 816
 817 PyObject *_PyUnicode_DecodeUnicodeInternal(  /* note: declared without PyAPI_FUNC */
 818     const char *string,         /* encoded input buffer */
 819     Py_ssize_t length,          /* size of string in bytes */
 820     const char *errors          /* error handling */
 821     );
 822
 823 /* --- Latin-1 Codecs -----------------------------------------------------
 824
 825    Note: Latin-1 corresponds to the first 256 Unicode ordinals.
 826
 827 */
 828
 829 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
 830     const char *string,         /* Latin-1 encoded string */
 831     Py_ssize_t length,          /* size of string in bytes */
 832     const char *errors          /* error handling */
 833     );
 834
 835 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
 836     PyObject *unicode           /* Unicode object */
 837     );
 838
 839 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
 840     const Py_UNICODE *data,     /* Unicode char buffer */
 841     Py_ssize_t length,                  /* Number of Py_UNICODE chars in data to encode */
 842     const char *errors          /* error handling */
 843     );
 844
 845 /* --- ASCII Codecs -------------------------------------------------------
 846
 847    Only 7-bit ASCII data is accepted. All other codes generate errors.
 848
 849 */
 850
 851 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
 852     const char *string,         /* ASCII encoded string */
 853     Py_ssize_t length,          /* size of string in bytes */
 854     const char *errors          /* error handling */
 855     );
 856
 857 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
 858     PyObject *unicode           /* Unicode object */
 859     );
 860
 861 PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
 862     const Py_UNICODE *data,     /* Unicode char buffer */
 863     Py_ssize_t length,                  /* Number of Py_UNICODE chars in data to encode */
 864     const char *errors          /* error handling */
 865     );
 866
 867 /* --- Character Map Codecs -----------------------------------------------
 868
 869    This codec uses mappings to encode and decode characters.
 870
 871    Decoding mappings must map single string characters to single
 872    Unicode characters, integers (which are then interpreted as Unicode
 873    ordinals) or None (meaning "undefined mapping" and causing an
 874    error).
 875
 876    Encoding mappings must map single Unicode characters to single
 877    string characters, integers (which are then interpreted as Latin-1
 878    ordinals) or None (meaning "undefined mapping" and causing an
 879    error).
 880
 881    If a character lookup fails with a LookupError, the character is
 882    copied as-is, meaning that its ordinal value will be interpreted as
 883    a Unicode or Latin-1 ordinal, respectively. Because of this, mappings
 884    only need to contain entries for characters that map to different
 885    code points.
 886
 887 */
 888
 889 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
 890     const char *string,         /* Encoded string */
 891     Py_ssize_t length,          /* size of string in bytes */
 892     PyObject *mapping,          /* character mapping
 893                                    (char ordinal -> unicode ordinal) */
 894     const char *errors          /* error handling */
 895     );
 896
 897 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
 898     PyObject *unicode,          /* Unicode object */
 899     PyObject *mapping           /* character mapping
 900                                    (unicode ordinal -> char ordinal) */
 901     );
 902
 903 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
 904     const Py_UNICODE *data,     /* Unicode char buffer */
 905     Py_ssize_t length,          /* Number of Py_UNICODE chars in data to encode */
 906     PyObject *mapping,          /* character mapping
 907                                    (unicode ordinal -> char ordinal) */
 908     const char *errors          /* error handling */
 909     );
 910
 911 /* Translate a Py_UNICODE buffer of the given length by applying a
 912    character mapping table to it and return the resulting Unicode
 913    object.
 914
 915    The mapping table must map Unicode ordinal integers to Unicode
 916    ordinal integers or None (causing deletion of the character).
 917
 918    Mapping tables may be dictionaries or sequences. Unmapped character
 919    ordinals (ones which cause a LookupError) are left untouched and
 920    are copied as-is.
 921
 922 */
 923
 924 PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
 925     const Py_UNICODE *data,     /* Unicode char buffer */
 926     Py_ssize_t length,                  /* Number of Py_UNICODE chars to translate */
 927     PyObject *table,            /* Translate table */
 928     const char *errors          /* error handling */
 929     );
 930
 931 #ifdef MS_WIN32
 932
 933 /* --- MBCS codecs for Windows (declared only when MS_WIN32 is defined) --- */
 934
 935 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
 936     const char *string,         /* MBCS encoded string */
 937     Py_ssize_t length,              /* size of string in bytes */
 938     const char *errors          /* error handling */
 939     );
 940
 941 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
 942     const char *string,         /* MBCS encoded string */
 943     Py_ssize_t length,          /* size of string in bytes */
 944     const char *errors,         /* error handling */
 945     Py_ssize_t *consumed        /* bytes consumed (presumably an out parameter; confirm) */
 946     );
 947
 948 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
 949     PyObject *unicode           /* Unicode object */
 950     );
 951
 952 PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
 953     const Py_UNICODE *data,     /* Unicode char buffer */
 954     Py_ssize_t length,              /* Number of Py_UNICODE chars in data to encode */
 955     const char *errors          /* error handling */
 956     );
 957
 958 #endif /* MS_WIN32 */
 959
 960 /* --- Decimal Encoder ---------------------------------------------------- */
 961
 962 /* Takes a Unicode string holding a decimal value and writes it into
 963    an output buffer using standard ASCII digit codes.
 964
 965    The output buffer has to provide at least length+1 bytes of storage
 966    area. The output string is 0-terminated.
 967
 968    The encoder converts whitespace to ' ', decimal characters to their
 969    corresponding ASCII digit and all other Latin-1 characters except
 970    \0 as-is. Characters outside this range (Unicode ordinals 1-255)
 971    are treated as errors. This includes embedded NUL characters.
 972
 973    Error handling is defined by the errors argument:
 974
 975       NULL or "strict": raise a ValueError
 976       "ignore": ignore the wrong characters (these are not copied to the
 977                 output buffer)
 978       "replace": replaces illegal characters with '?'
 979
 980    Returns 0 on success, -1 on failure.
 981
 982 */
 983
 984 PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
 985     Py_UNICODE *s,              /* Unicode buffer */
 986     Py_ssize_t length,                  /* Number of Py_UNICODE chars to encode */
 987     char *output,               /* Output buffer; must have size >= length+1 */
 988     const char *errors          /* error handling */
 989     );
 990
 991 /* --- Methods & Slots ----------------------------------------------------
 992
 993    These are capable of handling Unicode objects and strings on input
 994    (we refer to them as strings in the descriptions) and return
 995    Unicode objects or integers as appropriate. */
 996
 997 /* Concatenate two strings, giving a new Unicode string. */
 998
 999 PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1000     PyObject *left,             /* Left string */
1001     PyObject *right             /* Right string */
1002     );
1003
1004 /* Split a string giving a list of Unicode strings.
1005
1006    If sep is NULL, splitting will be done at all whitespace
1007    substrings. Otherwise, splits occur at the given separator.
1008
1009    At most maxsplit splits will be done. If maxsplit is negative, no
1010    limit is set.
1011    Separators are not included in the resulting list.
1012
1013 */
1014
1015 PyAPI_FUNC(PyObject*) PyUnicode_Split(
1016     PyObject *s,                /* String to split */
1017     PyObject *sep,              /* String separator; NULL = split at whitespace */
1018     Py_ssize_t maxsplit         /* Maxsplit count; negative = no limit */
1019     );
1020
1021 /* Split a string at line breaks, giving a list of Unicode strings.
1022
1023    CRLF is considered to be one line break. Line breaks are not
1024    included in the resulting list unless keepends is true. */
1025
1026 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1027     PyObject *s,                /* String to split */
1028     int keepends                /* If true, line end markers are included */
1029     );
1030
1031 /* Partition the string s using the given separator sep. */
1032
1033 PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1034     PyObject *s,                /* String to partition */
1035     PyObject *sep               /* String separator */
1036     );
1037
1038 /* Partition the string s using the given separator sep, searching
1039    from the end of the string. */
1040
1041 PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1042     PyObject *s,                /* String to partition */
1043     PyObject *sep               /* String separator */
1044     );
1045
1046 /* Split a string giving a list of Unicode strings.
1047
1048    If sep is NULL, splitting will be done at all whitespace
1049    substrings. Otherwise, splits occur at the given separator.
1050
1051    At most maxsplit splits will be done; if maxsplit is negative, no
1052    limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1053    the end of the string.
1054
1055    Separators are not included in the resulting list.
1056
1057 */
1058
1059 PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1060     PyObject *s,                /* String to split */
1061     PyObject *sep,              /* String separator; NULL = split at whitespace */
1062     Py_ssize_t maxsplit         /* Maxsplit count; negative = no limit */
1063     );
1064
1065 /* Translate a string by applying a character mapping table to it and
1066    return the resulting Unicode object.
1067
1068    The mapping table must map Unicode ordinal integers to Unicode
1069    ordinal integers or None (causing deletion of the character).
1070
1071    Mapping tables may be dictionaries or sequences. Unmapped character
1072    ordinals (ones which cause a LookupError) are left untouched and
1073    are copied as-is.
1074
1075 */
1076
1077 PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1078     PyObject *str,              /* String to translate */
1079     PyObject *table,            /* Translate table (dictionary or sequence) */
1080     const char *errors          /* error handling */
1081     );
1082
1083 /* Join a sequence of strings using the given separator and return
1084    the resulting Unicode string. */
1085
1086 PyAPI_FUNC(PyObject*) PyUnicode_Join(
1087     PyObject *separator,        /* Separator string */
1088     PyObject *seq               /* Sequence of strings to join */
1089     );
1090
1091 /* Return 1 if substr matches str[start:end] at the tail end selected by
1092    direction (-1: prefix, +1: suffix), 0 otherwise. */
1093
1094 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1095     PyObject *str,              /* String */
1096     PyObject *substr,           /* Prefix or Suffix string */
1097     Py_ssize_t start,           /* Start index */
1098     Py_ssize_t end,             /* Stop index */
1099     int direction               /* Tail end: -1 prefix, +1 suffix */
1100     );
1101
1102 /* Return the first position of substr in str[start:end] using the
1103    given search direction, or -1 if not found. -2 is returned in case
1104    an error occurred and an exception is set. */
1105
1106 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1107     PyObject *str,              /* String to search in */
1108     PyObject *substr,           /* Substring to find */
1109     Py_ssize_t start,           /* Start index */
1110     Py_ssize_t end,             /* Stop index */
1111     int direction               /* Find direction: +1 forward, -1 backward */
1112     );
1113
1114 /* Count the number of occurrences of substr in str[start:end]. */
1115
1116 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1117     PyObject *str,              /* String to search in */
1118     PyObject *substr,           /* Substring to count */
1119     Py_ssize_t start,           /* Start index */
1120     Py_ssize_t end              /* Stop index */
1121     );
1122
1123 /* Replace at most maxcount occurrences of substr in str with replstr
1124    and return the resulting Unicode object. */
1125
1126 PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1127     PyObject *str,              /* String */
1128     PyObject *substr,           /* Substring to find */
1129     PyObject *replstr,          /* Replacement for each occurrence of substr */
1130     Py_ssize_t maxcount         /* Max. number of replacements to apply;
1131                                    -1 = all */
1132     );
1133
1134 /* Compare two strings and return -1, 0 or 1 for less than, equal or
1135    greater than, respectively. */
1136
1137 PyAPI_FUNC(int) PyUnicode_Compare(
1138     PyObject *left,             /* Left string */
1139     PyObject *right             /* Right string */
1140     );
1141
1142 /* Apply an argument tuple or dictionary to a format string and return
1143    the resulting Unicode string. */
1144
1145 PyAPI_FUNC(PyObject *) PyUnicode_Format(
1146     PyObject *format,           /* Format string */
1147     PyObject *args              /* Argument tuple or dictionary */
1148     );
1149
1150 /* Checks whether element is contained in container and returns 1/0
1151    accordingly.
1152
1153    element has to coerce to a one-element Unicode string. -1 is
1154    returned in case of an error. */
1155
1156 PyAPI_FUNC(int) PyUnicode_Contains(
1157     PyObject *container,        /* Container string */
1158     PyObject *element           /* Element string */
1159     );
1160
1161 /* Externally visible for str.strip(unicode) */
1162 PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1163     PyUnicodeObject *self,      /* Unicode object operated on */
1164     int striptype,              /* strip mode; valid values are not defined in this section -- see the implementation */
1165     PyObject *sepobj            /* NOTE(review): presumably the characters to strip -- confirm */
1166     );
1167
1168 /* === Characters Type APIs =============================================== */
1169
1170 /* These should not be used directly. Use the Py_UNICODE_IS* and
1171    Py_UNICODE_TO* macros instead.
1172
1173    These APIs are implemented in Objects/unicodectype.c.
1174
1175 */
1176
1177 PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1178     Py_UNICODE ch       /* Unicode character to test */
1179     );
1180
1181 PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1182     Py_UNICODE ch       /* Unicode character to test */
1183     );
1184
1185 PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1186     Py_UNICODE ch       /* Unicode character to test */
1187     );
1188
1189 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1190     Py_UNICODE ch       /* Unicode character to test (passed by value) */
1191     );
1192
1193 PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1194     Py_UNICODE ch       /* Unicode character to test (passed by value) */
1195     );
1196
1197 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase(
1198     Py_UNICODE ch       /* Unicode character to convert */
1199     );
1200
1201 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase(
1202     Py_UNICODE ch       /* Unicode character to convert */
1203     );
1204
1205 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase(
1206     Py_UNICODE ch       /* Unicode character to convert */
1207     );
1208
1209 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1210     Py_UNICODE ch       /* Unicode character to convert; result is an int */
1211     );
1212
1213 PyAPI_FUNC(int) _PyUnicode_ToDigit(
1214     Py_UNICODE ch       /* Unicode character to convert; result is an int */
1215     );
1216
1217 PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1218     Py_UNICODE ch       /* Unicode character to convert; result is a double */
1219     );
1220
1221 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1222     Py_UNICODE ch       /* Unicode character to test */
1223     );
1224
1225 PyAPI_FUNC(int) _PyUnicode_IsDigit(
1226     Py_UNICODE ch       /* Unicode character to test */
1227     );
1228
1229 PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1230     Py_UNICODE ch       /* Unicode character to test */
1231     );
1232
1233 PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1234     Py_UNICODE ch       /* Unicode character to test */
1235     );
1236
1237 #ifdef __cplusplus
1238 }
1239 #endif
1240 #endif /* Py_USING_UNICODE */
1241 #endif /* !Py_UNICODEOBJECT_H */