Include/unicodeobject.h

   1 #ifndef Py_UNICODEOBJECT_H
   2 #define Py_UNICODEOBJECT_H
   3
   4 /*
   5
   6 Unicode implementation based on original code by Fredrik Lundh,
   7 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
   8 Unicode Integration Proposal (see file Misc/unicode.txt).
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12
  13  Original header:
  14  --------------------------------------------------------------------
  15
  16  * Yet another Unicode string type for Python.  This type supports the
  17  * 16-bit Basic Multilingual Plane (BMP) only.
  18  *
  19  * Written by Fredrik Lundh, January 1999.
  20  *
  21  * Copyright (c) 1999 by Secret Labs AB.
  22  * Copyright (c) 1999 by Fredrik Lundh.
  23  *
  24  * fredrik@pythonware.com
  25  * http://www.pythonware.com
  26  *
  27  * --------------------------------------------------------------------
  28  * This Unicode String Type is
  29  *
  30  * Copyright (c) 1999 by Secret Labs AB
  31  * Copyright (c) 1999 by Fredrik Lundh
  32  *
  33  * By obtaining, using, and/or copying this software and/or its
  34  * associated documentation, you agree that you have read, understood,
  35  * and will comply with the following terms and conditions:
  36  *
  37  * Permission to use, copy, modify, and distribute this software and its
  38  * associated documentation for any purpose and without fee is hereby
  39  * granted, provided that the above copyright notice appears in all
  40  * copies, and that both that copyright notice and this permission notice
  41  * appear in supporting documentation, and that the name of Secret Labs
  42  * AB or the author not be used in advertising or publicity pertaining to
  43  * distribution of the software without specific, written prior
  44  * permission.
  45  *
  46  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  47  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  48  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  49  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  50  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  51  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  52  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  53  * -------------------------------------------------------------------- */
  54
  55 #include <ctype.h>
  56
  57 /* === Internal API ======================================================= */
  58
  59 /* --- Internal Unicode Format -------------------------------------------- */
  60
  61 #ifndef Py_USING_UNICODE
  62 /* Py_USING_UNICODE is not defined: both type checks expand to the constant 0. */
  63 #define PyUnicode_Check(op)                 0
  64 #define PyUnicode_CheckExact(op)            0
  65
  66 #else
  67
  68 /* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
  69    properly set, but the default rules below don't set it.  I'll
  70    sort this out some other day -- fredrik@pythonware.com */
  71
  72 #ifndef Py_UNICODE_SIZE
  73 #error Must define Py_UNICODE_SIZE
  74 #endif
  75
  76 /* Setting Py_UNICODE_WIDE enables UCS-4 storage.  Otherwise, Unicode
  77    strings are stored as UCS-2 (with limited support for UTF-16) */
  78
  79 #if Py_UNICODE_SIZE >= 4
  80 #define Py_UNICODE_WIDE
  81 #endif
  82
  83 /* Set these flags if the platform has "wchar.h", "wctype.h" and the
  84    wchar_t type is a 16-bit unsigned type */
  85 /* #define HAVE_WCHAR_H */
  86 /* #define HAVE_USABLE_WCHAR_T */
  87
  88 /* Defaults for various platforms */
  89 #ifndef PY_UNICODE_TYPE
  90
  91 /* Windows has a usable wchar_t type (unless we're using UCS-4) */
  92 # if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
  93 #  define HAVE_USABLE_WCHAR_T
  94 #  define PY_UNICODE_TYPE wchar_t
  95 # endif
  96
  97 # if defined(Py_UNICODE_WIDE)
  98 #  define PY_UNICODE_TYPE Py_UCS4
  99 # endif
 100
 101 #endif
 102
 103 /* If the compiler provides a wchar_t type we try to support it
 104    through the interface functions PyUnicode_FromWideChar() and
 105    PyUnicode_AsWideChar(). */
 106
 107 #ifdef HAVE_USABLE_WCHAR_T
 108 # ifndef HAVE_WCHAR_H
 109 #  define HAVE_WCHAR_H
 110 # endif
 111 #endif
 112
 113 #ifdef HAVE_WCHAR_H
 114 /* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
 115 # ifdef _HAVE_BSDI
 116 #  include <time.h>
 117 # endif
 118 #  include <wchar.h>
 119 #endif
 120
 121 /*
 122  * Py_UCS4 is unsigned int if SIZEOF_INT >= 4, else unsigned long if SIZEOF_LONG >= 4.
 123  * Use it when a UTF-16 surrogate pair must be represented as a single unsigned integer.
 124  */
 125 #if SIZEOF_INT >= 4
 126 typedef unsigned int Py_UCS4;
 127 #elif SIZEOF_LONG >= 4
 128 typedef unsigned long Py_UCS4;
 129 #endif
 130
 131 typedef PY_UNICODE_TYPE Py_UNICODE;
 132
 133 /* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
 134
 135 /* Unicode API names are mangled to ensure that UCS-2 and UCS-4 builds
 136    produce different external names and thus cause import errors when
 137    Python interpreters and extensions compiled with different Unicode
 138    width assumptions are combined. */
 139
 140 #ifndef Py_UNICODE_WIDE
 141
 142 # define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
 143 # define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
 144 # define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
 145 # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
 146 # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
 147 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
 148 # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
 149 # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
 150 # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
 151 # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
 152 # define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
 153 # define PyUnicode_Compare PyUnicodeUCS2_Compare
 154 # define PyUnicode_Concat PyUnicodeUCS2_Concat
 155 # define PyUnicode_Contains PyUnicodeUCS2_Contains
 156 # define PyUnicode_Count PyUnicodeUCS2_Count
 157 # define PyUnicode_Decode PyUnicodeUCS2_Decode
 158 # define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
 159 # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
 160 # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
 161 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
 162 # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
 163 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
 164 # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
 165 # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
 166 # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
 167 # define PyUnicode_Encode PyUnicodeUCS2_Encode
 168 # define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
 169 # define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
 170 # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
 171 # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
 172 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
 173 # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
 174 # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
 175 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
 176 # define PyUnicode_Find PyUnicodeUCS2_Find
 177 # define PyUnicode_Format PyUnicodeUCS2_Format
 178 # define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
 179 # define PyUnicode_FromObject PyUnicodeUCS2_FromObject
 180 # define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
 181 # define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
 182 # define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
 183 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
 184 # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
 185 # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
 186 # define PyUnicode_Join PyUnicodeUCS2_Join
 187 # define PyUnicode_Partition PyUnicodeUCS2_Partition
 188 # define PyUnicode_RPartition PyUnicodeUCS2_RPartition
 189 # define PyUnicode_RSplit PyUnicodeUCS2_RSplit
 190 # define PyUnicode_Replace PyUnicodeUCS2_Replace
 191 # define PyUnicode_Resize PyUnicodeUCS2_Resize
 192 # define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
 193 # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
 194 # define PyUnicode_Split PyUnicodeUCS2_Split
 195 # define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
 196 # define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
 197 # define PyUnicode_Translate PyUnicodeUCS2_Translate
 198 # define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
 199 # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
 200 # define _PyUnicode_Fini _PyUnicodeUCS2_Fini
 201 # define _PyUnicode_Init _PyUnicodeUCS2_Init
 202 # define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha
 203 # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit
 204 # define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit
 205 # define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
 206 # define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
 207 # define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
 208 # define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
 209 # define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
 210 # define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
 211 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
 212 # define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit
 213 # define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase
 214 # define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric
 215 # define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase
 216 # define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase
 217
 218 #else
 219
 220 # define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
 221 # define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
 222 # define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
 223 # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
 224 # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
 225 # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
 226 # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
 227 # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
 228 # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
 229 # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
 230 # define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
 231 # define PyUnicode_Compare PyUnicodeUCS4_Compare
 232 # define PyUnicode_Concat PyUnicodeUCS4_Concat
 233 # define PyUnicode_Contains PyUnicodeUCS4_Contains
 234 # define PyUnicode_Count PyUnicodeUCS4_Count
 235 # define PyUnicode_Decode PyUnicodeUCS4_Decode
 236 # define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
 237 # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
 238 # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
 239 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
 240 # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
 241 # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
 242 # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
 243 # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
 244 # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
 245 # define PyUnicode_Encode PyUnicodeUCS4_Encode
 246 # define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
 247 # define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
 248 # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
 249 # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
 250 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
 251 # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
 252 # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
 253 # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
 254 # define PyUnicode_Find PyUnicodeUCS4_Find
 255 # define PyUnicode_Format PyUnicodeUCS4_Format
 256 # define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
 257 # define PyUnicode_FromObject PyUnicodeUCS4_FromObject
 258 # define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
 259 # define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
 260 # define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
 261 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
 262 # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
 263 # define PyUnicode_GetSize PyUnicodeUCS4_GetSize
 264 # define PyUnicode_Join PyUnicodeUCS4_Join
 265 # define PyUnicode_Partition PyUnicodeUCS4_Partition
 266 # define PyUnicode_RPartition PyUnicodeUCS4_RPartition
 267 # define PyUnicode_RSplit PyUnicodeUCS4_RSplit
 268 # define PyUnicode_Replace PyUnicodeUCS4_Replace
 269 # define PyUnicode_Resize PyUnicodeUCS4_Resize
 270 # define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
 271 # define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
 272 # define PyUnicode_Split PyUnicodeUCS4_Split
 273 # define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
 274 # define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
 275 # define PyUnicode_Translate PyUnicodeUCS4_Translate
 276 # define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
 277 # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
 278 # define _PyUnicode_Fini _PyUnicodeUCS4_Fini
 279 # define _PyUnicode_Init _PyUnicodeUCS4_Init
 280 # define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha
 281 # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit
 282 # define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit
 283 # define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
 284 # define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
 285 # define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
 286 # define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
 287 # define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
 288 # define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
 289 # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
 290 # define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit
 291 # define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase
 292 # define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric
 293 # define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase
 294 # define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase
 295
 296
 297 #endif
 298
 299 /* --- Internal Unicode Operations ---------------------------------------- */
 300
 301 /* If you want Python to use the compiler's wctype.h functions instead
 302    of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
 303    configure Python using --with-wctype-functions.  This reduces the
 304    interpreter's code size. */
 305
 306 #if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
 307
 308 #include <wctype.h>
 309
 310 #define Py_UNICODE_ISSPACE(ch) iswspace(ch)
 311
 312 #define Py_UNICODE_ISLOWER(ch) iswlower(ch)
 313 #define Py_UNICODE_ISUPPER(ch) iswupper(ch)
 314 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
 315 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
 316
 317 #define Py_UNICODE_TOLOWER(ch) towlower(ch)
 318 #define Py_UNICODE_TOUPPER(ch) towupper(ch)
 319 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
 320
 321 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
 322 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
 323 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
 324
 325 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
 326 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
 327 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
 328
 329 #define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
 330
 331 #else
 332
 333 #define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
 334
 335 #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
 336 #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
 337 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
 338 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
 339
 340 #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
 341 #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
 342 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
 343
 344 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
 345 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
 346 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
 347
 348 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
 349 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
 350 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
 351
 352 #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
 353
 354 #endif
 355 /* True if any of the ISALPHA/ISDECIMAL/ISDIGIT/ISNUMERIC tests holds; ch appears four times in the expansion, so pass an expression without side effects. */
 356 #define Py_UNICODE_ISALNUM(ch) \
 357        (Py_UNICODE_ISALPHA(ch) || \
 358         Py_UNICODE_ISDECIMAL(ch) || \
 359         Py_UNICODE_ISDIGIT(ch) || \
 360         Py_UNICODE_ISNUMERIC(ch))
 361 /* Copy length Py_UNICODE items from source to target via Py_MEMCPY (byte count is length*sizeof(Py_UNICODE)). */
 362 #define Py_UNICODE_COPY(target, source, length)                         \
 363         Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
 364 /* Store value into the first length items of target; target, value and length are each evaluated exactly once, in that order. */
 365 #define Py_UNICODE_FILL(target, value, length) do\
 366     {Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value); Py_ssize_t i_, n_ = (length);\
 367         for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;\
 368     } while (0)
 369
 370 /* Check whether substring matches string at offset (both are accessed via ->str and ->length): the first and last
 371    items are compared before memcmp() runs.  The offset must be valid, and the substring must not be empty. */
 372 #define Py_UNICODE_MATCH(string, offset, substring) \
 373     ((*((string)->str + (offset)) == *((substring)->str)) && \
 374     ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
 375      !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))
 376
 377 #ifdef __cplusplus
 378 extern "C" {
 379 #endif
 380
 381 /* --- Unicode Type ------------------------------------------------------- */
 382 /* Object layout for Unicode strings; the fast access macros below read length and str directly. */
 383 typedef struct {
 384     PyObject_HEAD
 385     Py_ssize_t length;          /* Number of Py_UNICODE items in str */
 386     Py_UNICODE *str;            /* Raw Unicode buffer */
 387     long hash;                  /* Hash value; -1 if not set */
 388     PyObject *defenc;           /* (Default) Encoded version as Python
 389                                    string, or NULL; this is used for
 390                                    implementing the buffer protocol */
 391 } PyUnicodeObject;
 392
 393 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
 394 /* PyUnicode_Check goes through PyObject_TypeCheck(); PyUnicode_CheckExact requires ob_type to be exactly &PyUnicode_Type. */
 395 #define PyUnicode_Check(op) PyObject_TypeCheck(op, &PyUnicode_Type)
 396 #define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type)
 397
 398 /* Fast access macros: op is cast to PyUnicodeObject * without any type check */
 399 #define PyUnicode_GET_SIZE(op) \
 400         (((PyUnicodeObject *)(op))->length)   /* the length field (count of Py_UNICODE items) */
 401 #define PyUnicode_GET_DATA_SIZE(op) \
 402         (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))   /* length scaled by sizeof(Py_UNICODE) */
 403 #define PyUnicode_AS_UNICODE(op) \
 404         (((PyUnicodeObject *)(op))->str)   /* the Py_UNICODE buffer itself */
 405 #define PyUnicode_AS_DATA(op) \
 406         ((const char *)((PyUnicodeObject *)(op))->str)   /* same buffer viewed as const char * */
 407
 408 /* --- Constants ---------------------------------------------------------- */
 409
 410 /* This Unicode character will be used as replacement character during
 411    decoding if the errors argument is set to "replace". Note: the
 412    Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
 413    Unicode 3.0. */
 414
 415 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
 416
 417 /* === Public API ========================================================= */
 418
 419 /* --- Plain Py_UNICODE --------------------------------------------------- */
 420
 421 /* Create a Unicode Object from the Py_UNICODE buffer u of the given
 422    size.
 423
 424    u may be NULL which causes the contents to be undefined. It is the
 425    user's responsibility to fill in the needed data afterwards. Note
 426    that modifying the Unicode object contents after construction is
 427    only allowed if u was set to NULL.
 428
 429    The buffer is copied into the new object. */
 430
 431 PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
 432     const Py_UNICODE *u,        /* Unicode buffer */
 433     Py_ssize_t size             /* size of buffer */
 434     );
 435
 436 /* Return a read-only pointer to the Unicode object's internal
 437    Py_UNICODE buffer. */
 438
 439 PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
 440     PyObject *unicode           /* Unicode object */
 441     );
 442
 443 /* Get the length of the Unicode object. */
 444
 445 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
 446     PyObject *unicode           /* Unicode object */
 447     );
 448
 449 /* Get the maximum ordinal for a Unicode character. */
 450 PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
 451
 452 /* Resize an already allocated Unicode object to the new size length.
 453
 454    *unicode is modified to point to the new (resized) object and 0
 455    returned on success.
 456
 457    This API may only be called by the function which also called the
 458    Unicode constructor. The refcount on the object must be 1. Otherwise,
 459    an error is returned.
 460
 461    Error handling is implemented as follows: an exception is set, -1
 462    is returned and *unicode left untouched.
 463
 464 */
 465
 466 PyAPI_FUNC(int) PyUnicode_Resize(
 467     PyObject **unicode,         /* Pointer to the Unicode object */
 468     Py_ssize_t length           /* New length */
 469     );
 470
 471 /* Coerce obj to a Unicode object and return a reference with
 472    *incremented* refcount.
 473
 474    Coercion is done in the following way:
 475
 476    1. String and other char buffer compatible objects are decoded
 477       under the assumptions that they contain data using the current
 478       default encoding. Decoding is done in "strict" mode.
 479
 480    2. All other objects (including Unicode objects) raise an
 481       exception.
 482
 483    The API returns NULL in case of an error. The caller is responsible
 484    for decref'ing the returned objects.
 485
 486 */
 487
 488 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
 489     register PyObject *obj,     /* Object */
 490     const char *encoding,       /* encoding */
 491     const char *errors          /* error handling */
 492     );
 493
 494 /* Coerce obj to a Unicode object and return a reference with
 495    *incremented* refcount.
 496
 497    Unicode objects are passed back as-is (subclasses are converted to
 498    true Unicode objects), all other objects are delegated to
 499    PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
 500    using the default encoding as basis for decoding the object.
 501
 502    The API returns NULL in case of an error. The caller is responsible
 503    for decref'ing the returned objects.
 504
 505 */
 506
 507 PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
 508     register PyObject *obj      /* Object */
 509     );
 510
 511 /* --- wchar_t support for platforms which support it --------------------- */
 512
 513 #ifdef HAVE_WCHAR_H
 514
 515 /* Create a Unicode Object from the wchar_t buffer w of the given
 516    size.
 517
 518    The buffer is copied into the new object. */
 519
 520 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
 521     register const wchar_t *w,  /* wchar_t buffer */
 522     Py_ssize_t size             /* size of buffer */
 523     );
 524
 525 /* Copies the Unicode Object contents into the wchar_t buffer w.  At
 526    most size wchar_t characters are copied.
 527
 528    Note that the resulting wchar_t string may or may not be
 529    0-terminated.  It is the responsibility of the caller to make sure
 530    that the wchar_t string is 0-terminated in case this is required by
 531    the application.
 532
 533    Returns the number of wchar_t characters copied (excluding a
 534    possibly trailing 0-termination character) or -1 in case of an
 535    error. */
 536
 537 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
 538     PyUnicodeObject *unicode,   /* Unicode object */
 539     register wchar_t *w,        /* wchar_t buffer */
 540     Py_ssize_t size             /* size of buffer */
 541     );
 542
 543 #endif
 544
 545 /* --- Unicode ordinals --------------------------------------------------- */
 546
 547 /* Create a Unicode Object from the given Unicode code point ordinal.
 548
 549    The ordinal must be in range(0x10000) on narrow Python builds
 550    (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
 551    raised in case it is not.
 552
 553 */
 554
 555 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
 556
 557 /* === Builtin Codecs =====================================================
 558
 559    Many of these APIs take two arguments encoding and errors. These
 560    parameters encoding and errors have the same semantics as the ones
 561    of the builtin unicode() API.
 562
 563    Setting encoding to NULL causes the default encoding to be used.
 564
 565    Error handling is set by errors which may also be set to NULL
 566    meaning to use the default handling defined for the codec. Default
 567    error handling for all builtin codecs is "strict" (ValueErrors are
 568    raised).
 569
 570    The codecs all use a similar interface. Only deviations from the
 571    generic ones are documented.
 572
 573 */
 574
 575 /* --- Manage the default encoding ---------------------------------------- */
 576
 577 /* Return a Python string holding the default encoded value of the
 578    Unicode object.
 579
 580    The resulting string is cached in the Unicode object for subsequent
 581    usage by this function. The cached version is needed to implement
 582    the character buffer interface and will live (at least) as long as
 583    the Unicode object itself.
 584
 585    The refcount of the string is *not* incremented.
 586
 587    *** Exported for internal use by the interpreter only !!! ***
 588
 589 */
 590
 591 PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
 592     PyObject *, const char *);
 593
 594 /* Returns the currently active default encoding.
 595
 596    The default encoding is currently implemented as a run-time settable
 597    process global.  This may change in future versions of the
 598    interpreter to become a parameter which is managed on a per-thread
 599    basis.
 600
 601  */
 602
 603 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
 604
 605 /* Sets the currently active default encoding.
 606
 607    Returns 0 on success, -1 in case of an error.
 608
 609  */
 610
 611 PyAPI_FUNC(int) PyUnicode_SetDefaultEncoding(
 612     const char *encoding        /* Encoding name in standard form */
 613     );
 614
 615 /* --- Generic Codecs ----------------------------------------------------- */
 616
 617 /* Create a Unicode object by decoding the encoded string s of the
 618    given size. */
 619
 620 PyAPI_FUNC(PyObject*) PyUnicode_Decode(
 621     const char *s,              /* encoded string */
 622     Py_ssize_t size,            /* size of buffer */
 623     const char *encoding,       /* encoding */
 624     const char *errors          /* error handling */
 625     );
 626
 627 /* Encodes a Py_UNICODE buffer of the given size and returns a
 628    Python string object. */
 629
 630 PyAPI_FUNC(PyObject*) PyUnicode_Encode(
 631     const Py_UNICODE *s,        /* Unicode char buffer */
 632     Py_ssize_t size,            /* number of Py_UNICODE chars to encode */
 633     const char *encoding,       /* encoding */
 634     const char *errors          /* error handling */
 635     );
 636
 637 /* Encodes a Unicode object and returns the result as a Python
 638    object. */
 639
 640 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
 641     PyObject *unicode,          /* Unicode object */
 642     const char *encoding,       /* encoding */
 643     const char *errors          /* error handling */
 644     );
 645
 646 /* Encodes a Unicode object and returns the result as a Python string
 647    object. */
 648
 649 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
 650     PyObject *unicode,          /* Unicode object */
 651     const char *encoding,       /* encoding */
 652     const char *errors          /* error handling */
 653     );
 654
 655 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
 656     PyObject* string            /* 256 character map */
 657    );
 658
 659
 660 /* --- UTF-7 Codecs ------------------------------------------------------- */
 661
 662 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
 663     const char *string,         /* UTF-7 encoded string */
 664     Py_ssize_t length,          /* size of string */
 665     const char *errors          /* error handling */
 666     );
 667
 668 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
 669     const Py_UNICODE *data,     /* Unicode char buffer */
 670     Py_ssize_t length,                  /* number of Py_UNICODE chars to encode */
 671     int encodeSetO,             /* force the encoder to encode characters in
 672                                    Set O, as described in RFC2152 */
 673     int encodeWhiteSpace,       /* force the encoder to encode space, tab,
 674                                    carriage return and linefeed characters */
 675     const char *errors          /* error handling */
 676     );
 677
 678 /* --- UTF-8 Codecs ------------------------------------------------------- */
 679
 680 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
 681     const char *string,         /* UTF-8 encoded string */
 682     Py_ssize_t length,          /* size of string */
 683     const char *errors          /* error handling */
 684     );
 685
 686 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
 687     const char *string,         /* UTF-8 encoded string */
 688     Py_ssize_t length,          /* size of string */
 689     const char *errors,         /* error handling */
 690     Py_ssize_t *consumed                /* bytes consumed */
 691     );
 692
 693 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
 694     PyObject *unicode           /* Unicode object */
 695     );
 696
 697 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
 698     const Py_UNICODE *data,     /* Unicode char buffer */
 699     Py_ssize_t length,                  /* number of Py_UNICODE chars to encode */
 700     const char *errors          /* error handling */
 701     );
 702
 703 /* --- UTF-16 Codecs ------------------------------------------------------ */
 704
 705 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
 706    the corresponding Unicode object.
 707
 708    errors (if non-NULL) defines the error handling. It defaults
 709    to "strict".
 710
 711    If byteorder is non-NULL, the decoder starts decoding using the
 712    given byte order:
 713
 714         *byteorder == -1: little endian
 715         *byteorder == 0:  native order
 716         *byteorder == 1:  big endian
 717
 718    In native mode, the first two bytes of the stream are checked for a
 719    BOM mark. If found, the BOM mark is analysed, the byte order
 720    adjusted and the BOM skipped.  In the other modes, no BOM mark
 721    interpretation is done. After completion, *byteorder is set to the
 722    current byte order at the end of input data.
 723
 724    If byteorder is NULL, the codec starts in native order mode.
 725
 726 */
 727
 728 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
 729     const char *string,         /* UTF-16 encoded string */
 730     Py_ssize_t length,          /* size of string */
 731     const char *errors,         /* error handling */
 732     int *byteorder              /* pointer to byteorder to use
 733                                    0=native;-1=LE,1=BE; updated on
 734                                    exit */
 735     );
 736
 737 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
 738     const char *string,         /* UTF-16 encoded string */
 739     Py_ssize_t length,          /* size of string */
 740     const char *errors,         /* error handling */
 741     int *byteorder,             /* pointer to byteorder to use
 742                                    0=native;-1=LE,1=BE; updated on
 743                                    exit */
 744     Py_ssize_t *consumed        /* bytes consumed */
 745     );
 746
 747 /* Returns a Python string using the UTF-16 encoding in native byte
 748    order. The string always starts with a BOM mark.  */
 749
 750 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
 751     PyObject *unicode           /* Unicode object */
 752     );
 753
 754 /* Returns a Python string object holding the UTF-16 encoded value of
 755    the Unicode data.
 756
 757    If byteorder is not 0, output is written according to the following
 758    byte order:
 759
 760    byteorder == -1: little endian
 761    byteorder == 0:  native byte order (writes a BOM mark)
 762    byteorder == 1:  big endian
 763
 764    If byteorder is 0, the output string will always start with the
 765    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
 766    prepended.
 767
 768    Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
 769    UCS-2. This trick makes it possible to add full UTF-16 capabilities
 770    at a later point without compromising the APIs.
 771
 772 */
 773
 774 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF16(
 775     const Py_UNICODE *data,     /* Unicode char buffer */
 776     Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
 777     const char *errors,         /* error handling */
 778     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
 779     );
 780
 781 /* --- Unicode-Escape Codecs ---------------------------------------------- */
 782
 783 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
 784     const char *string,         /* Unicode-Escape encoded string */
 785     Py_ssize_t length,          /* size of string */
 786     const char *errors          /* error handling */
 787     );
 788
 789 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
 790     PyObject *unicode           /* Unicode object */
 791     );
 792
 793 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUnicodeEscape(
 794     const Py_UNICODE *data,     /* Unicode char buffer */
 795     Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
 796     );
 797
 798 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
 799
 800 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
 801     const char *string,         /* Raw-Unicode-Escape encoded string */
 802     Py_ssize_t length,          /* size of string */
 803     const char *errors          /* error handling */
 804     );
 805
 806 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
 807     PyObject *unicode           /* Unicode object */
 808     );
 809
 810 PyAPI_FUNC(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
 811     const Py_UNICODE *data,     /* Unicode char buffer */
 812     Py_ssize_t length           /* Number of Py_UNICODE chars to encode */
 813     );
 814
 815 /* --- Unicode Internal Codec ---------------------------------------------
 816
 817     Only for internal use in _codecsmodule.c */
 818
 819 PyObject *_PyUnicode_DecodeUnicodeInternal(
 820     const char *string,         /* Encoded string */
 821     Py_ssize_t length,          /* size of string */
 822     const char *errors          /* error handling */
 823     );
 824
 825 /* --- Latin-1 Codecs -----------------------------------------------------
 826
 827    Note: Latin-1 corresponds to the first 256 Unicode ordinals.
 828
 829 */
 830
 831 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
 832     const char *string,         /* Latin-1 encoded string */
 833     Py_ssize_t length,          /* size of string */
 834     const char *errors          /* error handling */
 835     );
 836
 837 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
 838     PyObject *unicode           /* Unicode object */
 839     );
 840
 841 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
 842     const Py_UNICODE *data,     /* Unicode char buffer */
 843     Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
 844     const char *errors          /* error handling */
 845     );
 846
 847 /* --- ASCII Codecs -------------------------------------------------------
 848
 849    Only 7-bit ASCII data is accepted. All other codes generate errors.
 850
 851 */
 852
 853 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
 854     const char *string,         /* ASCII encoded string */
 855     Py_ssize_t length,          /* size of string */
 856     const char *errors          /* error handling */
 857     );
 858
 859 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
 860     PyObject *unicode           /* Unicode object */
 861     );
 862
 863 PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
 864     const Py_UNICODE *data,     /* Unicode char buffer */
 865     Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
 866     const char *errors          /* error handling */
 867     );
 868
 869 /* --- Character Map Codecs -----------------------------------------------
 870
 871    This codec uses mappings to encode and decode characters.
 872
 873    Decoding mappings must map single string characters to single
 874    Unicode characters, integers (which are then interpreted as Unicode
 875    ordinals) or None (meaning "undefined mapping" and causing an
 876    error).
 877
 878    Encoding mappings must map single Unicode characters to single
 879    string characters, integers (which are then interpreted as Latin-1
 880    ordinals) or None (meaning "undefined mapping" and causing an
 881    error).
 882
 883    If a character lookup fails with a LookupError, the character is
 884    copied as-is meaning that its ordinal value will be interpreted as
 885    Unicode or Latin-1 ordinal resp. Because of this, mappings only need
 886    to contain those mappings which map characters to different code
 887    points.
 888
 889 */
 890
 891 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
 892     const char *string,         /* Encoded string */
 893     Py_ssize_t length,          /* size of string */
 894     PyObject *mapping,          /* character mapping
 895                                    (char ordinal -> unicode ordinal) */
 896     const char *errors          /* error handling */
 897     );
 898
 899 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
 900     PyObject *unicode,          /* Unicode object */
 901     PyObject *mapping           /* character mapping
 902                                    (unicode ordinal -> char ordinal) */
 903     );
 904
 905 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCharmap(
 906     const Py_UNICODE *data,     /* Unicode char buffer */
 907     Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
 908     PyObject *mapping,          /* character mapping
 909                                    (unicode ordinal -> char ordinal) */
 910     const char *errors          /* error handling */
 911     );
 912
 913 /* Translate a Py_UNICODE buffer of the given length by applying a
 914    character mapping table to it and return the resulting Unicode
 915    object.
 916
 917    The mapping table must map Unicode ordinal integers to Unicode
 918    ordinal integers or None (causing deletion of the character).
 919
 920    Mapping tables may be dictionaries or sequences. Unmapped character
 921    ordinals (ones which cause a LookupError) are left untouched and
 922    are copied as-is.
 923
 924 */
 925
 926 PyAPI_FUNC(PyObject *) PyUnicode_TranslateCharmap(
 927     const Py_UNICODE *data,     /* Unicode char buffer */
 928     Py_ssize_t length,          /* Number of Py_UNICODE chars to translate */
 929     PyObject *table,            /* Translate table */
 930     const char *errors          /* error handling */
 931     );
 932
 933 #ifdef MS_WIN32
 934
 935 /* --- MBCS codecs for Windows -------------------------------------------- */
 936
 937 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
 938     const char *string,         /* MBCS encoded string */
 939     Py_ssize_t length,          /* size of string */
 940     const char *errors          /* error handling */
 941     );
 942
 943 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
 944     const char *string,         /* MBCS encoded string */
 945     Py_ssize_t length,          /* size of string */
 946     const char *errors,         /* error handling */
 947     Py_ssize_t *consumed        /* bytes consumed */
 948     );
 949
 950 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
 951     PyObject *unicode           /* Unicode object */
 952     );
 953
 954 PyAPI_FUNC(PyObject*) PyUnicode_EncodeMBCS(
 955     const Py_UNICODE *data,     /* Unicode char buffer */
 956     Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
 957     const char *errors          /* error handling */
 958     );
 959
 960 #endif /* MS_WIN32 */
 961
 962 /* --- Decimal Encoder ---------------------------------------------------- */
 963
 964 /* Takes a Unicode string holding a decimal value and writes it into
 965    an output buffer using standard ASCII digit codes.
 966
 967    The output buffer has to provide at least length+1 bytes of storage
 968    area. The output string is 0-terminated.
 969
 970    The encoder converts whitespace to ' ', decimal characters to their
 971    corresponding ASCII digit and all other Latin-1 characters except
 972    \0 as-is. Characters outside this range (Unicode ordinals 1-255)
 973    are treated as errors. This includes embedded NUL characters.
 974
 975    Error handling is defined by the errors argument:
 976
 977       NULL or "strict": raise a ValueError
 978       "ignore": ignore the wrong characters (these are not copied to the
 979                 output buffer)
 980       "replace": replaces illegal characters with '?'
 981
 982    Returns 0 on success, -1 on failure.
 983
 984 */
 985
 986 PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
 987     Py_UNICODE *s,              /* Unicode buffer */
 988     Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
 989     char *output,               /* Output buffer; must have size >= length+1 */
 990     const char *errors          /* error handling */
 991     );
 992
 993 /* --- Methods & Slots ----------------------------------------------------
 994
 995    These are capable of handling Unicode objects and strings on input
 996    (we refer to them as strings in the descriptions) and return
 997    Unicode objects or integers as appropriate. */
 998
 999 /* Concat two strings giving a new Unicode string. */
1000
1001 PyAPI_FUNC(PyObject*) PyUnicode_Concat(
1002     PyObject *left,             /* Left string */
1003     PyObject *right             /* Right string */
1004     );
1005
1006 /* Split a string giving a list of Unicode strings.
1007
1008    If sep is NULL, splitting will be done at all whitespace
1009    substrings. Otherwise, splits occur at the given separator.
1010
1011    At most maxsplit splits will be done. If negative, no limit is set.
1012
1013    Separators are not included in the resulting list.
1014
1015 */
1016
1017 PyAPI_FUNC(PyObject*) PyUnicode_Split(
1018     PyObject *s,                /* String to split */
1019     PyObject *sep,              /* String separator */
1020     Py_ssize_t maxsplit         /* Maxsplit count */
1021     );
1022
1023 /* Split a string at line breaks, giving a list of Unicode strings.
1024
1025    CRLF is considered to be one line break. Line breaks are not
1026    included in the resulting list unless keepends is true. */
1027
1028 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
1029     PyObject *s,                /* String to split */
1030     int keepends                /* If true, line end markers are included */
1031     );
1032
1033 /* Partition a string using a given separator. */
1034
1035 PyAPI_FUNC(PyObject*) PyUnicode_Partition(
1036     PyObject *s,                /* String to partition */
1037     PyObject *sep               /* String separator */
1038     );
1039
1040 /* Partition a string using a given separator, searching from the end of the
1041    string. */
1042
1043 PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
1044     PyObject *s,                /* String to partition */
1045     PyObject *sep               /* String separator */
1046     );
1047
1048 /* Split a string giving a list of Unicode strings.
1049
1050    If sep is NULL, splitting will be done at all whitespace
1051    substrings. Otherwise, splits occur at the given separator.
1052
1053    At most maxsplit splits will be done; if maxsplit is negative, no
1054    limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
1055    the end of the string.
1056
1057    Separators are not included in the resulting list.
1058
1059 */
1060
1061 PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
1062     PyObject *s,                /* String to split */
1063     PyObject *sep,              /* String separator */
1064     Py_ssize_t maxsplit         /* Maxsplit count */
1065     );
1066
1067 /* Translate a string by applying a character mapping table to it and
1068    return the resulting Unicode object.
1069
1070    The mapping table must map Unicode ordinal integers to Unicode
1071    ordinal integers or None (causing deletion of the character).
1072
1073    Mapping tables may be dictionaries or sequences. Unmapped character
1074    ordinals (ones which cause a LookupError) are left untouched and
1075    are copied as-is.
1076
1077 */
1078
1079 PyAPI_FUNC(PyObject *) PyUnicode_Translate(
1080     PyObject *str,              /* String */
1081     PyObject *table,            /* Translate table */
1082     const char *errors          /* error handling */
1083     );
1084
1085 /* Join a sequence of strings using the given separator and return
1086    the resulting Unicode string. */
1087
1088 PyAPI_FUNC(PyObject*) PyUnicode_Join(
1089     PyObject *separator,        /* Separator string */
1090     PyObject *seq               /* Sequence object */
1091     );
1092
1093 /* Return 1 if substr matches str[start:end] at the given tail end, 0
1094    otherwise. */
1095
1096 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
1097     PyObject *str,              /* String */
1098     PyObject *substr,           /* Prefix or Suffix string */
1099     Py_ssize_t start,           /* Start index */
1100     Py_ssize_t end,             /* Stop index */
1101     int direction               /* Tail end: -1 prefix, +1 suffix */
1102     );
1103
1104 /* Return the first position of substr in str[start:end] using the
1105    given search direction or -1 if not found. -2 is returned in case
1106    an error occurred and an exception is set. */
1107
1108 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
1109     PyObject *str,              /* String */
1110     PyObject *substr,           /* Substring to find */
1111     Py_ssize_t start,           /* Start index */
1112     Py_ssize_t end,             /* Stop index */
1113     int direction               /* Find direction: +1 forward, -1 backward */
1114     );
1115
1116 /* Count the number of occurrences of substr in str[start:end]. */
1117
1118 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
1119     PyObject *str,              /* String */
1120     PyObject *substr,           /* Substring to count */
1121     Py_ssize_t start,           /* Start index */
1122     Py_ssize_t end              /* Stop index */
1123     );
1124
1125 /* Replace at most maxcount occurrences of substr in str with replstr
1126    and return the resulting Unicode object. */
1127
1128 PyAPI_FUNC(PyObject *) PyUnicode_Replace(
1129     PyObject *str,              /* String */
1130     PyObject *substr,           /* Substring to find */
1131     PyObject *replstr,          /* Substring to replace */
1132     Py_ssize_t maxcount         /* Max. number of replacements to apply;
1133                                    -1 = all */
1134     );
1135
1136 /* Compare two strings and return -1, 0, 1 for less than, equal,
1137    greater than resp. */
1138
1139 PyAPI_FUNC(int) PyUnicode_Compare(
1140     PyObject *left,             /* Left string */
1141     PyObject *right             /* Right string */
1142     );
1143
1144 /* Rich compare two strings and return one of the following:
1145
1146    - NULL in case an exception was raised
1147    - Py_True or Py_False for successful comparisons
1148    - Py_NotImplemented in case the type combination is unknown
1149
1150    Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in
1151    case the conversion of the arguments to Unicode fails with a
1152    UnicodeDecodeError.
1153
1154    Possible values for op:
1155
1156      Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1157
1158 */
1159
1160 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1161     PyObject *left,             /* Left string */
1162     PyObject *right,            /* Right string */
1163     int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1164     );
1165
1166 /* Apply an argument tuple or dictionary to a format string and return
1167    the resulting Unicode string. */
1168
1169 PyAPI_FUNC(PyObject *) PyUnicode_Format(
1170     PyObject *format,           /* Format string */
1171     PyObject *args              /* Argument tuple or dictionary */
1172     );
1173
1174 /* Check whether element is contained in container and return 1/0
1175    accordingly.
1176
1177    element has to coerce to a one-element Unicode string. -1 is
1178    returned in case of an error. */
1179
1180 PyAPI_FUNC(int) PyUnicode_Contains(
1181     PyObject *container,        /* Container string */
1182     PyObject *element           /* Element string */
1183     );
1184
1185 /* Externally visible for str.strip(unicode) */
1186 PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1187     PyUnicodeObject *self,
1188     int striptype,
1189     PyObject *sepobj
1190     );
1191
1192 /* === Characters Type APIs =============================================== */
1193
1194 /* These should not be used directly. Use the Py_UNICODE_IS* and
1195    Py_UNICODE_TO* macros instead.
1196
1197    These APIs are implemented in Objects/unicodectype.c.
1198
1199 */
1200
1201 PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1202     Py_UNICODE ch       /* Unicode character */
1203     );
1204
1205 PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1206     Py_UNICODE ch       /* Unicode character */
1207     );
1208
1209 PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1210     Py_UNICODE ch       /* Unicode character */
1211     );
1212
1213 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1214     const Py_UNICODE ch         /* Unicode character */
1215     );
1216
1217 PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1218     const Py_UNICODE ch         /* Unicode character */
1219     );
1220
1221 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToLowercase(
1222     Py_UNICODE ch       /* Unicode character */
1223     );
1224
1225 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToUppercase(
1226     Py_UNICODE ch       /* Unicode character */
1227     );
1228
1229 PyAPI_FUNC(Py_UNICODE) _PyUnicode_ToTitlecase(
1230     Py_UNICODE ch       /* Unicode character */
1231     );
1232
1233 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1234     Py_UNICODE ch       /* Unicode character */
1235     );
1236
1237 PyAPI_FUNC(int) _PyUnicode_ToDigit(
1238     Py_UNICODE ch       /* Unicode character */
1239     );
1240
1241 PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1242     Py_UNICODE ch       /* Unicode character */
1243     );
1244
1245 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1246     Py_UNICODE ch       /* Unicode character */
1247     );
1248
1249 PyAPI_FUNC(int) _PyUnicode_IsDigit(
1250     Py_UNICODE ch       /* Unicode character */
1251     );
1252
1253 PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1254     Py_UNICODE ch       /* Unicode character */
1255     );
1256
1257 PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1258     Py_UNICODE ch       /* Unicode character */
1259     );
1260
1261 #ifdef __cplusplus
1262 }
1263 #endif
1264 #endif /* Py_USING_UNICODE */
1265 #endif /* !Py_UNICODEOBJECT_H */