/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 4.1 data base.

   Data was extracted from the Unicode 4.1 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"

/* character properties */

typedef struct {
    const unsigned char category;         /* index into
                                             _PyUnicode_CategoryNames */
    const unsigned char combining;        /* combining class value 0 - 255 */
    const unsigned char bidirectional;    /* index into
                                             _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;         /* true if mirrored in bidir mode */
    const unsigned char east_asian_width; /* index into
                                             _PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord;
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const int numeric_changed;
} change_record;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"
static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}
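
/* Illustrative note (editorial, not part of the original source): the two
   lookup tables form a two-level trie.  index1 maps the high bits of the
   code point to a block number, and index2 maps (block, low bits) to the
   index of the shared property record.  With the SHIFT value emitted by
   Tools/unicode/makeunicodedata.py, a code point c is resolved roughly as

       block  = index1[c >> SHIFT];
       record = index2[(block << SHIFT) + (c & ((1 << SHIFT) - 1))];

   so characters with identical properties can share a single record. */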
/* ------------- Previous-version API ------------------------------------- */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;
    const change_record* (*getrecord)(Py_UCS4);
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;

#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))

static PyMemberDef DB_members[] = {
    {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};

/* forward declaration */
static PyTypeObject UCD_Type;
static PyObject*
new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *self;
    self = PyObject_New(PreviousDBVersion, &UCD_Type);
    if (self == NULL)
        return NULL;
    self->name = name;
    self->getrecord = getrecord;
    self->normalization = normalization;
    return (PyObject*)self;
}
static Py_UCS4 getuchar(PyUnicodeObject *obj)
{
    Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);

    if (PyUnicode_GET_SIZE(obj) == 1)
        return *v;
#ifndef Py_UNICODE_WIDE
    else if ((PyUnicode_GET_SIZE(obj) == 2) &&
             (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
             (0xDC00 <= v[1] && v[1] <= 0xDFFF))
        return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
#endif
    PyErr_SetString(PyExc_TypeError,
                    "need a single Unicode character as parameter");
    return (Py_UCS4)-1;
}
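
/* Worked example (editorial): on a narrow (UTF-16) build, a character
   outside the BMP arrives as a surrogate pair.  For U+1D11E (MUSICAL
   SYMBOL G CLEF) the pair is 0xD834, 0xDD1E, and the expression above
   recombines it as ((0xD834 & 0x3FF) << 10 | (0xDD1E & 0x3FF)) + 0x10000
   = (0x34 << 10 | 0x11E) + 0x10000 = 0x1D11E. */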
/* --- Module API --------------------------------------------------------- */

PyDoc_STRVAR(unicodedata_decimal__doc__,
"decimal(unichr[, default])\n\
\n\
Returns the decimal value assigned to the Unicode character unichr\n\
as integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}
PyDoc_STRVAR(unicodedata_digit__doc__,
"digit(unichr[, default])\n\
\n\
Returns the digit value assigned to the Unicode character unichr as\n\
integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    rc = Py_UNICODE_TODIGIT(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a digit");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}
PyDoc_STRVAR(unicodedata_numeric__doc__,
"numeric(unichr[, default])\n\
\n\
Returns the numeric value assigned to the Unicode character unichr\n\
as float. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    double rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1.0;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(c);
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}
PyDoc_STRVAR(unicodedata_category__doc__,
"category(unichr)\n\
\n\
Returns the general category assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:category",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->category;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed != 0xFF)
            index = old->category_changed;
    }
    return PyString_FromString(_PyUnicode_CategoryNames[index]);
}
PyDoc_STRVAR(unicodedata_bidirectional__doc__,
"bidirectional(unichr)\n\
\n\
Returns the bidirectional category assigned to the Unicode character\n\
unichr as string. If no such value is defined, an empty string is\n\
returned.");

static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->bidirectional;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            index = old->bidir_changed;
    }
    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
}
PyDoc_STRVAR(unicodedata_combining__doc__,
"combining(unichr)\n\
\n\
Returns the canonical combining class assigned to the Unicode\n\
character unichr as integer. Returns 0 if no combining class is\n\
defined.");

static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->combining;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}
PyDoc_STRVAR(unicodedata_mirrored__doc__,
"mirrored(unichr)\n\
\n\
Returns the mirrored property assigned to the Unicode character\n\
unichr as integer. Returns 1 if the character has been identified as\n\
a \"mirrored\" character in bidirectional text, 0 otherwise.");

static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->mirrored;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}
PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
"east_asian_width(unichr)\n\
\n\
Returns the east asian width assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->east_asian_width;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
}
PyDoc_STRVAR(unicodedata_decomposition__doc__,
"decomposition(unichr)\n\
\n\
Returns the character decomposition mapping assigned to the Unicode\n\
character unichr as string. An empty string is returned in case no\n\
such mapping is defined.");

static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count, i;
    unsigned int prefix_index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    code = (int)c;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyString_FromString(""); /* unassigned */
    }

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                              (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));

    /* copy prefix */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert((size_t)i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);
}
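
/* Worked example (editorial): for U+00BD VULGAR FRACTION ONE HALF the
   packed entry yields count == 3 and a prefix index that points at
   "<fraction>", so the function returns the string
   "<fraction> 0031 2044 0032".  For a canonical decomposition such as
   U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE the prefix is empty and
   the result is "0041 030A". */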
static void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    } else if (self && get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    (*index)++;
}
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
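
/* Worked example (editorial): the arithmetic decomposition of a
   precomposed Hangul syllable follows directly from these constants.
   For U+AC01 (HANGUL SYLLABLE GAG), SIndex = 0xAC01 - SBase = 1, so the
   leading consonant is LBase + SIndex/NCount = 0x1100, the vowel is
   VBase + (SIndex % NCount)/TCount = 0x1161, and the trailing consonant
   is TBase + SIndex % TCount = 0x11A8. */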
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *end, *o;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UNICODE stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_SIZE(input);
    /* Overallocate at most 10 characters. */
    space = (isize > 10 ? 10 : isize) + isize;
    result = PyUnicode_FromUnicode(NULL, space);
    if (!result)
        return NULL;
    i = PyUnicode_AS_UNICODE(input);
    end = i + isize;
    o = PyUnicode_AS_UNICODE(result);

    while (i < end) {
        stack[stackptr++] = *i++;
        while(stackptr) {
            Py_UNICODE code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
                space += 10;
                if (PyUnicode_Resize(&result, newsize) == -1)
                    return NULL;
                o = PyUnicode_AS_UNICODE(result) + newsize - space;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                *o++ = L;
                *o++ = V;
                space -= 2;
                if (T != TBase) {
                    *o++ = T;
                    space--;
                }
                continue;
            }
            /* normalization changes */
            if (self) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                *o++ = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    /* Drop overallocation. Cannot fail. */
    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);

    /* Sort canonically. */
    i = PyUnicode_AS_UNICODE(result);
    prev = _getrecord_ex(*i)->combining;
    end = i + PyUnicode_GET_SIZE(result);
    for (i++; i < end; i++) {
        cur = _getrecord_ex(*i)->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UNICODE tmp = o[1];
            o[1] = o[0];
            o[0] = tmp;
            o--;
            if (o < PyUnicode_AS_UNICODE(result))
                break;
            prev = _getrecord_ex(*o)->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(*i)->combining;
    }
    return result;
}
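
/* Worked example (editorial): the trailing pass is an insertion sort by
   canonical combining class, and it never reorders across a starter
   (combining class 0).  Given the decomposed sequence
   U+0071 U+0301 U+0323 (q, COMBINING ACUTE ACCENT with class 230,
   COMBINING DOT BELOW with class 220) the two marks are swapped so the
   lower class comes first: U+0071 U+0323 U+0301. */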
static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
{
    int index;
    for (index = 0; nfc[index].start; index++) {
        int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
    int f,l,index,index1,comb;
    Py_UNICODE code;
    Py_UNICODE *skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;

    /* We are going to modify result in-place.
       If nfd_nfkd is changed to sometimes return the input,
       this code needs to be reviewed. */
    assert(result != input);

    i = PyUnicode_AS_UNICODE(result);
    end = i + PyUnicode_GET_SIZE(result);
    o = PyUnicode_AS_UNICODE(result);

  again:
    while (i < end) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        if (LBase <= *i && *i < (LBase+LCount) &&
            i + 1 < end &&
            VBase <= i[1] && i[1] <= (VBase+VCount)) {
            int LIndex, VIndex;
            LIndex = i[0] - LBase;
            VIndex = i[1] - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            if (i < end &&
                TBase <= *i && *i <= (TBase+TCount)) {
                code += *i-TBase;
                i++;
            }
            *o++ = code;
            continue;
        }

        f = find_nfc_index(self, nfc_first, *i);
        if (f == -1) {
            *o++ = *i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        while (i1 < end) {
            int comb1 = _getrecord_ex(*i1)->combining;
            if (comb1 && comb == comb1) {
                /* Character is blocked. */
                i1++;
                continue;
            }
            l = find_nfc_index(self, nfc_last, *i1);
            /* *i1 cannot be combined with *i. If *i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            *i = code;
            /* Mark the second character unused. */
            skipped[cskipped++] = i1;
            i1++;
            f = find_nfc_index(self, nfc_first, *i);
            if (f == -1)
                break;
        }
        *o++ = *i++;
    }
    if (o != end)
        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
    return result;
}
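
/* Worked example (editorial): Hangul composition is the inverse of the
   arithmetic decomposition above.  Given the jamo sequence
   U+1100 U+1161 U+11A8, LIndex = 0 and VIndex = 0, so
   code = SBase + (0*VCount + 0)*TCount = 0xAC00, and the trailing jamo
   adds 0x11A8 - TBase = 1, producing U+AC01 (HANGUL SYLLABLE GAG). */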
PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
Return the normal form 'form' for the Unicode string unistr. Valid\n\
values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");

static PyObject*
unicodedata_normalize(PyObject *self, PyObject *args)
{
    char *form;
    PyObject *input;

    if(!PyArg_ParseTuple(args, "sO!:normalize",
                         &form, &PyUnicode_Type, &input))
        return NULL;

    if (PyUnicode_GetSize(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    if (strcmp(form, "NFC") == 0)
        return nfc_nfkc(self, input, 0);
    if (strcmp(form, "NFKC") == 0)
        return nfc_nfkc(self, input, 1);
    if (strcmp(form, "NFD") == 0)
        return nfd_nfkd(self, input, 0);
    if (strcmp(form, "NFKD") == 0)
        return nfd_nfkd(self, input, 1);
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}
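
/* Editorial sketch (not part of the original module): one way to drive
   normalize() from other C code, using only public CPython 2.x APIs.
   Error handling is abbreviated; the function name is hypothetical, but
   the module and method names are the real ones defined in this file. */
#if 0
static PyObject *
normalize_nfc_example(PyObject *text)
{
    PyObject *mod, *res;
    mod = PyImport_ImportModule("unicodedata");
    if (mod == NULL)
        return NULL;
    /* Equivalent to unicodedata.normalize('NFC', text) in Python. */
    res = PyObject_CallMethod(mod, "normalize", "sO", "NFC", text);
    Py_DECREF(mod);
    return res;
}
#endif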
/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
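
/* Editorial note: the three columns are the romanized jamo names for the
   19 leading consonants, 21 vowels and 28 trailing consonants (including
   the empty trailing jamo), indexed in the same order as LCount, VCount
   and TCount above.  _getucname() below concatenates one entry from each
   column, so U+AC01 becomes "HANGUL SYLLABLE " + "G" + "A" + "G" =
   "HANGUL SYLLABLE GAG". */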
static int
is_unified_ideograph(Py_UCS4 code)
{
    return (
        (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
}
static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    if (self) {
        const change_record *old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon. the last character in the
           word has bit 7 set. the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}
static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
    if (!_getucname(self, code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (toupper(Py_CHARMASK(name[i])) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}
static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        char *s = hangul_syllables[i][column];
        len1 = strlen(s);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1) {
        *len = 0;
    }
}
static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen)) {
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}
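
/* Editorial note: code_hash is a fixed open-addressing table generated by
   makeunicodedata.py.  A lookup hashes the (upper-cased) name with
   _gethash(), probes slot (~h) & mask first, and on a miss keeps stepping
   by a perturbed increment.  Because every probed candidate is verified
   with _cmpname() against the real name from _getucname(), a hash
   collision can never return the wrong character, only cost extra probes;
   an empty slot (value 0) means the name does not exist. */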
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
/* -------------------------------------------------------------------- */
/* Python bindings */

PyDoc_STRVAR(unicodedata_name__doc__,
"name(unichr[, default])\n\
Returns the name assigned to the Unicode character unichr as a\n\
string. If no name is defined, default is returned, or, if not\n\
given, ValueError is raised.");

static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char name[NAME_MAXLEN];
    Py_UCS4 c;

    PyUnicodeObject* v;
    PyObject* defobj = NULL;
    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
        return NULL;

    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (!_getucname(self, c, name, sizeof(name))) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }

    return Py_BuildValue("s", name);
}
PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name.  If a character with the\n\
given name is found, return the corresponding Unicode\n\
character.  If not found, KeyError is raised.");

static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[2];

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(self, name, namelen, &code)) {
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
                     name);
        return NULL;
    }

#ifndef Py_UNICODE_WIDE
    if (code >= 0x10000) {
        str[0] = 0xd800 + ((code - 0x10000) >> 10);
        str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
        return PyUnicode_FromUnicode(str, 2);
    }
#endif
    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}
/* XXX Add doc strings. */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};
static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyVarObject_HEAD_INIT(NULL, 0)
    "unicodedata.UCD",          /*tp_name*/
    sizeof(PreviousDBVersion),  /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del,   /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_compare*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    PyObject_GenericGetAttr,    /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    unicodedata_functions,      /*tp_methods*/
    DB_members,                 /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
4.1.0 which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 4.1.0 (see\n\
http://www.unicode.org/Public/4.1.0/ucd/UCD.html).");
PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    Py_TYPE(&UCD_Type) = &PyType_Type;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    Py_INCREF(&UCD_Type);
    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "ucd_3_2_0", v);

    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
}
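
/* Editorial sketch (not part of the original module): other C code can
   pick up the exported name-lookup API through the "ucnhash_CAPI"
   CObject; the \N{...} escape support in the core consumes it this way.
   The function name is hypothetical, the member signatures follow the
   hashAPI initializer above, and error handling is abbreviated. */
#if 0
static int
lookup_by_name_example(const char *name, Py_UCS4 *code)
{
    _PyUnicode_Name_CAPI *api;
    api = (_PyUnicode_Name_CAPI *)PyCObject_Import("unicodedata",
                                                   "ucnhash_CAPI");
    if (api == NULL)
        return 0;
    /* Passing NULL as self consults the current Unicode version. */
    return api->getcode(NULL, name, (int)strlen(name), code);
}
#endif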
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/