1 /* ------------------------------------------------------------------------
3 unicodedata -- Provides access to the Unicode 4.1 data base.
5 Data was extracted from the Unicode 4.1 UnicodeData.txt file.
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
11 Copyright (c) Corporation for National Research Initiatives.
13 ------------------------------------------------------------------------ */
17 #include "structmember.h"
/* character properties */

/* One record of the compiled Unicode property database.  Most fields are
   small indices into the shared name tables emitted by
   Tools/unicode/makeunicodedata.py. */
typedef struct {
    const unsigned char category;           /* index into
                                               _PyUnicode_CategoryNames */
    const unsigned char combining;          /* combining class value 0 - 255 */
    const unsigned char bidirectional;      /* index into
                                               _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;           /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;   /* index into
                                               _PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord;
/* Describes how one character's properties differ in an older Unicode
   version; used by the "previous version" (ucd_X_Y_Z) objects.  A field
   value of 0xFF means "unchanged"; category_changed == 0 means the
   character was unassigned in the old version. */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;      /* old bidir index, or 0xFF */
    const unsigned char category_changed;   /* old category index, or 0xFF */
    const unsigned char decimal_changed;    /* old decimal value, or 0xFF */
    const int numeric_changed;              /* numeric-value change marker;
                                               see merge_old_version */
} change_record;
40 /* data file generated by Tools/unicode/makeunicodedata.py */
41 #include "unicodedata_db.h"
43 static const _PyUnicode_DatabaseRecord
*
44 _getrecord_ex(Py_UCS4 code
)
50 index
= index1
[(code
>>SHIFT
)];
51 index
= index2
[(index
<<SHIFT
)+(code
&((1<<SHIFT
)-1))];
54 return &_PyUnicode_Database_Records
[index
];
57 static const _PyUnicode_DatabaseRecord
*
58 _getrecord(PyUnicodeObject
* v
)
60 return _getrecord_ex(*PyUnicode_AS_UNICODE(v
));
63 /* ------------- Previous-version API ------------------------------------- */
64 typedef struct previous_version
{
67 const change_record
* (*getrecord
)(Py_UCS4
);
68 Py_UCS4 (*normalization
)(Py_UCS4
);
71 #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
73 static PyMemberDef DB_members
[] = {
74 {"unidata_version", T_STRING
, offsetof(PreviousDBVersion
, name
), READONLY
},
78 /* forward declaration */
79 static PyTypeObject UCD_Type
;
82 new_previous_version(const char*name
, const change_record
* (*getrecord
)(Py_UCS4
),
83 Py_UCS4 (*normalization
)(Py_UCS4
))
85 PreviousDBVersion
*self
;
86 self
= PyObject_New(PreviousDBVersion
, &UCD_Type
);
90 self
->getrecord
= getrecord
;
91 self
->normalization
= normalization
;
92 return (PyObject
*)self
;
95 /* --- Module API --------------------------------------------------------- */
97 PyDoc_STRVAR(unicodedata_decimal__doc__
,
98 "decimal(unichr[, default])\n\
100 Returns the decimal value assigned to the Unicode character unichr\n\
101 as integer. If no such value is defined, default is returned, or, if\n\
102 not given, ValueError is raised.");
105 unicodedata_decimal(PyObject
*self
, PyObject
*args
)
108 PyObject
*defobj
= NULL
;
112 if (!PyArg_ParseTuple(args
, "O!|O:decimal", &PyUnicode_Type
, &v
, &defobj
))
114 if (PyUnicode_GET_SIZE(v
) != 1) {
115 PyErr_SetString(PyExc_TypeError
,
116 "need a single Unicode character as parameter");
121 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
122 if (old
->category_changed
== 0) {
127 else if (old
->decimal_changed
!= 0xFF) {
129 rc
= old
->decimal_changed
;
134 rc
= Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v
));
136 if (defobj
== NULL
) {
137 PyErr_SetString(PyExc_ValueError
,
146 return PyInt_FromLong(rc
);
149 PyDoc_STRVAR(unicodedata_digit__doc__
,
150 "digit(unichr[, default])\n\
152 Returns the digit value assigned to the Unicode character unichr as\n\
153 integer. If no such value is defined, default is returned, or, if\n\
154 not given, ValueError is raised.");
157 unicodedata_digit(PyObject
*self
, PyObject
*args
)
160 PyObject
*defobj
= NULL
;
163 if (!PyArg_ParseTuple(args
, "O!|O:digit", &PyUnicode_Type
, &v
, &defobj
))
165 if (PyUnicode_GET_SIZE(v
) != 1) {
166 PyErr_SetString(PyExc_TypeError
,
167 "need a single Unicode character as parameter");
170 rc
= Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v
));
172 if (defobj
== NULL
) {
173 PyErr_SetString(PyExc_ValueError
, "not a digit");
181 return PyInt_FromLong(rc
);
184 PyDoc_STRVAR(unicodedata_numeric__doc__
,
185 "numeric(unichr[, default])\n\
187 Returns the numeric value assigned to the Unicode character unichr\n\
188 as float. If no such value is defined, default is returned, or, if\n\
189 not given, ValueError is raised.");
192 unicodedata_numeric(PyObject
*self
, PyObject
*args
)
195 PyObject
*defobj
= NULL
;
199 if (!PyArg_ParseTuple(args
, "O!|O:numeric", &PyUnicode_Type
, &v
, &defobj
))
201 if (PyUnicode_GET_SIZE(v
) != 1) {
202 PyErr_SetString(PyExc_TypeError
,
203 "need a single Unicode character as parameter");
208 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
209 if (old
->category_changed
== 0) {
214 else if (old
->decimal_changed
!= 0xFF) {
216 rc
= old
->decimal_changed
;
221 rc
= Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v
));
223 if (defobj
== NULL
) {
224 PyErr_SetString(PyExc_ValueError
, "not a numeric character");
232 return PyFloat_FromDouble(rc
);
235 PyDoc_STRVAR(unicodedata_category__doc__
,
238 Returns the general category assigned to the Unicode character\n\
242 unicodedata_category(PyObject
*self
, PyObject
*args
)
247 if (!PyArg_ParseTuple(args
, "O!:category",
248 &PyUnicode_Type
, &v
))
250 if (PyUnicode_GET_SIZE(v
) != 1) {
251 PyErr_SetString(PyExc_TypeError
,
252 "need a single Unicode character as parameter");
255 index
= (int) _getrecord(v
)->category
;
257 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
258 if (old
->category_changed
!= 0xFF)
259 index
= old
->category_changed
;
261 return PyString_FromString(_PyUnicode_CategoryNames
[index
]);
264 PyDoc_STRVAR(unicodedata_bidirectional__doc__
,
265 "bidirectional(unichr)\n\
267 Returns the bidirectional category assigned to the Unicode character\n\
268 unichr as string. If no such value is defined, an empty string is\n\
272 unicodedata_bidirectional(PyObject
*self
, PyObject
*args
)
277 if (!PyArg_ParseTuple(args
, "O!:bidirectional",
278 &PyUnicode_Type
, &v
))
280 if (PyUnicode_GET_SIZE(v
) != 1) {
281 PyErr_SetString(PyExc_TypeError
,
282 "need a single Unicode character as parameter");
285 index
= (int) _getrecord(v
)->bidirectional
;
287 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
288 if (old
->category_changed
== 0)
289 index
= 0; /* unassigned */
290 else if (old
->bidir_changed
!= 0xFF)
291 index
= old
->bidir_changed
;
293 return PyString_FromString(_PyUnicode_BidirectionalNames
[index
]);
296 PyDoc_STRVAR(unicodedata_combining__doc__
,
297 "combining(unichr)\n\
299 Returns the canonical combining class assigned to the Unicode\n\
300 character unichr as integer. Returns 0 if no combining class is\n\
304 unicodedata_combining(PyObject
*self
, PyObject
*args
)
309 if (!PyArg_ParseTuple(args
, "O!:combining",
310 &PyUnicode_Type
, &v
))
312 if (PyUnicode_GET_SIZE(v
) != 1) {
313 PyErr_SetString(PyExc_TypeError
,
314 "need a single Unicode character as parameter");
317 index
= (int) _getrecord(v
)->combining
;
319 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
320 if (old
->category_changed
== 0)
321 index
= 0; /* unassigned */
323 return PyInt_FromLong(index
);
326 PyDoc_STRVAR(unicodedata_mirrored__doc__
,
329 Returns the mirrored property assigned to the Unicode character\n\
330 unichr as integer. Returns 1 if the character has been identified as\n\
331 a \"mirrored\" character in bidirectional text, 0 otherwise.");
334 unicodedata_mirrored(PyObject
*self
, PyObject
*args
)
339 if (!PyArg_ParseTuple(args
, "O!:mirrored",
340 &PyUnicode_Type
, &v
))
342 if (PyUnicode_GET_SIZE(v
) != 1) {
343 PyErr_SetString(PyExc_TypeError
,
344 "need a single Unicode character as parameter");
347 index
= (int) _getrecord(v
)->mirrored
;
349 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
350 if (old
->category_changed
== 0)
351 index
= 0; /* unassigned */
353 return PyInt_FromLong(index
);
356 PyDoc_STRVAR(unicodedata_east_asian_width__doc__
,
357 "east_asian_width(unichr)\n\
359 Returns the east asian width assigned to the Unicode character\n\
363 unicodedata_east_asian_width(PyObject
*self
, PyObject
*args
)
368 if (!PyArg_ParseTuple(args
, "O!:east_asian_width",
369 &PyUnicode_Type
, &v
))
371 if (PyUnicode_GET_SIZE(v
) != 1) {
372 PyErr_SetString(PyExc_TypeError
,
373 "need a single Unicode character as parameter");
376 index
= (int) _getrecord(v
)->east_asian_width
;
378 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
379 if (old
->category_changed
== 0)
380 index
= 0; /* unassigned */
382 return PyString_FromString(_PyUnicode_EastAsianWidthNames
[index
]);
385 PyDoc_STRVAR(unicodedata_decomposition__doc__
,
386 "decomposition(unichr)\n\
388 Returns the character decomposition mapping assigned to the Unicode\n\
389 character unichr as string. An empty string is returned in case no\n\
390 such mapping is defined.");
393 unicodedata_decomposition(PyObject
*self
, PyObject
*args
)
397 int code
, index
, count
, i
;
398 unsigned int prefix_index
;
400 if (!PyArg_ParseTuple(args
, "O!:decomposition",
401 &PyUnicode_Type
, &v
))
403 if (PyUnicode_GET_SIZE(v
) != 1) {
404 PyErr_SetString(PyExc_TypeError
,
405 "need a single Unicode character as parameter");
409 code
= (int) *PyUnicode_AS_UNICODE(v
);
412 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
413 if (old
->category_changed
== 0)
414 return PyString_FromString(""); /* unassigned */
417 if (code
< 0 || code
>= 0x110000)
420 index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
421 index
= decomp_index2
[(index
<<DECOMP_SHIFT
)+
422 (code
&((1<<DECOMP_SHIFT
)-1))];
425 /* high byte is number of hex bytes (usually one or two), low byte
426 is prefix code (from*/
427 count
= decomp_data
[index
] >> 8;
429 /* XXX: could allocate the PyString up front instead
430 (strlen(prefix) + 5 * count + 1 bytes) */
432 /* Based on how index is calculated above and decomp_data is generated
433 from Tools/unicode/makeunicodedata.py, it should not be possible
434 to overflow decomp_prefix. */
435 prefix_index
= decomp_data
[index
] & 255;
436 assert(prefix_index
< (sizeof(decomp_prefix
)/sizeof(*decomp_prefix
)));
439 i
= strlen(decomp_prefix
[prefix_index
]);
440 memcpy(decomp
, decomp_prefix
[prefix_index
], i
);
442 while (count
-- > 0) {
445 assert((size_t)i
< sizeof(decomp
));
446 PyOS_snprintf(decomp
+ i
, sizeof(decomp
) - i
, "%04X",
447 decomp_data
[++index
]);
448 i
+= strlen(decomp
+ i
);
453 return PyString_FromString(decomp
);
457 get_decomp_record(PyObject
*self
, Py_UCS4 code
, int *index
, int *prefix
, int *count
)
459 if (code
>= 0x110000) {
461 } else if (self
&& get_old_record(self
, code
)->category_changed
==0) {
462 /* unassigned in old version */
466 *index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
467 *index
= decomp_index2
[(*index
<<DECOMP_SHIFT
)+
468 (code
&((1<<DECOMP_SHIFT
)-1))];
471 /* high byte is number of hex bytes (usually one or two), low byte
472 is prefix code (from*/
473 *count
= decomp_data
[*index
] >> 8;
474 *prefix
= decomp_data
[*index
] & 255;
/* Hangul syllable composition constants (Unicode chapter 3,
   "Conjoining Jamo Behavior"). */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
490 nfd_nfkd(PyObject
*self
, PyObject
*input
, int k
)
493 Py_UNICODE
*i
, *end
, *o
;
494 /* Longest decomposition in Unicode 3.2: U+FDFA */
495 Py_UNICODE stack
[20];
496 Py_ssize_t space
, isize
;
497 int index
, prefix
, count
, stackptr
;
498 unsigned char prev
, cur
;
501 isize
= PyUnicode_GET_SIZE(input
);
502 /* Overallocate atmost 10 characters. */
503 space
= (isize
> 10 ? 10 : isize
) + isize
;
504 result
= PyUnicode_FromUnicode(NULL
, space
);
507 i
= PyUnicode_AS_UNICODE(input
);
509 o
= PyUnicode_AS_UNICODE(result
);
512 stack
[stackptr
++] = *i
++;
514 Py_UNICODE code
= stack
[--stackptr
];
515 /* Hangul Decomposition adds three characters in
516 a single step, so we need atleast that much room. */
518 Py_ssize_t newsize
= PyString_GET_SIZE(result
) + 10;
520 if (PyUnicode_Resize(&result
, newsize
) == -1)
522 o
= PyUnicode_AS_UNICODE(result
) + newsize
- space
;
524 /* Hangul Decomposition. */
525 if (SBase
<= code
&& code
< (SBase
+SCount
)) {
526 int SIndex
= code
- SBase
;
527 int L
= LBase
+ SIndex
/ NCount
;
528 int V
= VBase
+ (SIndex
% NCount
) / TCount
;
529 int T
= TBase
+ SIndex
% TCount
;
539 /* normalization changes */
541 Py_UCS4 value
= ((PreviousDBVersion
*)self
)->normalization(code
);
543 stack
[stackptr
++] = value
;
548 /* Other decompositions. */
549 get_decomp_record(self
, code
, &index
, &prefix
, &count
);
551 /* Copy character if it is not decomposable, or has a
552 compatibility decomposition, but we do NFD. */
553 if (!count
|| (prefix
&& !k
)) {
558 /* Copy decomposition onto the stack, in reverse
561 code
= decomp_data
[index
+ (--count
)];
562 stack
[stackptr
++] = code
;
567 /* Drop overallocation. Cannot fail. */
568 PyUnicode_Resize(&result
, PyUnicode_GET_SIZE(result
) - space
);
570 /* Sort canonically. */
571 i
= PyUnicode_AS_UNICODE(result
);
572 prev
= _getrecord_ex(*i
)->combining
;
573 end
= i
+ PyUnicode_GET_SIZE(result
);
574 for (i
++; i
< end
; i
++) {
575 cur
= _getrecord_ex(*i
)->combining
;
576 if (prev
== 0 || cur
== 0 || prev
<= cur
) {
580 /* Non-canonical order. Need to switch *i with previous. */
583 Py_UNICODE tmp
= o
[1];
587 if (o
< PyUnicode_AS_UNICODE(result
))
589 prev
= _getrecord_ex(*o
)->combining
;
590 if (prev
== 0 || prev
<= cur
)
593 prev
= _getrecord_ex(*i
)->combining
;
599 find_nfc_index(PyObject
*self
, struct reindex
* nfc
, Py_UNICODE code
)
602 for (index
= 0; nfc
[index
].start
; index
++) {
603 int start
= nfc
[index
].start
;
606 if (code
<= start
+ nfc
[index
].count
) {
607 int delta
= code
- start
;
608 return nfc
[index
].index
+ delta
;
615 nfc_nfkc(PyObject
*self
, PyObject
*input
, int k
)
618 Py_UNICODE
*i
, *i1
, *o
, *end
;
619 int f
,l
,index
,index1
,comb
;
621 Py_UNICODE
*skipped
[20];
624 result
= nfd_nfkd(self
, input
, k
);
628 /* We are going to modify result in-place.
629 If nfd_nfkd is changed to sometimes return the input,
630 this code needs to be reviewed. */
631 assert(result
!= input
);
633 i
= PyUnicode_AS_UNICODE(result
);
634 end
= i
+ PyUnicode_GET_SIZE(result
);
635 o
= PyUnicode_AS_UNICODE(result
);
639 for (index
= 0; index
< cskipped
; index
++) {
640 if (skipped
[index
] == i
) {
641 /* *i character is skipped.
643 skipped
[index
] = skipped
[cskipped
-1];
646 goto again
; /* continue while */
649 /* Hangul Composition. We don't need to check for <LV,T>
650 pairs, since we always have decomposed data. */
651 if (LBase
<= *i
&& *i
< (LBase
+LCount
) &&
653 VBase
<= i
[1] && i
[1] <= (VBase
+VCount
)) {
655 LIndex
= i
[0] - LBase
;
656 VIndex
= i
[1] - VBase
;
657 code
= SBase
+ (LIndex
*VCount
+VIndex
)*TCount
;
660 TBase
<= *i
&& *i
<= (TBase
+TCount
)) {
668 f
= find_nfc_index(self
, nfc_first
, *i
);
673 /* Find next unblocked character. */
677 int comb1
= _getrecord_ex(*i1
)->combining
;
678 if (comb1
&& comb
== comb1
) {
679 /* Character is blocked. */
683 l
= find_nfc_index(self
, nfc_last
, *i1
);
684 /* *i1 cannot be combined with *i. If *i1
685 is a starter, we don't need to look further.
686 Otherwise, record the combining class. */
695 index
= f
*TOTAL_LAST
+ l
;
696 index1
= comp_index
[index
>> COMP_SHIFT
];
697 code
= comp_data
[(index1
<<COMP_SHIFT
)+
698 (index
&((1<<COMP_SHIFT
)-1))];
702 /* Replace the original character. */
704 /* Mark the second character unused. */
705 skipped
[cskipped
++] = i1
;
707 f
= find_nfc_index(self
, nfc_first
, *i
);
714 PyUnicode_Resize(&result
, o
- PyUnicode_AS_UNICODE(result
));
718 PyDoc_STRVAR(unicodedata_normalize__doc__
,
719 "normalize(form, unistr)\n\
721 Return the normal form 'form' for the Unicode string unistr. Valid\n\
722 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
725 unicodedata_normalize(PyObject
*self
, PyObject
*args
)
730 if(!PyArg_ParseTuple(args
, "sO!:normalize",
731 &form
, &PyUnicode_Type
, &input
))
734 if (PyUnicode_GetSize(input
) == 0) {
735 /* Special case empty input strings, since resizing
736 them later would cause internal errors. */
741 if (strcmp(form
, "NFC") == 0)
742 return nfc_nfkc(self
, input
, 0);
743 if (strcmp(form
, "NFKC") == 0)
744 return nfc_nfkc(self
, input
, 1);
745 if (strcmp(form
, "NFD") == 0)
746 return nfd_nfkd(self
, input
, 0);
747 if (strcmp(form
, "NFKD") == 0)
748 return nfd_nfkd(self
, input
, 1);
749 PyErr_SetString(PyExc_ValueError
, "invalid normalization form");
753 /* -------------------------------------------------------------------- */
754 /* unicode character name tables */
756 /* data file generated by Tools/unicode/makeunicodedata.py */
757 #include "unicodename_db.h"
759 /* -------------------------------------------------------------------- */
760 /* database code (cut and pasted from the unidb package) */
/* Case-insensitive multiplicative hash of s[0..len-1], kept within
   24 bits by folding overflow back in; must match the hash used by
   makeunicodedata.py to build the name hash table. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
/* Jamo short names, indexed [jamo][column] with column 0 = leading
   consonant (L), 1 = vowel (V), 2 = trailing consonant (T); used to
   build and parse "HANGUL SYLLABLE ..." names. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N", },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
809 is_unified_ideograph(Py_UCS4 code
)
812 (0x3400 <= code
&& code
<= 0x4DB5) || /* CJK Ideograph Extension A */
813 (0x4E00 <= code
&& code
<= 0x9FBB) || /* CJK Ideograph */
814 (0x20000 <= code
&& code
<= 0x2A6D6));/* CJK Ideograph Extension B */
818 _getucname(PyObject
*self
, Py_UCS4 code
, char* buffer
, int buflen
)
825 if (code
>= 0x110000)
829 const change_record
*old
= get_old_record(self
, code
);
830 if (old
->category_changed
== 0) {
836 if (SBase
<= code
&& code
< SBase
+SCount
) {
837 /* Hangul syllable. */
838 int SIndex
= code
- SBase
;
839 int L
= SIndex
/ NCount
;
840 int V
= (SIndex
% NCount
) / TCount
;
841 int T
= SIndex
% TCount
;
844 /* Worst case: HANGUL SYLLABLE <10chars>. */
846 strcpy(buffer
, "HANGUL SYLLABLE ");
848 strcpy(buffer
, hangul_syllables
[L
][0]);
849 buffer
+= strlen(hangul_syllables
[L
][0]);
850 strcpy(buffer
, hangul_syllables
[V
][1]);
851 buffer
+= strlen(hangul_syllables
[V
][1]);
852 strcpy(buffer
, hangul_syllables
[T
][2]);
853 buffer
+= strlen(hangul_syllables
[T
][2]);
858 if (is_unified_ideograph(code
)) {
860 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
862 sprintf(buffer
, "CJK UNIFIED IDEOGRAPH-%X", code
);
866 /* get offset into phrasebook */
867 offset
= phrasebook_offset1
[(code
>>phrasebook_shift
)];
868 offset
= phrasebook_offset2
[(offset
<<phrasebook_shift
) +
869 (code
&((1<<phrasebook_shift
)-1))];
877 word
= phrasebook
[offset
] - phrasebook_short
;
879 word
= (word
<< 8) + phrasebook
[offset
+1];
882 word
= phrasebook
[offset
++];
885 return 0; /* buffer overflow */
888 /* copy word string from lexicon. the last character in the
889 word has bit 7 set. the last word in a string ends with
891 w
= lexicon
+ lexicon_offset
[word
];
894 return 0; /* buffer overflow */
898 return 0; /* buffer overflow */
899 buffer
[i
++] = *w
& 127;
901 break; /* end of word */
908 _cmpname(PyObject
*self
, int code
, const char* name
, int namelen
)
910 /* check if code corresponds to the given name */
912 char buffer
[NAME_MAXLEN
];
913 if (!_getucname(self
, code
, buffer
, sizeof(buffer
)))
915 for (i
= 0; i
< namelen
; i
++) {
916 if (toupper(Py_CHARMASK(name
[i
])) != buffer
[i
])
919 return buffer
[namelen
] == '\0';
923 find_syllable(const char *str
, int *len
, int *pos
, int count
, int column
)
927 for (i
= 0; i
< count
; i
++) {
928 char *s
= hangul_syllables
[i
][column
];
932 if (strncmp(str
, s
, len1
) == 0) {
943 _getcode(PyObject
* self
, const char* name
, int namelen
, Py_UCS4
* code
)
946 unsigned int mask
= code_size
-1;
947 unsigned int i
, incr
;
949 /* Check for hangul syllables. */
950 if (strncmp(name
, "HANGUL SYLLABLE ", 16) == 0) {
951 int len
, L
= -1, V
= -1, T
= -1;
952 const char *pos
= name
+ 16;
953 find_syllable(pos
, &len
, &L
, LCount
, 0);
955 find_syllable(pos
, &len
, &V
, VCount
, 1);
957 find_syllable(pos
, &len
, &T
, TCount
, 2);
959 if (L
!= -1 && V
!= -1 && T
!= -1 && pos
-name
== namelen
) {
960 *code
= SBase
+ (L
*VCount
+V
)*TCount
+ T
;
963 /* Otherwise, it's an illegal syllable name. */
967 /* Check for unified ideographs. */
968 if (strncmp(name
, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
969 /* Four or five hexdigits must follow. */
973 if (namelen
!= 4 && namelen
!= 5)
977 if (*name
>= '0' && *name
<= '9')
979 else if (*name
>= 'A' && *name
<= 'F')
980 v
+= *name
- 'A' + 10;
985 if (!is_unified_ideograph(v
))
991 /* the following is the same as python's dictionary lookup, with
992 only minor changes. see the makeunicodedata script for more
995 h
= (unsigned int) _gethash(name
, namelen
, code_magic
);
1000 if (_cmpname(self
, v
, name
, namelen
)) {
1004 incr
= (h
^ (h
>> 3)) & mask
;
1008 i
= (i
+ incr
) & mask
;
1012 if (_cmpname(self
, v
, name
, namelen
)) {
1018 incr
= incr
^ code_poly
;
1022 static const _PyUnicode_Name_CAPI hashAPI
=
1024 sizeof(_PyUnicode_Name_CAPI
),
1029 /* -------------------------------------------------------------------- */
1030 /* Python bindings */
1032 PyDoc_STRVAR(unicodedata_name__doc__
,
1033 "name(unichr[, default])\n\
1034 Returns the name assigned to the Unicode character unichr as a\n\
1035 string. If no name is defined, default is returned, or, if not\n\
1036 given, ValueError is raised.");
1039 unicodedata_name(PyObject
* self
, PyObject
* args
)
1041 char name
[NAME_MAXLEN
];
1044 PyObject
* defobj
= NULL
;
1045 if (!PyArg_ParseTuple(args
, "O!|O:name", &PyUnicode_Type
, &v
, &defobj
))
1048 if (PyUnicode_GET_SIZE(v
) != 1) {
1049 PyErr_SetString(PyExc_TypeError
,
1050 "need a single Unicode character as parameter");
1054 if (!_getucname(self
, (Py_UCS4
) *PyUnicode_AS_UNICODE(v
),
1055 name
, sizeof(name
))) {
1056 if (defobj
== NULL
) {
1057 PyErr_SetString(PyExc_ValueError
, "no such name");
1066 return Py_BuildValue("s", name
);
1069 PyDoc_STRVAR(unicodedata_lookup__doc__
,
1072 Look up character by name. If a character with the\n\
1073 given name is found, return the corresponding Unicode\n\
1074 character. If not found, KeyError is raised.");
1077 unicodedata_lookup(PyObject
* self
, PyObject
* args
)
1084 if (!PyArg_ParseTuple(args
, "s#:lookup", &name
, &namelen
))
1087 if (!_getcode(self
, name
, namelen
, &code
)) {
1088 PyErr_Format(PyExc_KeyError
, "undefined character name '%s'",
1093 #ifndef Py_UNICODE_WIDE
1094 if (code
>= 0x10000) {
1095 str
[0] = 0xd800 + ((code
- 0x10000) >> 10);
1096 str
[1] = 0xdc00 + ((code
- 0x10000) & 0x3ff);
1097 return PyUnicode_FromUnicode(str
, 2);
1100 str
[0] = (Py_UNICODE
) code
;
1101 return PyUnicode_FromUnicode(str
, 1);
1104 /* XXX Add doc strings. */
1106 static PyMethodDef unicodedata_functions
[] = {
1107 {"decimal", unicodedata_decimal
, METH_VARARGS
, unicodedata_decimal__doc__
},
1108 {"digit", unicodedata_digit
, METH_VARARGS
, unicodedata_digit__doc__
},
1109 {"numeric", unicodedata_numeric
, METH_VARARGS
, unicodedata_numeric__doc__
},
1110 {"category", unicodedata_category
, METH_VARARGS
,
1111 unicodedata_category__doc__
},
1112 {"bidirectional", unicodedata_bidirectional
, METH_VARARGS
,
1113 unicodedata_bidirectional__doc__
},
1114 {"combining", unicodedata_combining
, METH_VARARGS
,
1115 unicodedata_combining__doc__
},
1116 {"mirrored", unicodedata_mirrored
, METH_VARARGS
,
1117 unicodedata_mirrored__doc__
},
1118 {"east_asian_width", unicodedata_east_asian_width
, METH_VARARGS
,
1119 unicodedata_east_asian_width__doc__
},
1120 {"decomposition", unicodedata_decomposition
, METH_VARARGS
,
1121 unicodedata_decomposition__doc__
},
1122 {"name", unicodedata_name
, METH_VARARGS
, unicodedata_name__doc__
},
1123 {"lookup", unicodedata_lookup
, METH_VARARGS
, unicodedata_lookup__doc__
},
1124 {"normalize", unicodedata_normalize
, METH_VARARGS
,
1125 unicodedata_normalize__doc__
},
1126 {NULL
, NULL
} /* sentinel */
1129 static PyTypeObject UCD_Type
= {
1130 /* The ob_type field must be initialized in the module init function
1131 * to be portable to Windows without using C++. */
1132 PyVarObject_HEAD_INIT(NULL
, 0)
1133 "unicodedata.UCD", /*tp_name*/
1134 sizeof(PreviousDBVersion
), /*tp_basicsize*/
1137 (destructor
)PyObject_Del
, /*tp_dealloc*/
1144 0, /*tp_as_sequence*/
1145 0, /*tp_as_mapping*/
1149 PyObject_GenericGetAttr
,/*tp_getattro*/
1152 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
1156 0, /*tp_richcompare*/
1157 0, /*tp_weaklistoffset*/
1160 unicodedata_functions
, /*tp_methods*/
1161 DB_members
, /*tp_members*/
1167 0, /*tp_dictoffset*/
1175 PyDoc_STRVAR(unicodedata_docstring
,
1176 "This module provides access to the Unicode Character Database which\n\
1177 defines character properties for all Unicode characters. The data in\n\
1178 this database is based on the UnicodeData.txt file version\n\
1179 4.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
1181 The module uses the same names and symbols as defined by the\n\
1182 UnicodeData File Format 4.1.0 (see\n\
1183 http://www.unicode.org/Public/4.1.0/ucd/UCD.html).");
1186 initunicodedata(void)
1190 Py_Type(&UCD_Type
) = &PyType_Type
;
1193 "unicodedata", unicodedata_functions
, unicodedata_docstring
);
1197 PyModule_AddStringConstant(m
, "unidata_version", UNIDATA_VERSION
);
1198 Py_INCREF(&UCD_Type
);
1199 PyModule_AddObject(m
, "UCD", (PyObject
*)&UCD_Type
);
1201 /* Previous versions */
1202 v
= new_previous_version("3.2.0", get_change_3_2_0
, normalization_3_2_0
);
1204 PyModule_AddObject(m
, "ucd_3_2_0", v
);
1207 v
= PyCObject_FromVoidPtr((void *) &hashAPI
, NULL
);
1209 PyModule_AddObject(m
, "ucnhash_CAPI", v
);
1215 indent-tabs-mode: nil