Modules/unicodedata.c

   1 /* ------------------------------------------------------------------------
   2
   3    unicodedata -- Provides access to the Unicode 3.2 data base.
   4
   5    Data was extracted from the Unicode 3.2 UnicodeData.txt file.
   6
   7    Written by Marc-Andre Lemburg (mal@lemburg.com).
   8    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   9    Modified by Martin v. Löwis (martin@v.loewis.de)
  10
  11    Copyright (c) Corporation for National Research Initiatives.
  12
  13    ------------------------------------------------------------------------ */
  14
  15 #include "Python.h"
  16 #include "ucnhash.h"
  17
  18 /* character properties */
  19
  20 typedef struct {
  21     const unsigned char category;       /* index into
  22                                            _PyUnicode_CategoryNames */
  23     const unsigned char combining;      /* combining class value 0 - 255 */
  24     const unsigned char bidirectional;  /* index into
  25                                            _PyUnicode_BidirectionalNames */
  26     const unsigned char mirrored;       /* true if mirrored in bidir mode */
  27     const unsigned char east_asian_width;       /* index into
  28                                                    _PyUnicode_EastAsianWidth */
  29 } _PyUnicode_DatabaseRecord;
  30
  31 /* data file generated by Tools/unicode/makeunicodedata.py */
  32 #include "unicodedata_db.h"
  33
  34 static const _PyUnicode_DatabaseRecord*
  35 _getrecord_ex(Py_UCS4 code)
  36 {
  37     int index;
  38     if (code >= 0x110000)
  39         index = 0;
  40     else {
  41         index = index1[(code>>SHIFT)];
  42         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
  43     }
  44
  45     return &_PyUnicode_Database_Records[index];
  46 }
  47
  48 static const _PyUnicode_DatabaseRecord*
  49 _getrecord(PyUnicodeObject* v)
  50 {
  51     return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
  52 }
  53
  54 /* --- Module API --------------------------------------------------------- */
  55
  56 PyDoc_STRVAR(unicodedata_decimal__doc__,
  57 "decimal(unichr[, default])\n\
  58 \n\
  59 Returns the decimal value assigned to the Unicode character unichr\n\
  60 as integer. If no such value is defined, default is returned, or, if\n\
  61 not given, ValueError is raised.");
  62
  63 static PyObject *
  64 unicodedata_decimal(PyObject *self, PyObject *args)
  65 {
  66     PyUnicodeObject *v;
  67     PyObject *defobj = NULL;
  68     long rc;
  69
  70     if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
  71         return NULL;
  72     if (PyUnicode_GET_SIZE(v) != 1) {
  73         PyErr_SetString(PyExc_TypeError,
  74                         "need a single Unicode character as parameter");
  75         return NULL;
  76     }
  77     rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
  78     if (rc < 0) {
  79         if (defobj == NULL) {
  80             PyErr_SetString(PyExc_ValueError,
  81                             "not a decimal");
  82             return NULL;
  83         }
  84         else {
  85             Py_INCREF(defobj);
  86             return defobj;
  87         }
  88     }
  89     return PyInt_FromLong(rc);
  90 }
  91
  92 PyDoc_STRVAR(unicodedata_digit__doc__,
  93 "digit(unichr[, default])\n\
  94 \n\
  95 Returns the digit value assigned to the Unicode character unichr as\n\
  96 integer. If no such value is defined, default is returned, or, if\n\
  97 not given, ValueError is raised.");
  98
  99 static PyObject *
 100 unicodedata_digit(PyObject *self, PyObject *args)
 101 {
 102     PyUnicodeObject *v;
 103     PyObject *defobj = NULL;
 104     long rc;
 105
 106     if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
 107         return NULL;
 108     if (PyUnicode_GET_SIZE(v) != 1) {
 109         PyErr_SetString(PyExc_TypeError,
 110                         "need a single Unicode character as parameter");
 111         return NULL;
 112     }
 113     rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
 114     if (rc < 0) {
 115         if (defobj == NULL) {
 116             PyErr_SetString(PyExc_ValueError, "not a digit");
 117             return NULL;
 118         }
 119         else {
 120             Py_INCREF(defobj);
 121             return defobj;
 122         }
 123     }
 124     return PyInt_FromLong(rc);
 125 }
 126
 127 PyDoc_STRVAR(unicodedata_numeric__doc__,
 128 "numeric(unichr[, default])\n\
 129 \n\
 130 Returns the numeric value assigned to the Unicode character unichr\n\
 131 as float. If no such value is defined, default is returned, or, if\n\
 132 not given, ValueError is raised.");
 133
 134 static PyObject *
 135 unicodedata_numeric(PyObject *self, PyObject *args)
 136 {
 137     PyUnicodeObject *v;
 138     PyObject *defobj = NULL;
 139     double rc;
 140
 141     if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
 142         return NULL;
 143     if (PyUnicode_GET_SIZE(v) != 1) {
 144         PyErr_SetString(PyExc_TypeError,
 145                         "need a single Unicode character as parameter");
 146         return NULL;
 147     }
 148     rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
 149     if (rc < 0) {
 150         if (defobj == NULL) {
 151             PyErr_SetString(PyExc_ValueError, "not a numeric character");
 152             return NULL;
 153         }
 154         else {
 155             Py_INCREF(defobj);
 156             return defobj;
 157         }
 158     }
 159     return PyFloat_FromDouble(rc);
 160 }
 161
 162 PyDoc_STRVAR(unicodedata_category__doc__,
 163 "category(unichr)\n\
 164 \n\
 165 Returns the general category assigned to the Unicode character\n\
 166 unichr as string.");
 167
 168 static PyObject *
 169 unicodedata_category(PyObject *self, PyObject *args)
 170 {
 171     PyUnicodeObject *v;
 172     int index;
 173
 174     if (!PyArg_ParseTuple(args, "O!:category",
 175                           &PyUnicode_Type, &v))
 176         return NULL;
 177     if (PyUnicode_GET_SIZE(v) != 1) {
 178         PyErr_SetString(PyExc_TypeError,
 179                         "need a single Unicode character as parameter");
 180         return NULL;
 181     }
 182     index = (int) _getrecord(v)->category;
 183     return PyString_FromString(_PyUnicode_CategoryNames[index]);
 184 }
 185
 186 PyDoc_STRVAR(unicodedata_bidirectional__doc__,
 187 "bidirectional(unichr)\n\
 188 \n\
 189 Returns the bidirectional category assigned to the Unicode character\n\
 190 unichr as string. If no such value is defined, an empty string is\n\
 191 returned.");
 192
 193 static PyObject *
 194 unicodedata_bidirectional(PyObject *self, PyObject *args)
 195 {
 196     PyUnicodeObject *v;
 197     int index;
 198
 199     if (!PyArg_ParseTuple(args, "O!:bidirectional",
 200                           &PyUnicode_Type, &v))
 201         return NULL;
 202     if (PyUnicode_GET_SIZE(v) != 1) {
 203         PyErr_SetString(PyExc_TypeError,
 204                         "need a single Unicode character as parameter");
 205         return NULL;
 206     }
 207     index = (int) _getrecord(v)->bidirectional;
 208     return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
 209 }
 210
 211 PyDoc_STRVAR(unicodedata_combining__doc__,
 212 "combining(unichr)\n\
 213 \n\
 214 Returns the canonical combining class assigned to the Unicode\n\
 215 character unichr as integer. Returns 0 if no combining class is\n\
 216 defined.");
 217
 218 static PyObject *
 219 unicodedata_combining(PyObject *self, PyObject *args)
 220 {
 221     PyUnicodeObject *v;
 222
 223     if (!PyArg_ParseTuple(args, "O!:combining",
 224                           &PyUnicode_Type, &v))
 225         return NULL;
 226     if (PyUnicode_GET_SIZE(v) != 1) {
 227         PyErr_SetString(PyExc_TypeError,
 228                         "need a single Unicode character as parameter");
 229         return NULL;
 230     }
 231     return PyInt_FromLong((int) _getrecord(v)->combining);
 232 }
 233
 234 PyDoc_STRVAR(unicodedata_mirrored__doc__,
 235 "mirrored(unichr)\n\
 236 \n\
 237 Returns the mirrored property assigned to the Unicode character\n\
 238 unichr as integer. Returns 1 if the character has been identified as\n\
 239 a \"mirrored\" character in bidirectional text, 0 otherwise.");
 240
 241 static PyObject *
 242 unicodedata_mirrored(PyObject *self, PyObject *args)
 243 {
 244     PyUnicodeObject *v;
 245
 246     if (!PyArg_ParseTuple(args, "O!:mirrored",
 247                           &PyUnicode_Type, &v))
 248         return NULL;
 249     if (PyUnicode_GET_SIZE(v) != 1) {
 250         PyErr_SetString(PyExc_TypeError,
 251                         "need a single Unicode character as parameter");
 252         return NULL;
 253     }
 254     return PyInt_FromLong((int) _getrecord(v)->mirrored);
 255 }
 256
 257 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
 258 "east_asian_width(unichr)\n\
 259 \n\
 260 Returns the east asian width assigned to the Unicode character\n\
 261 unichr as string.");
 262
 263 static PyObject *
 264 unicodedata_east_asian_width(PyObject *self, PyObject *args)
 265 {
 266     PyUnicodeObject *v;
 267     int index;
 268
 269     if (!PyArg_ParseTuple(args, "O!:east_asian_width",
 270                           &PyUnicode_Type, &v))
 271         return NULL;
 272     if (PyUnicode_GET_SIZE(v) != 1) {
 273         PyErr_SetString(PyExc_TypeError,
 274                         "need a single Unicode character as parameter");
 275         return NULL;
 276     }
 277     index = (int) _getrecord(v)->east_asian_width;
 278     return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
 279 }
 280
 281 PyDoc_STRVAR(unicodedata_decomposition__doc__,
 282 "decomposition(unichr)\n\
 283 \n\
 284 Returns the character decomposition mapping assigned to the Unicode\n\
 285 character unichr as string. An empty string is returned in case no\n\
 286 such mapping is defined.");
 287
 288 static PyObject *
 289 unicodedata_decomposition(PyObject *self, PyObject *args)
 290 {
 291     PyUnicodeObject *v;
 292     char decomp[256];
 293     int code, index, count, i;
 294
 295     if (!PyArg_ParseTuple(args, "O!:decomposition",
 296                           &PyUnicode_Type, &v))
 297         return NULL;
 298     if (PyUnicode_GET_SIZE(v) != 1) {
 299         PyErr_SetString(PyExc_TypeError,
 300                         "need a single Unicode character as parameter");
 301         return NULL;
 302     }
 303
 304     code = (int) *PyUnicode_AS_UNICODE(v);
 305
 306     if (code < 0 || code >= 0x110000)
 307         index = 0;
 308     else {
 309         index = decomp_index1[(code>>DECOMP_SHIFT)];
 310         index = decomp_index2[(index<<DECOMP_SHIFT)+
 311                              (code&((1<<DECOMP_SHIFT)-1))];
 312     }
 313
 314     /* high byte is number of hex bytes (usually one or two), low byte
 315        is prefix code (from*/
 316     count = decomp_data[index] >> 8;
 317
 318     /* XXX: could allocate the PyString up front instead
 319        (strlen(prefix) + 5 * count + 1 bytes) */
 320
 321     /* copy prefix */
 322     i = strlen(decomp_prefix[decomp_data[index] & 255]);
 323     memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
 324
 325     while (count-- > 0) {
 326         if (i)
 327             decomp[i++] = ' ';
 328         assert((size_t)i < sizeof(decomp));
 329         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
 330                       decomp_data[++index]);
 331         i += strlen(decomp + i);
 332     }
 333
 334     decomp[i] = '\0';
 335
 336     return PyString_FromString(decomp);
 337 }
 338
 339 void
 340 get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
 341 {
 342     if (code >= 0x110000) {
 343         *index = 0;
 344     }
 345     else {
 346         *index = decomp_index1[(code>>DECOMP_SHIFT)];
 347         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
 348                                (code&((1<<DECOMP_SHIFT)-1))];
 349     }
 350
 351     /* high byte is number of hex bytes (usually one or two), low byte
 352        is prefix code (from*/
 353     *count = decomp_data[*index] >> 8;
 354     *prefix = decomp_data[*index] & 255;
 355
 356     (*index)++;
 357 }
 358
 359 #define SBase   0xAC00
 360 #define LBase   0x1100
 361 #define VBase   0x1161
 362 #define TBase   0x11A7
 363 #define LCount  19
 364 #define VCount  21
 365 #define TCount  28
 366 #define NCount  (VCount*TCount)
 367 #define SCount  (LCount*NCount)
 368
 369 static PyObject*
 370 nfd_nfkd(PyObject *input, int k)
 371 {
 372     PyObject *result;
 373     Py_UNICODE *i, *end, *o;
 374     /* Longest decomposition in Unicode 3.2: U+FDFA */
 375     Py_UNICODE stack[20];
 376     int space, stackptr, isize;
 377     int index, prefix, count;
 378     unsigned char prev, cur;
 379
 380     stackptr = 0;
 381     isize = PyUnicode_GET_SIZE(input);
 382     /* Overallocate atmost 10 characters. */
 383     space = (isize > 10 ? 10 : isize) + isize;
 384     result = PyUnicode_FromUnicode(NULL, space);
 385     if (!result)
 386         return NULL;
 387     i = PyUnicode_AS_UNICODE(input);
 388     end = i + isize;
 389     o = PyUnicode_AS_UNICODE(result);
 390
 391     while (i < end) {
 392         stack[stackptr++] = *i++;
 393         while(stackptr) {
 394             Py_UNICODE code = stack[--stackptr];
 395             /* Hangul Decomposition adds three characters in
 396                a single step, so we need atleast that much room. */
 397             if (space < 3) {
 398                 int newsize = PyString_GET_SIZE(result) + 10;
 399                 space += 10;
 400                 if (PyUnicode_Resize(&result, newsize) == -1)
 401                     return NULL;
 402                 o = PyUnicode_AS_UNICODE(result) + newsize - space;
 403             }
 404             /* Hangul Decomposition. */
 405             if (SBase <= code && code < (SBase+SCount)) {
 406                 int SIndex = code - SBase;
 407                 int L = LBase + SIndex / NCount;
 408                 int V = VBase + (SIndex % NCount) / TCount;
 409                 int T = TBase + SIndex % TCount;
 410                 *o++ = L;
 411                 *o++ = V;
 412                 space -= 2;
 413                 if (T != TBase) {
 414                     *o++ = T;
 415                     space --;
 416                 }
 417                 continue;
 418             }
 419             /* Other decompoistions. */
 420             get_decomp_record(code, &index, &prefix, &count);
 421
 422             /* Copy character if it is not decomposable, or has a
 423                compatibility decomposition, but we do NFD. */
 424             if (!count || (prefix && !k)) {
 425                 *o++ = code;
 426                 space--;
 427                 continue;
 428             }
 429             /* Copy decomposition onto the stack, in reverse
 430                order.  */
 431             while(count) {
 432                 code = decomp_data[index + (--count)];
 433                 stack[stackptr++] = code;
 434             }
 435         }
 436     }
 437
 438     /* Drop overallocation. Cannot fail. */
 439     PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
 440
 441     /* Sort canonically. */
 442     i = PyUnicode_AS_UNICODE(result);
 443     prev = _getrecord_ex(*i)->combining;
 444     end = i + PyUnicode_GET_SIZE(result);
 445     for (i++; i < end; i++) {
 446         cur = _getrecord_ex(*i)->combining;
 447         if (prev == 0 || cur == 0 || prev <= cur) {
 448             prev = cur;
 449             continue;
 450         }
 451         /* Non-canonical order. Need to switch *i with previous. */
 452         o = i - 1;
 453         while (1) {
 454             Py_UNICODE tmp = o[1];
 455             o[1] = o[0];
 456             o[0] = tmp;
 457             o--;
 458             if (o < PyUnicode_AS_UNICODE(result))
 459                 break;
 460             prev = _getrecord_ex(*o)->combining;
 461             if (prev == 0 || prev <= cur)
 462                 break;
 463         }
 464         prev = _getrecord_ex(*i)->combining;
 465     }
 466     return result;
 467 }
 468
 469 static int
 470 find_nfc_index(struct reindex* nfc, Py_UNICODE code)
 471 {
 472     int index;
 473     for (index = 0; nfc[index].start; index++) {
 474         int start = nfc[index].start;
 475         if (code < start)
 476             return -1;
 477         if (code <= start + nfc[index].count) {
 478             int delta = code - start;
 479             return nfc[index].index + delta;
 480         }
 481     }
 482     return -1;
 483 }
 484
 485 static PyObject*
 486 nfc_nfkc(PyObject *input, int k)
 487 {
 488     PyObject *result;
 489     Py_UNICODE *i, *i1, *o, *end;
 490     int f,l,index,index1,comb;
 491     Py_UNICODE code;
 492     Py_UNICODE *skipped[20];
 493     int cskipped = 0;
 494
 495     result = nfd_nfkd(input, k);
 496     if (!result)
 497         return NULL;
 498
 499     /* We are going to modify result in-place.
 500        If nfd_nfkd is changed to sometimes return the input,
 501        this code needs to be reviewed. */
 502     assert(result != input);
 503
 504     i = PyUnicode_AS_UNICODE(result);
 505     end = i + PyUnicode_GET_SIZE(result);
 506     o = PyUnicode_AS_UNICODE(result);
 507
 508   again:
 509     while (i < end) {
 510       for (index = 0; index < cskipped; index++) {
 511           if (skipped[index] == i) {
 512               /* *i character is skipped.
 513                  Remove from list. */
 514               skipped[index] = skipped[cskipped-1];
 515               cskipped--;
 516               i++;
 517               goto again; /* continue while */
 518           }
 519       }
 520       /* Hangul Composition. We don't need to check for <LV,T>
 521          pairs, since we always have decomposed data. */
 522       if (LBase <= *i && *i < (LBase+LCount) &&
 523           i + 1 < end &&
 524           VBase <= i[1] && i[1] <= (VBase+VCount)) {
 525           int LIndex, VIndex;
 526           LIndex = i[0] - LBase;
 527           VIndex = i[1] - VBase;
 528           code = SBase + (LIndex*VCount+VIndex)*TCount;
 529           i+=2;
 530           if (i < end &&
 531               TBase <= *i && *i <= (TBase+TCount)) {
 532               code += *i-TBase;
 533               i++;
 534           }
 535           *o++ = code;
 536           continue;
 537       }
 538
 539       f = find_nfc_index(nfc_first, *i);
 540       if (f == -1) {
 541           *o++ = *i++;
 542           continue;
 543       }
 544       /* Find next unblocked character. */
 545       i1 = i+1;
 546       comb = 0;
 547       while (i1 < end) {
 548           int comb1 = _getrecord_ex(*i1)->combining;
 549           if (comb1 && comb == comb1) {
 550               /* Character is blocked. */
 551               i1++;
 552               continue;
 553           }
 554           l = find_nfc_index(nfc_last, *i1);
 555           /* *i1 cannot be combined with *i. If *i1
 556              is a starter, we don't need to look further.
 557              Otherwise, record the combining class. */
 558           if (l == -1) {
 559             not_combinable:
 560               if (comb1 == 0)
 561                   break;
 562               comb = comb1;
 563               i1++;
 564               continue;
 565           }
 566           index = f*TOTAL_LAST + l;
 567           index1 = comp_index[index >> COMP_SHIFT];
 568           code = comp_data[(index1<<COMP_SHIFT)+
 569                            (index&((1<<COMP_SHIFT)-1))];
 570           if (code == 0)
 571               goto not_combinable;
 572
 573           /* Replace the original character. */
 574           *i = code;
 575           /* Mark the second character unused. */
 576           skipped[cskipped++] = i1;
 577           i1++;
 578           f = find_nfc_index(nfc_first, *i);
 579           if (f == -1)
 580               break;
 581       }
 582       *o++ = *i++;
 583     }
 584     if (o != end)
 585         PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
 586     return result;
 587 }
 588
 589 PyDoc_STRVAR(unicodedata_normalize__doc__,
 590 "normalize(form, unistr)\n\
 591 \n\
 592 Return the normal form 'form' for the Unicode string unistr.  Valid\n\
 593 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
 594
 595 static PyObject*
 596 unicodedata_normalize(PyObject *self, PyObject *args)
 597 {
 598     char *form;
 599     PyObject *input;
 600
 601     if(!PyArg_ParseTuple(args, "sO!:normalize",
 602                          &form, &PyUnicode_Type, &input))
 603         return NULL;
 604
 605     if (PyUnicode_GetSize(input) == 0) {
 606         /* Special case empty input strings, since resizing
 607            them  later would cause internal errors. */
 608         Py_INCREF(input);
 609         return input;
 610     }
 611
 612     if (strcmp(form, "NFC") == 0)
 613         return nfc_nfkc(input, 0);
 614     if (strcmp(form, "NFKC") == 0)
 615         return nfc_nfkc(input, 1);
 616     if (strcmp(form, "NFD") == 0)
 617         return nfd_nfkd(input, 0);
 618     if (strcmp(form, "NFKD") == 0)
 619         return nfd_nfkd(input, 1);
 620     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
 621     return NULL;
 622 }
 623
 624 /* -------------------------------------------------------------------- */
 625 /* unicode character name tables */
 626
 627 /* data file generated by Tools/unicode/makeunicodedata.py */
 628 #include "unicodename_db.h"
 629
 630 /* -------------------------------------------------------------------- */
 631 /* database code (cut and pasted from the unidb package) */
 632
 633 static unsigned long
 634 _gethash(const char *s, int len, int scale)
 635 {
 636     int i;
 637     unsigned long h = 0;
 638     unsigned long ix;
 639     for (i = 0; i < len; i++) {
 640         h = (h * scale) + (unsigned char) toupper(s[i]);
 641         ix = h & 0xff000000;
 642         if (ix)
 643             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
 644     }
 645     return h;
 646 }
 647
 648 static char *hangul_syllables[][3] = {
 649     { "G",  "A",   ""   },
 650     { "GG", "AE",  "G"  },
 651     { "N",  "YA",  "GG" },
 652     { "D",  "YAE", "GS" },
 653     { "DD", "EO",  "N", },
 654     { "R",  "E",   "NJ" },
 655     { "M",  "YEO", "NH" },
 656     { "B",  "YE",  "D"  },
 657     { "BB", "O",   "L"  },
 658     { "S",  "WA",  "LG" },
 659     { "SS", "WAE", "LM" },
 660     { "",   "OE",  "LB" },
 661     { "J",  "YO",  "LS" },
 662     { "JJ", "U",   "LT" },
 663     { "C",  "WEO", "LP" },
 664     { "K",  "WE",  "LH" },
 665     { "T",  "WI",  "M"  },
 666     { "P",  "YU",  "B"  },
 667     { "H",  "EU",  "BS" },
 668     { 0,    "YI",  "S"  },
 669     { 0,    "I",   "SS" },
 670     { 0,    0,     "NG" },
 671     { 0,    0,     "J"  },
 672     { 0,    0,     "C"  },
 673     { 0,    0,     "K"  },
 674     { 0,    0,     "T"  },
 675     { 0,    0,     "P"  },
 676     { 0,    0,     "H"  }
 677 };
 678
 679 static int
 680 is_unified_ideograph(Py_UCS4 code)
 681 {
 682     return (
 683         (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
 684         (0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
 685         (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
 686 }
 687
 688 static int
 689 _getucname(Py_UCS4 code, char* buffer, int buflen)
 690 {
 691     int offset;
 692     int i;
 693     int word;
 694     unsigned char* w;
 695
 696     if (SBase <= code && code < SBase+SCount) {
 697         /* Hangul syllable. */
 698         int SIndex = code - SBase;
 699         int L = SIndex / NCount;
 700         int V = (SIndex % NCount) / TCount;
 701         int T = SIndex % TCount;
 702
 703         if (buflen < 27)
 704             /* Worst case: HANGUL SYLLABLE <10chars>. */
 705             return 0;
 706         strcpy(buffer, "HANGUL SYLLABLE ");
 707         buffer += 16;
 708         strcpy(buffer, hangul_syllables[L][0]);
 709         buffer += strlen(hangul_syllables[L][0]);
 710         strcpy(buffer, hangul_syllables[V][1]);
 711         buffer += strlen(hangul_syllables[V][1]);
 712         strcpy(buffer, hangul_syllables[T][2]);
 713         buffer += strlen(hangul_syllables[T][2]);
 714         *buffer = '\0';
 715         return 1;
 716     }
 717
 718     if (is_unified_ideograph(code)) {
 719         if (buflen < 28)
 720             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
 721             return 0;
 722         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
 723         return 1;
 724     }
 725
 726     if (code >= 0x110000)
 727         return 0;
 728
 729     /* get offset into phrasebook */
 730     offset = phrasebook_offset1[(code>>phrasebook_shift)];
 731     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
 732                                (code&((1<<phrasebook_shift)-1))];
 733     if (!offset)
 734         return 0;
 735
 736     i = 0;
 737
 738     for (;;) {
 739         /* get word index */
 740         word = phrasebook[offset] - phrasebook_short;
 741         if (word >= 0) {
 742             word = (word << 8) + phrasebook[offset+1];
 743             offset += 2;
 744         } else
 745             word = phrasebook[offset++];
 746         if (i) {
 747             if (i > buflen)
 748                 return 0; /* buffer overflow */
 749             buffer[i++] = ' ';
 750         }
 751         /* copy word string from lexicon.  the last character in the
 752            word has bit 7 set.  the last word in a string ends with
 753            0x80 */
 754         w = lexicon + lexicon_offset[word];
 755         while (*w < 128) {
 756             if (i >= buflen)
 757                 return 0; /* buffer overflow */
 758             buffer[i++] = *w++;
 759         }
 760         if (i >= buflen)
 761             return 0; /* buffer overflow */
 762         buffer[i++] = *w & 127;
 763         if (*w == 128)
 764             break; /* end of word */
 765     }
 766
 767     return 1;
 768 }
 769
 770 static int
 771 _cmpname(int code, const char* name, int namelen)
 772 {
 773     /* check if code corresponds to the given name */
 774     int i;
 775     char buffer[NAME_MAXLEN];
 776     if (!_getucname(code, buffer, sizeof(buffer)))
 777         return 0;
 778     for (i = 0; i < namelen; i++) {
 779         if (toupper(name[i]) != buffer[i])
 780             return 0;
 781     }
 782     return buffer[namelen] == '\0';
 783 }
 784
 785 static void
 786 find_syllable(const char *str, int *len, int *pos, int count, int column)
 787 {
 788     int i, len1;
 789     *len = -1;
 790     for (i = 0; i < count; i++) {
 791         char *s = hangul_syllables[i][column];
 792         len1 = strlen(s);
 793         if (len1 <= *len)
 794             continue;
 795         if (strncmp(str, s, len1) == 0) {
 796             *len = len1;
 797             *pos = i;
 798         }
 799     }
 800     if (*len == -1) {
 801         *len = 0;
 802         *pos = -1;
 803     }
 804 }
 805
 806 static int
 807 _getcode(const char* name, int namelen, Py_UCS4* code)
 808 {
 809     unsigned int h, v;
 810     unsigned int mask = code_size-1;
 811     unsigned int i, incr;
 812
 813     /* Check for hangul syllables. */
 814     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
 815         int L, V, T, len;
 816         const char *pos = name + 16;
 817         find_syllable(pos, &len, &L, LCount, 0);
 818         pos += len;
 819         find_syllable(pos, &len, &V, VCount, 1);
 820         pos += len;
 821         find_syllable(pos, &len, &T, TCount, 2);
 822         pos += len;
 823         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
 824             *code = SBase + (L*VCount+V)*TCount + T;
 825             return 1;
 826         }
 827         /* Otherwise, it's an illegal syllable name. */
 828         return 0;
 829     }
 830
 831     /* Check for unified ideographs. */
 832     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
 833         /* Four or five hexdigits must follow. */
 834         v = 0;
 835         name += 22;
 836         namelen -= 22;
 837         if (namelen != 4 && namelen != 5)
 838             return 0;
 839         while (namelen--) {
 840             v *= 16;
 841             if (*name >= '0' && *name <= '9')
 842                 v += *name - '0';
 843             else if (*name >= 'A' && *name <= 'F')
 844                 v += *name - 'A' + 10;
 845             else
 846                 return 0;
 847             name++;
 848         }
 849         if (!is_unified_ideograph(v))
 850             return 0;
 851         *code = v;
 852         return 1;
 853     }
 854
 855     /* the following is the same as python's dictionary lookup, with
 856        only minor changes.  see the makeunicodedata script for more
 857        details */
 858
 859     h = (unsigned int) _gethash(name, namelen, code_magic);
 860     i = (~h) & mask;
 861     v = code_hash[i];
 862     if (!v)
 863         return 0;
 864     if (_cmpname(v, name, namelen)) {
 865         *code = v;
 866         return 1;
 867     }
 868     incr = (h ^ (h >> 3)) & mask;
 869     if (!incr)
 870         incr = mask;
 871     for (;;) {
 872         i = (i + incr) & mask;
 873         v = code_hash[i];
 874         if (!v)
 875             return 0;
 876         if (_cmpname(v, name, namelen)) {
 877             *code = v;
 878             return 1;
 879         }
 880         incr = incr << 1;
 881         if (incr > mask)
 882             incr = incr ^ code_poly;
 883     }
 884 }
 885
 886 static const _PyUnicode_Name_CAPI hashAPI =
 887 {
 888     sizeof(_PyUnicode_Name_CAPI),
 889     _getucname,
 890     _getcode
 891 };
 892
 893 /* -------------------------------------------------------------------- */
 894 /* Python bindings */
 895
 896 PyDoc_STRVAR(unicodedata_name__doc__,
 897 "name(unichr[, default])\n\
 898 Returns the name assigned to the Unicode character unichr as a\n\
 899 string. If no name is defined, default is returned, or, if not\n\
 900 given, ValueError is raised.");
 901
 902 static PyObject *
 903 unicodedata_name(PyObject* self, PyObject* args)
 904 {
 905     char name[NAME_MAXLEN];
 906
 907     PyUnicodeObject* v;
 908     PyObject* defobj = NULL;
 909     if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
 910         return NULL;
 911
 912     if (PyUnicode_GET_SIZE(v) != 1) {
 913         PyErr_SetString(PyExc_TypeError,
 914                         "need a single Unicode character as parameter");
 915         return NULL;
 916     }
 917
 918     if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
 919                              name, sizeof(name))) {
 920         if (defobj == NULL) {
 921             PyErr_SetString(PyExc_ValueError, "no such name");
 922             return NULL;
 923         }
 924         else {
 925             Py_INCREF(defobj);
 926             return defobj;
 927         }
 928     }
 929
 930     return Py_BuildValue("s", name);
 931 }
 932
 933 PyDoc_STRVAR(unicodedata_lookup__doc__,
 934 "lookup(name)\n\
 935 \n\
 936 Look up character by name.  If a character with the\n\
 937 given name is found, return the corresponding Unicode\n\
 938 character.  If not found, KeyError is raised.");
 939
 940 static PyObject *
 941 unicodedata_lookup(PyObject* self, PyObject* args)
 942 {
 943     Py_UCS4 code;
 944     Py_UNICODE str[1];
 945
 946     char* name;
 947     int namelen;
 948     if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
 949         return NULL;
 950
 951     if (!_getcode(name, namelen, &code)) {
 952         char fmt[] = "undefined character name '%s'";
 953         char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
 954         sprintf(buf, fmt, name);
 955         PyErr_SetString(PyExc_KeyError, buf);
 956         PyMem_FREE(buf);
 957         return NULL;
 958     }
 959
 960     str[0] = (Py_UNICODE) code;
 961     return PyUnicode_FromUnicode(str, 1);
 962 }
 963
 964 /* XXX Add doc strings. */
 965
 966 static PyMethodDef unicodedata_functions[] = {
 967     {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
 968     {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
 969     {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
 970     {"category", unicodedata_category, METH_VARARGS,
 971                  unicodedata_category__doc__},
 972     {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
 973                       unicodedata_bidirectional__doc__},
 974     {"combining", unicodedata_combining, METH_VARARGS,
 975                   unicodedata_combining__doc__},
 976     {"mirrored", unicodedata_mirrored, METH_VARARGS,
 977                  unicodedata_mirrored__doc__},
 978     {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
 979                          unicodedata_east_asian_width__doc__},
 980     {"decomposition", unicodedata_decomposition, METH_VARARGS,
 981                       unicodedata_decomposition__doc__},
 982     {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
 983     {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
 984     {"normalize", unicodedata_normalize, METH_VARARGS,
 985                   unicodedata_normalize__doc__},
 986     {NULL, NULL}                /* sentinel */
 987 };
 988
 989 PyDoc_STRVAR(unicodedata_docstring,
 990 "This module provides access to the Unicode Character Database which\n\
 991 defines character properties for all Unicode characters. The data in\n\
 992 this database is based on the UnicodeData.txt file version\n\
 993 3.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
 994 \n\
 995 The module uses the same names and symbols as defined by the\n\
 996 UnicodeData File Format 3.2.0 (see\n\
 997 http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html).");
 998
 999 PyMODINIT_FUNC
1000 initunicodedata(void)
1001 {
1002     PyObject *m, *v;
1003
1004     m = Py_InitModule3(
1005         "unicodedata", unicodedata_functions, unicodedata_docstring);
1006     if (!m)
1007         return;
1008
1009     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1010
1011     /* Export C API */
1012     v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
1013     if (v != NULL)
1014         PyModule_AddObject(m, "ucnhash_CAPI", v);
1015 }
1016
1017 /*
1018 Local variables:
1019 c-basic-offset: 4
1020 indent-tabs-mode: nil
1021 End:
1022 */