Blocked revisions 73580-73582 via svnmerge
[python/dscho.git] / Modules / unicodedata.c
blob2dddc488cfa18b44787aef5479c7043610109b9b
1 /* ------------------------------------------------------------------------
3 unicodedata -- Provides access to the Unicode 5.1 data base.
5 Data was extracted from the Unicode 5.1 UnicodeData.txt file.
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
11 Copyright (c) Corporation for National Research Initiatives.
13 ------------------------------------------------------------------------ */
15 #include "Python.h"
16 #include "ucnhash.h"
17 #include "structmember.h"
19 /* character properties */
/* Property record for a single code point.  The compressed two-level
   tables in unicodedata_db.h resolve a code point to an index into an
   array of these records. */
typedef struct {
    const unsigned char category;          /* index into
                                              _PyUnicode_CategoryNames */
    const unsigned char combining;         /* combining class value 0 - 255 */
    const unsigned char bidirectional;     /* index into
                                              _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;          /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;  /* index into
                                              _PyUnicode_EastAsianWidth */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
/* Delta describing how a code point's properties differed in an older
   Unicode version; 0xFF in a *_changed byte means "unchanged". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;   /* 0 == unassigned in old version */
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const int numeric_changed;
} change_record;
42 /* data file generated by Tools/unicode/makeunicodedata.py */
43 #include "unicodedata_db.h"
45 static const _PyUnicode_DatabaseRecord*
46 _getrecord_ex(Py_UCS4 code)
48 int index;
49 if (code >= 0x110000)
50 index = 0;
51 else {
52 index = index1[(code>>SHIFT)];
53 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
56 return &_PyUnicode_Database_Records[index];
59 /* ------------- Previous-version API ------------------------------------- */
60 typedef struct previous_version {
61 PyObject_HEAD
62 const char *name;
63 const change_record* (*getrecord)(Py_UCS4);
64 Py_UCS4 (*normalization)(Py_UCS4);
65 } PreviousDBVersion;
67 #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
69 static PyMemberDef DB_members[] = {
70 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
71 {NULL}
74 /* forward declaration */
75 static PyTypeObject UCD_Type;
76 #define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
78 static PyObject*
79 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
80 Py_UCS4 (*normalization)(Py_UCS4))
82 PreviousDBVersion *self;
83 self = PyObject_New(PreviousDBVersion, &UCD_Type);
84 if (self == NULL)
85 return NULL;
86 self->name = name;
87 self->getrecord = getrecord;
88 self->normalization = normalization;
89 return (PyObject*)self;
93 static Py_UCS4 getuchar(PyUnicodeObject *obj)
95 Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
97 if (PyUnicode_GET_SIZE(obj) == 1)
98 return *v;
99 #ifndef Py_UNICODE_WIDE
100 else if ((PyUnicode_GET_SIZE(obj) == 2) &&
101 (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
102 (0xDC00 <= v[1] && v[1] <= 0xDFFF))
103 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
104 #endif
105 PyErr_SetString(PyExc_TypeError,
106 "need a single Unicode character as parameter");
107 return (Py_UCS4)-1;
110 /* --- Module API --------------------------------------------------------- */
112 PyDoc_STRVAR(unicodedata_decimal__doc__,
113 "decimal(unichr[, default])\n\
115 Returns the decimal value assigned to the Unicode character unichr\n\
116 as integer. If no such value is defined, default is returned, or, if\n\
117 not given, ValueError is raised.");
119 static PyObject *
120 unicodedata_decimal(PyObject *self, PyObject *args)
122 PyUnicodeObject *v;
123 PyObject *defobj = NULL;
124 int have_old = 0;
125 long rc;
126 Py_UCS4 c;
128 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
129 return NULL;
130 c = getuchar(v);
131 if (c == (Py_UCS4)-1)
132 return NULL;
134 if (self && UCD_Check(self)) {
135 const change_record *old = get_old_record(self, c);
136 if (old->category_changed == 0) {
137 /* unassigned */
138 have_old = 1;
139 rc = -1;
141 else if (old->decimal_changed != 0xFF) {
142 have_old = 1;
143 rc = old->decimal_changed;
147 if (!have_old)
148 rc = Py_UNICODE_TODECIMAL(c);
149 if (rc < 0) {
150 if (defobj == NULL) {
151 PyErr_SetString(PyExc_ValueError,
152 "not a decimal");
153 return NULL;
155 else {
156 Py_INCREF(defobj);
157 return defobj;
160 return PyLong_FromLong(rc);
163 PyDoc_STRVAR(unicodedata_digit__doc__,
164 "digit(unichr[, default])\n\
166 Returns the digit value assigned to the Unicode character unichr as\n\
167 integer. If no such value is defined, default is returned, or, if\n\
168 not given, ValueError is raised.");
170 static PyObject *
171 unicodedata_digit(PyObject *self, PyObject *args)
173 PyUnicodeObject *v;
174 PyObject *defobj = NULL;
175 long rc;
176 Py_UCS4 c;
178 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
179 return NULL;
180 c = getuchar(v);
181 if (c == (Py_UCS4)-1)
182 return NULL;
183 rc = Py_UNICODE_TODIGIT(c);
184 if (rc < 0) {
185 if (defobj == NULL) {
186 PyErr_SetString(PyExc_ValueError, "not a digit");
187 return NULL;
189 else {
190 Py_INCREF(defobj);
191 return defobj;
194 return PyLong_FromLong(rc);
197 PyDoc_STRVAR(unicodedata_numeric__doc__,
198 "numeric(unichr[, default])\n\
200 Returns the numeric value assigned to the Unicode character unichr\n\
201 as float. If no such value is defined, default is returned, or, if\n\
202 not given, ValueError is raised.");
204 static PyObject *
205 unicodedata_numeric(PyObject *self, PyObject *args)
207 PyUnicodeObject *v;
208 PyObject *defobj = NULL;
209 int have_old = 0;
210 double rc;
211 Py_UCS4 c;
213 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
214 return NULL;
215 c = getuchar(v);
216 if (c == (Py_UCS4)-1)
217 return NULL;
219 if (self && UCD_Check(self)) {
220 const change_record *old = get_old_record(self, c);
221 if (old->category_changed == 0) {
222 /* unassigned */
223 have_old = 1;
224 rc = -1.0;
226 else if (old->decimal_changed != 0xFF) {
227 have_old = 1;
228 rc = old->decimal_changed;
232 if (!have_old)
233 rc = Py_UNICODE_TONUMERIC(c);
234 if (rc == -1.0) {
235 if (defobj == NULL) {
236 PyErr_SetString(PyExc_ValueError, "not a numeric character");
237 return NULL;
239 else {
240 Py_INCREF(defobj);
241 return defobj;
244 return PyFloat_FromDouble(rc);
247 PyDoc_STRVAR(unicodedata_category__doc__,
248 "category(unichr)\n\
250 Returns the general category assigned to the Unicode character\n\
251 unichr as string.");
253 static PyObject *
254 unicodedata_category(PyObject *self, PyObject *args)
256 PyUnicodeObject *v;
257 int index;
258 Py_UCS4 c;
260 if (!PyArg_ParseTuple(args, "O!:category",
261 &PyUnicode_Type, &v))
262 return NULL;
263 c = getuchar(v);
264 if (c == (Py_UCS4)-1)
265 return NULL;
266 index = (int) _getrecord_ex(c)->category;
267 if (self && UCD_Check(self)) {
268 const change_record *old = get_old_record(self, c);
269 if (old->category_changed != 0xFF)
270 index = old->category_changed;
272 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
275 PyDoc_STRVAR(unicodedata_bidirectional__doc__,
276 "bidirectional(unichr)\n\
278 Returns the bidirectional category assigned to the Unicode character\n\
279 unichr as string. If no such value is defined, an empty string is\n\
280 returned.");
282 static PyObject *
283 unicodedata_bidirectional(PyObject *self, PyObject *args)
285 PyUnicodeObject *v;
286 int index;
287 Py_UCS4 c;
289 if (!PyArg_ParseTuple(args, "O!:bidirectional",
290 &PyUnicode_Type, &v))
291 return NULL;
292 c = getuchar(v);
293 if (c == (Py_UCS4)-1)
294 return NULL;
295 index = (int) _getrecord_ex(c)->bidirectional;
296 if (self && UCD_Check(self)) {
297 const change_record *old = get_old_record(self, c);
298 if (old->category_changed == 0)
299 index = 0; /* unassigned */
300 else if (old->bidir_changed != 0xFF)
301 index = old->bidir_changed;
303 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
306 PyDoc_STRVAR(unicodedata_combining__doc__,
307 "combining(unichr)\n\
309 Returns the canonical combining class assigned to the Unicode\n\
310 character unichr as integer. Returns 0 if no combining class is\n\
311 defined.");
313 static PyObject *
314 unicodedata_combining(PyObject *self, PyObject *args)
316 PyUnicodeObject *v;
317 int index;
318 Py_UCS4 c;
320 if (!PyArg_ParseTuple(args, "O!:combining",
321 &PyUnicode_Type, &v))
322 return NULL;
323 c = getuchar(v);
324 if (c == (Py_UCS4)-1)
325 return NULL;
326 index = (int) _getrecord_ex(c)->combining;
327 if (self && UCD_Check(self)) {
328 const change_record *old = get_old_record(self, c);
329 if (old->category_changed == 0)
330 index = 0; /* unassigned */
332 return PyLong_FromLong(index);
335 PyDoc_STRVAR(unicodedata_mirrored__doc__,
336 "mirrored(unichr)\n\
338 Returns the mirrored property assigned to the Unicode character\n\
339 unichr as integer. Returns 1 if the character has been identified as\n\
340 a \"mirrored\" character in bidirectional text, 0 otherwise.");
342 static PyObject *
343 unicodedata_mirrored(PyObject *self, PyObject *args)
345 PyUnicodeObject *v;
346 int index;
347 Py_UCS4 c;
349 if (!PyArg_ParseTuple(args, "O!:mirrored",
350 &PyUnicode_Type, &v))
351 return NULL;
352 c = getuchar(v);
353 if (c == (Py_UCS4)-1)
354 return NULL;
355 index = (int) _getrecord_ex(c)->mirrored;
356 if (self && UCD_Check(self)) {
357 const change_record *old = get_old_record(self, c);
358 if (old->category_changed == 0)
359 index = 0; /* unassigned */
360 else if (old->mirrored_changed != 0xFF)
361 index = old->mirrored_changed;
363 return PyLong_FromLong(index);
366 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
367 "east_asian_width(unichr)\n\
369 Returns the east asian width assigned to the Unicode character\n\
370 unichr as string.");
372 static PyObject *
373 unicodedata_east_asian_width(PyObject *self, PyObject *args)
375 PyUnicodeObject *v;
376 int index;
377 Py_UCS4 c;
379 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
380 &PyUnicode_Type, &v))
381 return NULL;
382 c = getuchar(v);
383 if (c == (Py_UCS4)-1)
384 return NULL;
385 index = (int) _getrecord_ex(c)->east_asian_width;
386 if (self && UCD_Check(self)) {
387 const change_record *old = get_old_record(self, c);
388 if (old->category_changed == 0)
389 index = 0; /* unassigned */
391 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
394 PyDoc_STRVAR(unicodedata_decomposition__doc__,
395 "decomposition(unichr)\n\
397 Returns the character decomposition mapping assigned to the Unicode\n\
398 character unichr as string. An empty string is returned in case no\n\
399 such mapping is defined.");
401 static PyObject *
402 unicodedata_decomposition(PyObject *self, PyObject *args)
404 PyUnicodeObject *v;
405 char decomp[256];
406 int code, index, count, i;
407 unsigned int prefix_index;
408 Py_UCS4 c;
410 if (!PyArg_ParseTuple(args, "O!:decomposition",
411 &PyUnicode_Type, &v))
412 return NULL;
413 c = getuchar(v);
414 if (c == (Py_UCS4)-1)
415 return NULL;
417 code = (int)c;
419 if (self && UCD_Check(self)) {
420 const change_record *old = get_old_record(self, c);
421 if (old->category_changed == 0)
422 return PyUnicode_FromString(""); /* unassigned */
425 if (code < 0 || code >= 0x110000)
426 index = 0;
427 else {
428 index = decomp_index1[(code>>DECOMP_SHIFT)];
429 index = decomp_index2[(index<<DECOMP_SHIFT)+
430 (code&((1<<DECOMP_SHIFT)-1))];
433 /* high byte is number of hex bytes (usually one or two), low byte
434 is prefix code (from*/
435 count = decomp_data[index] >> 8;
437 /* XXX: could allocate the PyString up front instead
438 (strlen(prefix) + 5 * count + 1 bytes) */
440 /* Based on how index is calculated above and decomp_data is generated
441 from Tools/unicode/makeunicodedata.py, it should not be possible
442 to overflow decomp_prefix. */
443 prefix_index = decomp_data[index] & 255;
444 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
446 /* copy prefix */
447 i = strlen(decomp_prefix[prefix_index]);
448 memcpy(decomp, decomp_prefix[prefix_index], i);
450 while (count-- > 0) {
451 if (i)
452 decomp[i++] = ' ';
453 assert((size_t)i < sizeof(decomp));
454 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
455 decomp_data[++index]);
456 i += strlen(decomp + i);
459 decomp[i] = '\0';
461 return PyUnicode_FromString(decomp);
464 static void
465 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
467 if (code >= 0x110000) {
468 *index = 0;
469 } else if (self && UCD_Check(self) &&
470 get_old_record(self, code)->category_changed==0) {
471 /* unassigned in old version */
472 *index = 0;
474 else {
475 *index = decomp_index1[(code>>DECOMP_SHIFT)];
476 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
477 (code&((1<<DECOMP_SHIFT)-1))];
480 /* high byte is number of hex bytes (usually one or two), low byte
481 is prefix code (from*/
482 *count = decomp_data[*index] >> 8;
483 *prefix = decomp_data[*index] & 255;
485 (*index)++;
/* Hangul syllable (de)composition constants, from the Unicode Standard,
   chapter 3 (Conjoining Jamo Behavior). */
#define SBase 0xAC00    /* first precomposed syllable */
#define LBase 0x1100    /* first leading consonant (choseong) */
#define VBase 0x1161    /* first vowel (jungseong) */
#define TBase 0x11A7    /* one before the first trailing consonant (jongseong) */
#define LCount 19
#define VCount 21
#define TCount 28       /* includes the "no trailing consonant" slot */
#define NCount (VCount*TCount)
#define SCount (LCount*NCount)
498 static PyObject*
499 nfd_nfkd(PyObject *self, PyObject *input, int k)
501 PyObject *result;
502 Py_UNICODE *i, *end, *o;
503 /* Longest decomposition in Unicode 3.2: U+FDFA */
504 Py_UNICODE stack[20];
505 Py_ssize_t space, isize;
506 int index, prefix, count, stackptr;
507 unsigned char prev, cur;
509 stackptr = 0;
510 isize = PyUnicode_GET_SIZE(input);
511 /* Overallocate atmost 10 characters. */
512 space = (isize > 10 ? 10 : isize) + isize;
513 result = PyUnicode_FromUnicode(NULL, space);
514 if (!result)
515 return NULL;
516 i = PyUnicode_AS_UNICODE(input);
517 end = i + isize;
518 o = PyUnicode_AS_UNICODE(result);
520 while (i < end) {
521 stack[stackptr++] = *i++;
522 while(stackptr) {
523 Py_UNICODE code = stack[--stackptr];
524 /* Hangul Decomposition adds three characters in
525 a single step, so we need atleast that much room. */
526 if (space < 3) {
527 Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
528 space += 10;
529 if (PyUnicode_Resize(&result, newsize) == -1)
530 return NULL;
531 o = PyUnicode_AS_UNICODE(result) + newsize - space;
533 /* Hangul Decomposition. */
534 if (SBase <= code && code < (SBase+SCount)) {
535 int SIndex = code - SBase;
536 int L = LBase + SIndex / NCount;
537 int V = VBase + (SIndex % NCount) / TCount;
538 int T = TBase + SIndex % TCount;
539 *o++ = L;
540 *o++ = V;
541 space -= 2;
542 if (T != TBase) {
543 *o++ = T;
544 space --;
546 continue;
548 /* normalization changes */
549 if (self && UCD_Check(self)) {
550 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
551 if (value != 0) {
552 stack[stackptr++] = value;
553 continue;
557 /* Other decompositions. */
558 get_decomp_record(self, code, &index, &prefix, &count);
560 /* Copy character if it is not decomposable, or has a
561 compatibility decomposition, but we do NFD. */
562 if (!count || (prefix && !k)) {
563 *o++ = code;
564 space--;
565 continue;
567 /* Copy decomposition onto the stack, in reverse
568 order. */
569 while(count) {
570 code = decomp_data[index + (--count)];
571 stack[stackptr++] = code;
576 /* Drop overallocation. Cannot fail. */
577 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
579 /* Sort canonically. */
580 i = PyUnicode_AS_UNICODE(result);
581 prev = _getrecord_ex(*i)->combining;
582 end = i + PyUnicode_GET_SIZE(result);
583 for (i++; i < end; i++) {
584 cur = _getrecord_ex(*i)->combining;
585 if (prev == 0 || cur == 0 || prev <= cur) {
586 prev = cur;
587 continue;
589 /* Non-canonical order. Need to switch *i with previous. */
590 o = i - 1;
591 while (1) {
592 Py_UNICODE tmp = o[1];
593 o[1] = o[0];
594 o[0] = tmp;
595 o--;
596 if (o < PyUnicode_AS_UNICODE(result))
597 break;
598 prev = _getrecord_ex(*o)->combining;
599 if (prev == 0 || prev <= cur)
600 break;
602 prev = _getrecord_ex(*i)->combining;
604 return result;
607 static int
608 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
610 int index;
611 for (index = 0; nfc[index].start; index++) {
612 int start = nfc[index].start;
613 if (code < start)
614 return -1;
615 if (code <= start + nfc[index].count) {
616 int delta = code - start;
617 return nfc[index].index + delta;
620 return -1;
623 static PyObject*
624 nfc_nfkc(PyObject *self, PyObject *input, int k)
626 PyObject *result;
627 Py_UNICODE *i, *i1, *o, *end;
628 int f,l,index,index1,comb;
629 Py_UNICODE code;
630 Py_UNICODE *skipped[20];
631 int cskipped = 0;
633 result = nfd_nfkd(self, input, k);
634 if (!result)
635 return NULL;
637 /* We are going to modify result in-place.
638 If nfd_nfkd is changed to sometimes return the input,
639 this code needs to be reviewed. */
640 assert(result != input);
642 i = PyUnicode_AS_UNICODE(result);
643 end = i + PyUnicode_GET_SIZE(result);
644 o = PyUnicode_AS_UNICODE(result);
646 again:
647 while (i < end) {
648 for (index = 0; index < cskipped; index++) {
649 if (skipped[index] == i) {
650 /* *i character is skipped.
651 Remove from list. */
652 skipped[index] = skipped[cskipped-1];
653 cskipped--;
654 i++;
655 goto again; /* continue while */
658 /* Hangul Composition. We don't need to check for <LV,T>
659 pairs, since we always have decomposed data. */
660 if (LBase <= *i && *i < (LBase+LCount) &&
661 i + 1 < end &&
662 VBase <= i[1] && i[1] <= (VBase+VCount)) {
663 int LIndex, VIndex;
664 LIndex = i[0] - LBase;
665 VIndex = i[1] - VBase;
666 code = SBase + (LIndex*VCount+VIndex)*TCount;
667 i+=2;
668 if (i < end &&
669 TBase <= *i && *i <= (TBase+TCount)) {
670 code += *i-TBase;
671 i++;
673 *o++ = code;
674 continue;
677 f = find_nfc_index(self, nfc_first, *i);
678 if (f == -1) {
679 *o++ = *i++;
680 continue;
682 /* Find next unblocked character. */
683 i1 = i+1;
684 comb = 0;
685 while (i1 < end) {
686 int comb1 = _getrecord_ex(*i1)->combining;
687 if (comb1 && comb == comb1) {
688 /* Character is blocked. */
689 i1++;
690 continue;
692 l = find_nfc_index(self, nfc_last, *i1);
693 /* *i1 cannot be combined with *i. If *i1
694 is a starter, we don't need to look further.
695 Otherwise, record the combining class. */
696 if (l == -1) {
697 not_combinable:
698 if (comb1 == 0)
699 break;
700 comb = comb1;
701 i1++;
702 continue;
704 index = f*TOTAL_LAST + l;
705 index1 = comp_index[index >> COMP_SHIFT];
706 code = comp_data[(index1<<COMP_SHIFT)+
707 (index&((1<<COMP_SHIFT)-1))];
708 if (code == 0)
709 goto not_combinable;
711 /* Replace the original character. */
712 *i = code;
713 /* Mark the second character unused. */
714 skipped[cskipped++] = i1;
715 i1++;
716 f = find_nfc_index(self, nfc_first, *i);
717 if (f == -1)
718 break;
720 *o++ = *i++;
722 if (o != end)
723 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
724 return result;
727 /* Return 1 if the input is certainly normalized, 0 if it might not be. */
728 static int
729 is_normalized(PyObject *self, PyObject *input, int nfc, int k)
731 Py_UNICODE *i, *end;
732 unsigned char prev_combining = 0, quickcheck_mask;
734 /* An older version of the database is requested, quickchecks must be
735 disabled. */
736 if (self && UCD_Check(self))
737 return 0;
739 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
740 as described in http://unicode.org/reports/tr15/#Annex8. */
741 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
743 i = PyUnicode_AS_UNICODE(input);
744 end = i + PyUnicode_GET_SIZE(input);
745 while (i < end) {
746 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
747 unsigned char combining = record->combining;
748 unsigned char quickcheck = record->normalization_quick_check;
750 if (quickcheck & quickcheck_mask)
751 return 0; /* this string might need normalization */
752 if (combining && prev_combining > combining)
753 return 0; /* non-canonical sort order, not normalized */
754 prev_combining = combining;
756 return 1; /* certainly normalized */
759 PyDoc_STRVAR(unicodedata_normalize__doc__,
760 "normalize(form, unistr)\n\
762 Return the normal form 'form' for the Unicode string unistr. Valid\n\
763 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
765 static PyObject*
766 unicodedata_normalize(PyObject *self, PyObject *args)
768 char *form;
769 PyObject *input;
771 if(!PyArg_ParseTuple(args, "sO!:normalize",
772 &form, &PyUnicode_Type, &input))
773 return NULL;
775 if (PyUnicode_GetSize(input) == 0) {
776 /* Special case empty input strings, since resizing
777 them later would cause internal errors. */
778 Py_INCREF(input);
779 return input;
782 if (strcmp(form, "NFC") == 0) {
783 if (is_normalized(self, input, 1, 0)) {
784 Py_INCREF(input);
785 return input;
787 return nfc_nfkc(self, input, 0);
789 if (strcmp(form, "NFKC") == 0) {
790 if (is_normalized(self, input, 1, 1)) {
791 Py_INCREF(input);
792 return input;
794 return nfc_nfkc(self, input, 1);
796 if (strcmp(form, "NFD") == 0) {
797 if (is_normalized(self, input, 0, 0)) {
798 Py_INCREF(input);
799 return input;
801 return nfd_nfkd(self, input, 0);
803 if (strcmp(form, "NFKD") == 0) {
804 if (is_normalized(self, input, 0, 1)) {
805 Py_INCREF(input);
806 return input;
808 return nfd_nfkd(self, input, 1);
810 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
811 return NULL;
814 /* -------------------------------------------------------------------- */
815 /* unicode character name tables */
817 /* data file generated by Tools/unicode/makeunicodedata.py */
818 #include "unicodename_db.h"
820 /* -------------------------------------------------------------------- */
821 /* database code (cut and pasted from the unidb package) */
/* Case-insensitive 24-bit rolling hash over a character-name string;
   must match the hash used by Tools/unicode/makeunicodedata.py to build
   the code_hash table. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            /* fold overflowing bits back into the low 24 bits */
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
/* Romanized jamo names used to build/parse Hangul syllable names.
   Column 0: leading consonants (19), column 1: vowels (21),
   column 2: trailing consonants (28, first entry empty = none). */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
869 static int
870 is_unified_ideograph(Py_UCS4 code)
872 return (
873 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
874 (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
875 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
878 static int
879 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
881 int offset;
882 int i;
883 int word;
884 unsigned char* w;
886 if (code >= 0x110000)
887 return 0;
889 if (self && UCD_Check(self)) {
890 const change_record *old = get_old_record(self, code);
891 if (old->category_changed == 0) {
892 /* unassigned */
893 return 0;
897 if (SBase <= code && code < SBase+SCount) {
898 /* Hangul syllable. */
899 int SIndex = code - SBase;
900 int L = SIndex / NCount;
901 int V = (SIndex % NCount) / TCount;
902 int T = SIndex % TCount;
904 if (buflen < 27)
905 /* Worst case: HANGUL SYLLABLE <10chars>. */
906 return 0;
907 strcpy(buffer, "HANGUL SYLLABLE ");
908 buffer += 16;
909 strcpy(buffer, hangul_syllables[L][0]);
910 buffer += strlen(hangul_syllables[L][0]);
911 strcpy(buffer, hangul_syllables[V][1]);
912 buffer += strlen(hangul_syllables[V][1]);
913 strcpy(buffer, hangul_syllables[T][2]);
914 buffer += strlen(hangul_syllables[T][2]);
915 *buffer = '\0';
916 return 1;
919 if (is_unified_ideograph(code)) {
920 if (buflen < 28)
921 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
922 return 0;
923 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
924 return 1;
927 /* get offset into phrasebook */
928 offset = phrasebook_offset1[(code>>phrasebook_shift)];
929 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
930 (code&((1<<phrasebook_shift)-1))];
931 if (!offset)
932 return 0;
934 i = 0;
936 for (;;) {
937 /* get word index */
938 word = phrasebook[offset] - phrasebook_short;
939 if (word >= 0) {
940 word = (word << 8) + phrasebook[offset+1];
941 offset += 2;
942 } else
943 word = phrasebook[offset++];
944 if (i) {
945 if (i > buflen)
946 return 0; /* buffer overflow */
947 buffer[i++] = ' ';
949 /* copy word string from lexicon. the last character in the
950 word has bit 7 set. the last word in a string ends with
951 0x80 */
952 w = lexicon + lexicon_offset[word];
953 while (*w < 128) {
954 if (i >= buflen)
955 return 0; /* buffer overflow */
956 buffer[i++] = *w++;
958 if (i >= buflen)
959 return 0; /* buffer overflow */
960 buffer[i++] = *w & 127;
961 if (*w == 128)
962 break; /* end of word */
965 return 1;
968 static int
969 _cmpname(PyObject *self, int code, const char* name, int namelen)
971 /* check if code corresponds to the given name */
972 int i;
973 char buffer[NAME_MAXLEN];
974 if (!_getucname(self, code, buffer, sizeof(buffer)))
975 return 0;
976 for (i = 0; i < namelen; i++) {
977 if (toupper(Py_CHARMASK(name[i])) != buffer[i])
978 return 0;
980 return buffer[namelen] == '\0';
983 static void
984 find_syllable(const char *str, int *len, int *pos, int count, int column)
986 int i, len1;
987 *len = -1;
988 for (i = 0; i < count; i++) {
989 char *s = hangul_syllables[i][column];
990 len1 = strlen(s);
991 if (len1 <= *len)
992 continue;
993 if (strncmp(str, s, len1) == 0) {
994 *len = len1;
995 *pos = i;
998 if (*len == -1) {
999 *len = 0;
1003 static int
1004 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
1006 unsigned int h, v;
1007 unsigned int mask = code_size-1;
1008 unsigned int i, incr;
1010 /* Check for hangul syllables. */
1011 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1012 int len, L = -1, V = -1, T = -1;
1013 const char *pos = name + 16;
1014 find_syllable(pos, &len, &L, LCount, 0);
1015 pos += len;
1016 find_syllable(pos, &len, &V, VCount, 1);
1017 pos += len;
1018 find_syllable(pos, &len, &T, TCount, 2);
1019 pos += len;
1020 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1021 *code = SBase + (L*VCount+V)*TCount + T;
1022 return 1;
1024 /* Otherwise, it's an illegal syllable name. */
1025 return 0;
1028 /* Check for unified ideographs. */
1029 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1030 /* Four or five hexdigits must follow. */
1031 v = 0;
1032 name += 22;
1033 namelen -= 22;
1034 if (namelen != 4 && namelen != 5)
1035 return 0;
1036 while (namelen--) {
1037 v *= 16;
1038 if (*name >= '0' && *name <= '9')
1039 v += *name - '0';
1040 else if (*name >= 'A' && *name <= 'F')
1041 v += *name - 'A' + 10;
1042 else
1043 return 0;
1044 name++;
1046 if (!is_unified_ideograph(v))
1047 return 0;
1048 *code = v;
1049 return 1;
1052 /* the following is the same as python's dictionary lookup, with
1053 only minor changes. see the makeunicodedata script for more
1054 details */
1056 h = (unsigned int) _gethash(name, namelen, code_magic);
1057 i = (~h) & mask;
1058 v = code_hash[i];
1059 if (!v)
1060 return 0;
1061 if (_cmpname(self, v, name, namelen)) {
1062 *code = v;
1063 return 1;
1065 incr = (h ^ (h >> 3)) & mask;
1066 if (!incr)
1067 incr = mask;
1068 for (;;) {
1069 i = (i + incr) & mask;
1070 v = code_hash[i];
1071 if (!v)
1072 return 0;
1073 if (_cmpname(self, v, name, namelen)) {
1074 *code = v;
1075 return 1;
1077 incr = incr << 1;
1078 if (incr > mask)
1079 incr = incr ^ code_poly;
1083 static const _PyUnicode_Name_CAPI hashAPI =
1085 sizeof(_PyUnicode_Name_CAPI),
1086 _getucname,
1087 _getcode
1090 /* -------------------------------------------------------------------- */
1091 /* Python bindings */
1093 PyDoc_STRVAR(unicodedata_name__doc__,
1094 "name(unichr[, default])\n\
1095 Returns the name assigned to the Unicode character unichr as a\n\
1096 string. If no name is defined, default is returned, or, if not\n\
1097 given, ValueError is raised.");
1099 static PyObject *
1100 unicodedata_name(PyObject* self, PyObject* args)
1102 char name[NAME_MAXLEN];
1103 Py_UCS4 c;
1105 PyUnicodeObject* v;
1106 PyObject* defobj = NULL;
1107 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1108 return NULL;
1110 c = getuchar(v);
1111 if (c == (Py_UCS4)-1)
1112 return NULL;
1114 if (!_getucname(self, c, name, sizeof(name))) {
1115 if (defobj == NULL) {
1116 PyErr_SetString(PyExc_ValueError, "no such name");
1117 return NULL;
1119 else {
1120 Py_INCREF(defobj);
1121 return defobj;
1125 return PyUnicode_FromString(name);
1128 PyDoc_STRVAR(unicodedata_lookup__doc__,
1129 "lookup(name)\n\
1131 Look up character by name. If a character with the\n\
1132 given name is found, return the corresponding Unicode\n\
1133 character. If not found, KeyError is raised.");
1135 static PyObject *
1136 unicodedata_lookup(PyObject* self, PyObject* args)
1138 Py_UCS4 code;
1139 Py_UNICODE str[2];
1141 char* name;
1142 int namelen;
1143 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1144 return NULL;
1146 if (!_getcode(self, name, namelen, &code)) {
1147 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1148 name);
1149 return NULL;
1152 #ifndef Py_UNICODE_WIDE
1153 if (code >= 0x10000) {
1154 str[0] = 0xd800 + ((code - 0x10000) >> 10);
1155 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1156 return PyUnicode_FromUnicode(str, 2);
1158 #endif
1159 str[0] = (Py_UNICODE) code;
1160 return PyUnicode_FromUnicode(str, 1);
1163 /* XXX Add doc strings. */
1165 static PyMethodDef unicodedata_functions[] = {
1166 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1167 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1168 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1169 {"category", unicodedata_category, METH_VARARGS,
1170 unicodedata_category__doc__},
1171 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1172 unicodedata_bidirectional__doc__},
1173 {"combining", unicodedata_combining, METH_VARARGS,
1174 unicodedata_combining__doc__},
1175 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1176 unicodedata_mirrored__doc__},
1177 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1178 unicodedata_east_asian_width__doc__},
1179 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1180 unicodedata_decomposition__doc__},
1181 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1182 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1183 {"normalize", unicodedata_normalize, METH_VARARGS,
1184 unicodedata_normalize__doc__},
1185 {NULL, NULL} /* sentinel */
1188 static PyTypeObject UCD_Type = {
1189 /* The ob_type field must be initialized in the module init function
1190 * to be portable to Windows without using C++. */
1191 PyVarObject_HEAD_INIT(NULL, 0)
1192 "unicodedata.UCD", /*tp_name*/
1193 sizeof(PreviousDBVersion), /*tp_basicsize*/
1194 0, /*tp_itemsize*/
1195 /* methods */
1196 (destructor)PyObject_Del, /*tp_dealloc*/
1197 0, /*tp_print*/
1198 0, /*tp_getattr*/
1199 0, /*tp_setattr*/
1200 0, /*tp_reserved*/
1201 0, /*tp_repr*/
1202 0, /*tp_as_number*/
1203 0, /*tp_as_sequence*/
1204 0, /*tp_as_mapping*/
1205 0, /*tp_hash*/
1206 0, /*tp_call*/
1207 0, /*tp_str*/
1208 PyObject_GenericGetAttr,/*tp_getattro*/
1209 0, /*tp_setattro*/
1210 0, /*tp_as_buffer*/
1211 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1212 0, /*tp_doc*/
1213 0, /*tp_traverse*/
1214 0, /*tp_clear*/
1215 0, /*tp_richcompare*/
1216 0, /*tp_weaklistoffset*/
1217 0, /*tp_iter*/
1218 0, /*tp_iternext*/
1219 unicodedata_functions, /*tp_methods*/
1220 DB_members, /*tp_members*/
1221 0, /*tp_getset*/
1222 0, /*tp_base*/
1223 0, /*tp_dict*/
1224 0, /*tp_descr_get*/
1225 0, /*tp_descr_set*/
1226 0, /*tp_dictoffset*/
1227 0, /*tp_init*/
1228 0, /*tp_alloc*/
1229 0, /*tp_new*/
1230 0, /*tp_free*/
1231 0, /*tp_is_gc*/
1234 PyDoc_STRVAR(unicodedata_docstring,
1235 "This module provides access to the Unicode Character Database which\n\
1236 defines character properties for all Unicode characters. The data in\n\
1237 this database is based on the UnicodeData.txt file version\n\
1238 5.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
1240 The module uses the same names and symbols as defined by the\n\
1241 UnicodeData File Format 5.1.0 (see\n\
1242 http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
1245 static struct PyModuleDef unicodedatamodule = {
1246 PyModuleDef_HEAD_INIT,
1247 "unicodedata",
1248 unicodedata_docstring,
1250 unicodedata_functions,
1251 NULL,
1252 NULL,
1253 NULL,
1254 NULL
1257 PyMODINIT_FUNC
1258 PyInit_unicodedata(void)
1260 PyObject *m, *v;
1262 Py_TYPE(&UCD_Type) = &PyType_Type;
1264 m = PyModule_Create(&unicodedatamodule);
1265 if (!m)
1266 return NULL;
1268 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1269 Py_INCREF(&UCD_Type);
1270 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1272 /* Previous versions */
1273 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1274 if (v != NULL)
1275 PyModule_AddObject(m, "ucd_3_2_0", v);
1277 /* Export C API */
1278 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
1279 if (v != NULL)
1280 PyModule_AddObject(m, "ucnhash_CAPI", v);
1281 return m;
1285 Local variables:
1286 c-basic-offset: 4
1287 indent-tabs-mode: nil
1288 End: