Modules/unicodedata.c

   1 /* ------------------------------------------------------------------------
   2
   3    unicodedata -- Provides access to the Unicode 5.1 data base.
   4
   5    Data was extracted from the Unicode 5.1 UnicodeData.txt file.
   6
   7    Written by Marc-Andre Lemburg (mal@lemburg.com).
   8    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   9    Modified by Martin v. Löwis (martin@v.loewis.de)
  10
  11    Copyright (c) Corporation for National Research Initiatives.
  12
  13    ------------------------------------------------------------------------ */
  14
  15 #include "Python.h"
  16 #include "ucnhash.h"
  17 #include "structmember.h"
  18
  19 /* character properties */
  20
  21 typedef struct {
  22     const unsigned char category;       /* index into
  23                                            _PyUnicode_CategoryNames */
  24     const unsigned char combining;      /* combining class value 0 - 255 */
  25     const unsigned char bidirectional;  /* index into
  26                                            _PyUnicode_BidirectionalNames */
  27     const unsigned char mirrored;       /* true if mirrored in bidir mode */
  28     const unsigned char east_asian_width;       /* index into
  29                                                    _PyUnicode_EastAsianWidth */
  30 } _PyUnicode_DatabaseRecord;
  31
  32 typedef struct change_record {
  33     /* sequence of fields should be the same as in merge_old_version */
  34     const unsigned char bidir_changed;
  35     const unsigned char category_changed;
  36     const unsigned char decimal_changed;
  37     const unsigned char mirrored_changed;
  38     const int numeric_changed;
  39 } change_record;
  40
  41 /* data file generated by Tools/unicode/makeunicodedata.py */
  42 #include "unicodedata_db.h"
  43
  44 static const _PyUnicode_DatabaseRecord*
  45 _getrecord_ex(Py_UCS4 code)
  46 {
  47     int index;
  48     if (code >= 0x110000)
  49         index = 0;
  50     else {
  51         index = index1[(code>>SHIFT)];
  52         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
  53     }
  54
  55     return &_PyUnicode_Database_Records[index];
  56 }
  57
  58 /* ------------- Previous-version API ------------------------------------- */
  59 typedef struct previous_version {
  60     PyObject_HEAD
  61     const char *name;
  62     const change_record* (*getrecord)(Py_UCS4);
  63     Py_UCS4 (*normalization)(Py_UCS4);
  64 } PreviousDBVersion;
  65
  66 #define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))
  67
  68 static PyMemberDef DB_members[] = {
  69         {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
  70         {NULL}
  71 };
  72
  73 /* forward declaration */
  74 static PyTypeObject UCD_Type;
  75
  76 static PyObject*
  77 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
  78                      Py_UCS4 (*normalization)(Py_UCS4))
  79 {
  80         PreviousDBVersion *self;
  81         self = PyObject_New(PreviousDBVersion, &UCD_Type);
  82         if (self == NULL)
  83                 return NULL;
  84         self->name = name;
  85         self->getrecord = getrecord;
  86         self->normalization = normalization;
  87         return (PyObject*)self;
  88 }
  89
  90
  91 static Py_UCS4 getuchar(PyUnicodeObject *obj)
  92 {
  93     Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
  94
  95     if (PyUnicode_GET_SIZE(obj) == 1)
  96         return *v;
  97 #ifndef Py_UNICODE_WIDE
  98     else if ((PyUnicode_GET_SIZE(obj) == 2) &&
  99              (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
 100              (0xDC00 <= v[1] && v[1] <= 0xDFFF))
 101         return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
 102 #endif
 103     PyErr_SetString(PyExc_TypeError,
 104                     "need a single Unicode character as parameter");
 105     return (Py_UCS4)-1;
 106 }
 107
 108 /* --- Module API --------------------------------------------------------- */
 109
 110 PyDoc_STRVAR(unicodedata_decimal__doc__,
 111 "decimal(unichr[, default])\n\
 112 \n\
 113 Returns the decimal value assigned to the Unicode character unichr\n\
 114 as integer. If no such value is defined, default is returned, or, if\n\
 115 not given, ValueError is raised.");
 116
 117 static PyObject *
 118 unicodedata_decimal(PyObject *self, PyObject *args)
 119 {
 120     PyUnicodeObject *v;
 121     PyObject *defobj = NULL;
 122     int have_old = 0;
 123     long rc;
 124     Py_UCS4 c;
 125
 126     if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
 127         return NULL;
 128     c = getuchar(v);
 129     if (c == (Py_UCS4)-1)
 130         return NULL;
 131
 132     if (self) {
 133         const change_record *old = get_old_record(self, c);
 134         if (old->category_changed == 0) {
 135             /* unassigned */
 136             have_old = 1;
 137             rc = -1;
 138         }
 139         else if (old->decimal_changed != 0xFF) {
 140             have_old = 1;
 141             rc = old->decimal_changed;
 142         }
 143     }
 144
 145     if (!have_old)
 146         rc = Py_UNICODE_TODECIMAL(c);
 147     if (rc < 0) {
 148         if (defobj == NULL) {
 149             PyErr_SetString(PyExc_ValueError,
 150                             "not a decimal");
 151             return NULL;
 152         }
 153         else {
 154             Py_INCREF(defobj);
 155             return defobj;
 156         }
 157     }
 158     return PyInt_FromLong(rc);
 159 }
 160
 161 PyDoc_STRVAR(unicodedata_digit__doc__,
 162 "digit(unichr[, default])\n\
 163 \n\
 164 Returns the digit value assigned to the Unicode character unichr as\n\
 165 integer. If no such value is defined, default is returned, or, if\n\
 166 not given, ValueError is raised.");
 167
 168 static PyObject *
 169 unicodedata_digit(PyObject *self, PyObject *args)
 170 {
 171     PyUnicodeObject *v;
 172     PyObject *defobj = NULL;
 173     long rc;
 174     Py_UCS4 c;
 175
 176     if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
 177         return NULL;
 178     c = getuchar(v);
 179     if (c == (Py_UCS4)-1)
 180         return NULL;
 181     rc = Py_UNICODE_TODIGIT(c);
 182     if (rc < 0) {
 183         if (defobj == NULL) {
 184             PyErr_SetString(PyExc_ValueError, "not a digit");
 185             return NULL;
 186         }
 187         else {
 188             Py_INCREF(defobj);
 189             return defobj;
 190         }
 191     }
 192     return PyInt_FromLong(rc);
 193 }
 194
 195 PyDoc_STRVAR(unicodedata_numeric__doc__,
 196 "numeric(unichr[, default])\n\
 197 \n\
 198 Returns the numeric value assigned to the Unicode character unichr\n\
 199 as float. If no such value is defined, default is returned, or, if\n\
 200 not given, ValueError is raised.");
 201
 202 static PyObject *
 203 unicodedata_numeric(PyObject *self, PyObject *args)
 204 {
 205     PyUnicodeObject *v;
 206     PyObject *defobj = NULL;
 207     int have_old = 0;
 208     double rc;
 209     Py_UCS4 c;
 210
 211     if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
 212         return NULL;
 213     c = getuchar(v);
 214     if (c == (Py_UCS4)-1)
 215         return NULL;
 216
 217     if (self) {
 218         const change_record *old = get_old_record(self, c);
 219         if (old->category_changed == 0) {
 220             /* unassigned */
 221             have_old = 1;
 222             rc = -1.0;
 223         }
 224         else if (old->decimal_changed != 0xFF) {
 225             have_old = 1;
 226             rc = old->decimal_changed;
 227         }
 228     }
 229
 230     if (!have_old)
 231         rc = Py_UNICODE_TONUMERIC(c);
 232     if (rc == -1.0) {
 233         if (defobj == NULL) {
 234             PyErr_SetString(PyExc_ValueError, "not a numeric character");
 235             return NULL;
 236         }
 237         else {
 238             Py_INCREF(defobj);
 239             return defobj;
 240         }
 241     }
 242     return PyFloat_FromDouble(rc);
 243 }
 244
 245 PyDoc_STRVAR(unicodedata_category__doc__,
 246 "category(unichr)\n\
 247 \n\
 248 Returns the general category assigned to the Unicode character\n\
 249 unichr as string.");
 250
 251 static PyObject *
 252 unicodedata_category(PyObject *self, PyObject *args)
 253 {
 254     PyUnicodeObject *v;
 255     int index;
 256     Py_UCS4 c;
 257
 258     if (!PyArg_ParseTuple(args, "O!:category",
 259                           &PyUnicode_Type, &v))
 260         return NULL;
 261     c = getuchar(v);
 262     if (c == (Py_UCS4)-1)
 263         return NULL;
 264     index = (int) _getrecord_ex(c)->category;
 265     if (self) {
 266         const change_record *old = get_old_record(self, c);
 267         if (old->category_changed != 0xFF)
 268             index = old->category_changed;
 269     }
 270     return PyString_FromString(_PyUnicode_CategoryNames[index]);
 271 }
 272
 273 PyDoc_STRVAR(unicodedata_bidirectional__doc__,
 274 "bidirectional(unichr)\n\
 275 \n\
 276 Returns the bidirectional category assigned to the Unicode character\n\
 277 unichr as string. If no such value is defined, an empty string is\n\
 278 returned.");
 279
 280 static PyObject *
 281 unicodedata_bidirectional(PyObject *self, PyObject *args)
 282 {
 283     PyUnicodeObject *v;
 284     int index;
 285     Py_UCS4 c;
 286
 287     if (!PyArg_ParseTuple(args, "O!:bidirectional",
 288                           &PyUnicode_Type, &v))
 289         return NULL;
 290     c = getuchar(v);
 291     if (c == (Py_UCS4)-1)
 292         return NULL;
 293     index = (int) _getrecord_ex(c)->bidirectional;
 294     if (self) {
 295         const change_record *old = get_old_record(self, c);
 296         if (old->category_changed == 0)
 297             index = 0; /* unassigned */
 298         else if (old->bidir_changed != 0xFF)
 299             index = old->bidir_changed;
 300     }
 301     return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
 302 }
 303
 304 PyDoc_STRVAR(unicodedata_combining__doc__,
 305 "combining(unichr)\n\
 306 \n\
 307 Returns the canonical combining class assigned to the Unicode\n\
 308 character unichr as integer. Returns 0 if no combining class is\n\
 309 defined.");
 310
 311 static PyObject *
 312 unicodedata_combining(PyObject *self, PyObject *args)
 313 {
 314     PyUnicodeObject *v;
 315     int index;
 316     Py_UCS4 c;
 317
 318     if (!PyArg_ParseTuple(args, "O!:combining",
 319                           &PyUnicode_Type, &v))
 320         return NULL;
 321     c = getuchar(v);
 322     if (c == (Py_UCS4)-1)
 323         return NULL;
 324     index = (int) _getrecord_ex(c)->combining;
 325     if (self) {
 326         const change_record *old = get_old_record(self, c);
 327         if (old->category_changed == 0)
 328             index = 0; /* unassigned */
 329     }
 330     return PyInt_FromLong(index);
 331 }
 332
 333 PyDoc_STRVAR(unicodedata_mirrored__doc__,
 334 "mirrored(unichr)\n\
 335 \n\
 336 Returns the mirrored property assigned to the Unicode character\n\
 337 unichr as integer. Returns 1 if the character has been identified as\n\
 338 a \"mirrored\" character in bidirectional text, 0 otherwise.");
 339
 340 static PyObject *
 341 unicodedata_mirrored(PyObject *self, PyObject *args)
 342 {
 343     PyUnicodeObject *v;
 344     int index;
 345     Py_UCS4 c;
 346
 347     if (!PyArg_ParseTuple(args, "O!:mirrored",
 348                           &PyUnicode_Type, &v))
 349         return NULL;
 350     c = getuchar(v);
 351     if (c == (Py_UCS4)-1)
 352         return NULL;
 353     index = (int) _getrecord_ex(c)->mirrored;
 354     if (self) {
 355         const change_record *old = get_old_record(self, c);
 356         if (old->category_changed == 0)
 357             index = 0; /* unassigned */
 358         else if (old->mirrored_changed != 0xFF)
 359             index = old->mirrored_changed;
 360     }
 361     return PyInt_FromLong(index);
 362 }
 363
 364 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
 365 "east_asian_width(unichr)\n\
 366 \n\
 367 Returns the east asian width assigned to the Unicode character\n\
 368 unichr as string.");
 369
 370 static PyObject *
 371 unicodedata_east_asian_width(PyObject *self, PyObject *args)
 372 {
 373     PyUnicodeObject *v;
 374     int index;
 375     Py_UCS4 c;
 376
 377     if (!PyArg_ParseTuple(args, "O!:east_asian_width",
 378                           &PyUnicode_Type, &v))
 379         return NULL;
 380     c = getuchar(v);
 381     if (c == (Py_UCS4)-1)
 382         return NULL;
 383     index = (int) _getrecord_ex(c)->east_asian_width;
 384     if (self) {
 385         const change_record *old = get_old_record(self, c);
 386         if (old->category_changed == 0)
 387             index = 0; /* unassigned */
 388     }
 389     return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
 390 }
 391
 392 PyDoc_STRVAR(unicodedata_decomposition__doc__,
 393 "decomposition(unichr)\n\
 394 \n\
 395 Returns the character decomposition mapping assigned to the Unicode\n\
 396 character unichr as string. An empty string is returned in case no\n\
 397 such mapping is defined.");
 398
 399 static PyObject *
 400 unicodedata_decomposition(PyObject *self, PyObject *args)
 401 {
 402     PyUnicodeObject *v;
 403     char decomp[256];
 404     int code, index, count, i;
 405     unsigned int prefix_index;
 406     Py_UCS4 c;
 407
 408     if (!PyArg_ParseTuple(args, "O!:decomposition",
 409                           &PyUnicode_Type, &v))
 410         return NULL;
 411     c = getuchar(v);
 412     if (c == (Py_UCS4)-1)
 413         return NULL;
 414
 415     code = (int)c;
 416
 417     if (self) {
 418         const change_record *old = get_old_record(self, c);
 419         if (old->category_changed == 0)
 420             return PyString_FromString(""); /* unassigned */
 421     }
 422
 423     if (code < 0 || code >= 0x110000)
 424         index = 0;
 425     else {
 426         index = decomp_index1[(code>>DECOMP_SHIFT)];
 427         index = decomp_index2[(index<<DECOMP_SHIFT)+
 428                              (code&((1<<DECOMP_SHIFT)-1))];
 429     }
 430
 431     /* high byte is number of hex bytes (usually one or two), low byte
 432        is prefix code (from*/
 433     count = decomp_data[index] >> 8;
 434
 435     /* XXX: could allocate the PyString up front instead
 436        (strlen(prefix) + 5 * count + 1 bytes) */
 437
 438     /* Based on how index is calculated above and decomp_data is generated
 439        from Tools/unicode/makeunicodedata.py, it should not be possible
 440        to overflow decomp_prefix. */
 441     prefix_index = decomp_data[index] & 255;
 442     assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
 443
 444     /* copy prefix */
 445     i = strlen(decomp_prefix[prefix_index]);
 446     memcpy(decomp, decomp_prefix[prefix_index], i);
 447
 448     while (count-- > 0) {
 449         if (i)
 450             decomp[i++] = ' ';
 451         assert((size_t)i < sizeof(decomp));
 452         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
 453                       decomp_data[++index]);
 454         i += strlen(decomp + i);
 455     }
 456
 457     decomp[i] = '\0';
 458
 459     return PyString_FromString(decomp);
 460 }
 461
 462 static void
 463 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
 464 {
 465     if (code >= 0x110000) {
 466         *index = 0;
 467     } else if (self && get_old_record(self, code)->category_changed==0) {
 468         /* unassigned in old version */
 469         *index = 0;
 470     }
 471     else {
 472         *index = decomp_index1[(code>>DECOMP_SHIFT)];
 473         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
 474                                (code&((1<<DECOMP_SHIFT)-1))];
 475     }
 476
 477     /* high byte is number of hex bytes (usually one or two), low byte
 478        is prefix code (from*/
 479     *count = decomp_data[*index] >> 8;
 480     *prefix = decomp_data[*index] & 255;
 481
 482     (*index)++;
 483 }
 484
 485 #define SBase   0xAC00
 486 #define LBase   0x1100
 487 #define VBase   0x1161
 488 #define TBase   0x11A7
 489 #define LCount  19
 490 #define VCount  21
 491 #define TCount  28
 492 #define NCount  (VCount*TCount)
 493 #define SCount  (LCount*NCount)
 494
 495 static PyObject*
 496 nfd_nfkd(PyObject *self, PyObject *input, int k)
 497 {
 498     PyObject *result;
 499     Py_UNICODE *i, *end, *o;
 500     /* Longest decomposition in Unicode 3.2: U+FDFA */
 501     Py_UNICODE stack[20];
 502     Py_ssize_t space, isize;
 503     int index, prefix, count, stackptr;
 504     unsigned char prev, cur;
 505
 506     stackptr = 0;
 507     isize = PyUnicode_GET_SIZE(input);
 508     /* Overallocate atmost 10 characters. */
 509     space = (isize > 10 ? 10 : isize) + isize;
 510     result = PyUnicode_FromUnicode(NULL, space);
 511     if (!result)
 512         return NULL;
 513     i = PyUnicode_AS_UNICODE(input);
 514     end = i + isize;
 515     o = PyUnicode_AS_UNICODE(result);
 516
 517     while (i < end) {
 518         stack[stackptr++] = *i++;
 519         while(stackptr) {
 520             Py_UNICODE code = stack[--stackptr];
 521             /* Hangul Decomposition adds three characters in
 522                a single step, so we need atleast that much room. */
 523             if (space < 3) {
 524                 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
 525                 space += 10;
 526                 if (PyUnicode_Resize(&result, newsize) == -1)
 527                     return NULL;
 528                 o = PyUnicode_AS_UNICODE(result) + newsize - space;
 529             }
 530             /* Hangul Decomposition. */
 531             if (SBase <= code && code < (SBase+SCount)) {
 532                 int SIndex = code - SBase;
 533                 int L = LBase + SIndex / NCount;
 534                 int V = VBase + (SIndex % NCount) / TCount;
 535                 int T = TBase + SIndex % TCount;
 536                 *o++ = L;
 537                 *o++ = V;
 538                 space -= 2;
 539                 if (T != TBase) {
 540                     *o++ = T;
 541                     space --;
 542                 }
 543                 continue;
 544             }
 545             /* normalization changes */
 546             if (self) {
 547                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
 548                 if (value != 0) {
 549                     stack[stackptr++] = value;
 550                     continue;
 551                 }
 552             }
 553
 554             /* Other decompositions. */
 555             get_decomp_record(self, code, &index, &prefix, &count);
 556
 557             /* Copy character if it is not decomposable, or has a
 558                compatibility decomposition, but we do NFD. */
 559             if (!count || (prefix && !k)) {
 560                 *o++ = code;
 561                 space--;
 562                 continue;
 563             }
 564             /* Copy decomposition onto the stack, in reverse
 565                order.  */
 566             while(count) {
 567                 code = decomp_data[index + (--count)];
 568                 stack[stackptr++] = code;
 569             }
 570         }
 571     }
 572
 573     /* Drop overallocation. Cannot fail. */
 574     PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
 575
 576     /* Sort canonically. */
 577     i = PyUnicode_AS_UNICODE(result);
 578     prev = _getrecord_ex(*i)->combining;
 579     end = i + PyUnicode_GET_SIZE(result);
 580     for (i++; i < end; i++) {
 581         cur = _getrecord_ex(*i)->combining;
 582         if (prev == 0 || cur == 0 || prev <= cur) {
 583             prev = cur;
 584             continue;
 585         }
 586         /* Non-canonical order. Need to switch *i with previous. */
 587         o = i - 1;
 588         while (1) {
 589             Py_UNICODE tmp = o[1];
 590             o[1] = o[0];
 591             o[0] = tmp;
 592             o--;
 593             if (o < PyUnicode_AS_UNICODE(result))
 594                 break;
 595             prev = _getrecord_ex(*o)->combining;
 596             if (prev == 0 || prev <= cur)
 597                 break;
 598         }
 599         prev = _getrecord_ex(*i)->combining;
 600     }
 601     return result;
 602 }
 603
 604 static int
 605 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
 606 {
 607     int index;
 608     for (index = 0; nfc[index].start; index++) {
 609         int start = nfc[index].start;
 610         if (code < start)
 611             return -1;
 612         if (code <= start + nfc[index].count) {
 613             int delta = code - start;
 614             return nfc[index].index + delta;
 615         }
 616     }
 617     return -1;
 618 }
 619
 620 static PyObject*
 621 nfc_nfkc(PyObject *self, PyObject *input, int k)
 622 {
 623     PyObject *result;
 624     Py_UNICODE *i, *i1, *o, *end;
 625     int f,l,index,index1,comb;
 626     Py_UNICODE code;
 627     Py_UNICODE *skipped[20];
 628     int cskipped = 0;
 629
 630     result = nfd_nfkd(self, input, k);
 631     if (!result)
 632         return NULL;
 633
 634     /* We are going to modify result in-place.
 635        If nfd_nfkd is changed to sometimes return the input,
 636        this code needs to be reviewed. */
 637     assert(result != input);
 638
 639     i = PyUnicode_AS_UNICODE(result);
 640     end = i + PyUnicode_GET_SIZE(result);
 641     o = PyUnicode_AS_UNICODE(result);
 642
 643   again:
 644     while (i < end) {
 645       for (index = 0; index < cskipped; index++) {
 646           if (skipped[index] == i) {
 647               /* *i character is skipped.
 648                  Remove from list. */
 649               skipped[index] = skipped[cskipped-1];
 650               cskipped--;
 651               i++;
 652               goto again; /* continue while */
 653           }
 654       }
 655       /* Hangul Composition. We don't need to check for <LV,T>
 656          pairs, since we always have decomposed data. */
 657       if (LBase <= *i && *i < (LBase+LCount) &&
 658           i + 1 < end &&
 659           VBase <= i[1] && i[1] <= (VBase+VCount)) {
 660           int LIndex, VIndex;
 661           LIndex = i[0] - LBase;
 662           VIndex = i[1] - VBase;
 663           code = SBase + (LIndex*VCount+VIndex)*TCount;
 664           i+=2;
 665           if (i < end &&
 666               TBase <= *i && *i <= (TBase+TCount)) {
 667               code += *i-TBase;
 668               i++;
 669           }
 670           *o++ = code;
 671           continue;
 672       }
 673
 674       f = find_nfc_index(self, nfc_first, *i);
 675       if (f == -1) {
 676           *o++ = *i++;
 677           continue;
 678       }
 679       /* Find next unblocked character. */
 680       i1 = i+1;
 681       comb = 0;
 682       while (i1 < end) {
 683           int comb1 = _getrecord_ex(*i1)->combining;
 684           if (comb1 && comb == comb1) {
 685               /* Character is blocked. */
 686               i1++;
 687               continue;
 688           }
 689           l = find_nfc_index(self, nfc_last, *i1);
 690           /* *i1 cannot be combined with *i. If *i1
 691              is a starter, we don't need to look further.
 692              Otherwise, record the combining class. */
 693           if (l == -1) {
 694             not_combinable:
 695               if (comb1 == 0)
 696                   break;
 697               comb = comb1;
 698               i1++;
 699               continue;
 700           }
 701           index = f*TOTAL_LAST + l;
 702           index1 = comp_index[index >> COMP_SHIFT];
 703           code = comp_data[(index1<<COMP_SHIFT)+
 704                            (index&((1<<COMP_SHIFT)-1))];
 705           if (code == 0)
 706               goto not_combinable;
 707
 708           /* Replace the original character. */
 709           *i = code;
 710           /* Mark the second character unused. */
 711           skipped[cskipped++] = i1;
 712           i1++;
 713           f = find_nfc_index(self, nfc_first, *i);
 714           if (f == -1)
 715               break;
 716       }
 717       *o++ = *i++;
 718     }
 719     if (o != end)
 720         PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
 721     return result;
 722 }
 723
 724 PyDoc_STRVAR(unicodedata_normalize__doc__,
 725 "normalize(form, unistr)\n\
 726 \n\
 727 Return the normal form 'form' for the Unicode string unistr.  Valid\n\
 728 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
 729
 730 static PyObject*
 731 unicodedata_normalize(PyObject *self, PyObject *args)
 732 {
 733     char *form;
 734     PyObject *input;
 735
 736     if(!PyArg_ParseTuple(args, "sO!:normalize",
 737                          &form, &PyUnicode_Type, &input))
 738         return NULL;
 739
 740     if (PyUnicode_GetSize(input) == 0) {
 741         /* Special case empty input strings, since resizing
 742            them  later would cause internal errors. */
 743         Py_INCREF(input);
 744         return input;
 745     }
 746
 747     if (strcmp(form, "NFC") == 0)
 748         return nfc_nfkc(self, input, 0);
 749     if (strcmp(form, "NFKC") == 0)
 750         return nfc_nfkc(self, input, 1);
 751     if (strcmp(form, "NFD") == 0)
 752         return nfd_nfkd(self, input, 0);
 753     if (strcmp(form, "NFKD") == 0)
 754         return nfd_nfkd(self, input, 1);
 755     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
 756     return NULL;
 757 }
 758
 759 /* -------------------------------------------------------------------- */
 760 /* unicode character name tables */
 761
 762 /* data file generated by Tools/unicode/makeunicodedata.py */
 763 #include "unicodename_db.h"
 764
 765 /* -------------------------------------------------------------------- */
 766 /* database code (cut and pasted from the unidb package) */
 767
 768 static unsigned long
 769 _gethash(const char *s, int len, int scale)
 770 {
 771     int i;
 772     unsigned long h = 0;
 773     unsigned long ix;
 774     for (i = 0; i < len; i++) {
 775         h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
 776         ix = h & 0xff000000;
 777         if (ix)
 778             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
 779     }
 780     return h;
 781 }
 782
 783 static char *hangul_syllables[][3] = {
 784     { "G",  "A",   ""   },
 785     { "GG", "AE",  "G"  },
 786     { "N",  "YA",  "GG" },
 787     { "D",  "YAE", "GS" },
 788     { "DD", "EO",  "N", },
 789     { "R",  "E",   "NJ" },
 790     { "M",  "YEO", "NH" },
 791     { "B",  "YE",  "D"  },
 792     { "BB", "O",   "L"  },
 793     { "S",  "WA",  "LG" },
 794     { "SS", "WAE", "LM" },
 795     { "",   "OE",  "LB" },
 796     { "J",  "YO",  "LS" },
 797     { "JJ", "U",   "LT" },
 798     { "C",  "WEO", "LP" },
 799     { "K",  "WE",  "LH" },
 800     { "T",  "WI",  "M"  },
 801     { "P",  "YU",  "B"  },
 802     { "H",  "EU",  "BS" },
 803     { 0,    "YI",  "S"  },
 804     { 0,    "I",   "SS" },
 805     { 0,    0,     "NG" },
 806     { 0,    0,     "J"  },
 807     { 0,    0,     "C"  },
 808     { 0,    0,     "K"  },
 809     { 0,    0,     "T"  },
 810     { 0,    0,     "P"  },
 811     { 0,    0,     "H"  }
 812 };
 813
 814 static int
 815 is_unified_ideograph(Py_UCS4 code)
 816 {
 817     return (
 818         (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
 819         (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
 820         (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
 821 }
 822
 823 static int
 824 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
 825 {
 826     int offset;
 827     int i;
 828     int word;
 829     unsigned char* w;
 830
 831     if (code >= 0x110000)
 832         return 0;
 833
 834     if (self) {
 835         const change_record *old = get_old_record(self, code);
 836         if (old->category_changed == 0) {
 837             /* unassigned */
 838             return 0;
 839         }
 840     }
 841
 842     if (SBase <= code && code < SBase+SCount) {
 843         /* Hangul syllable. */
 844         int SIndex = code - SBase;
 845         int L = SIndex / NCount;
 846         int V = (SIndex % NCount) / TCount;
 847         int T = SIndex % TCount;
 848
 849         if (buflen < 27)
 850             /* Worst case: HANGUL SYLLABLE <10chars>. */
 851             return 0;
 852         strcpy(buffer, "HANGUL SYLLABLE ");
 853         buffer += 16;
 854         strcpy(buffer, hangul_syllables[L][0]);
 855         buffer += strlen(hangul_syllables[L][0]);
 856         strcpy(buffer, hangul_syllables[V][1]);
 857         buffer += strlen(hangul_syllables[V][1]);
 858         strcpy(buffer, hangul_syllables[T][2]);
 859         buffer += strlen(hangul_syllables[T][2]);
 860         *buffer = '\0';
 861         return 1;
 862     }
 863
 864     if (is_unified_ideograph(code)) {
 865         if (buflen < 28)
 866             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
 867             return 0;
 868         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
 869         return 1;
 870     }
 871
 872     /* get offset into phrasebook */
 873     offset = phrasebook_offset1[(code>>phrasebook_shift)];
 874     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
 875                                (code&((1<<phrasebook_shift)-1))];
 876     if (!offset)
 877         return 0;
 878
 879     i = 0;
 880
 881     for (;;) {
 882         /* get word index */
 883         word = phrasebook[offset] - phrasebook_short;
 884         if (word >= 0) {
 885             word = (word << 8) + phrasebook[offset+1];
 886             offset += 2;
 887         } else
 888             word = phrasebook[offset++];
 889         if (i) {
 890             if (i > buflen)
 891                 return 0; /* buffer overflow */
 892             buffer[i++] = ' ';
 893         }
 894         /* copy word string from lexicon.  the last character in the
 895            word has bit 7 set.  the last word in a string ends with
 896            0x80 */
 897         w = lexicon + lexicon_offset[word];
 898         while (*w < 128) {
 899             if (i >= buflen)
 900                 return 0; /* buffer overflow */
 901             buffer[i++] = *w++;
 902         }
 903         if (i >= buflen)
 904             return 0; /* buffer overflow */
 905         buffer[i++] = *w & 127;
 906         if (*w == 128)
 907             break; /* end of word */
 908     }
 909
 910     return 1;
 911 }
 912
 913 static int
 914 _cmpname(PyObject *self, int code, const char* name, int namelen)
 915 {
 916     /* check if code corresponds to the given name */
 917     int i;
 918     char buffer[NAME_MAXLEN];
 919     if (!_getucname(self, code, buffer, sizeof(buffer)))
 920         return 0;
 921     for (i = 0; i < namelen; i++) {
 922         if (toupper(Py_CHARMASK(name[i])) != buffer[i])
 923             return 0;
 924     }
 925     return buffer[namelen] == '\0';
 926 }
 927
 928 static void
 929 find_syllable(const char *str, int *len, int *pos, int count, int column)
 930 {
 931     int i, len1;
 932     *len = -1;
 933     for (i = 0; i < count; i++) {
 934         char *s = hangul_syllables[i][column];
 935         len1 = strlen(s);
 936         if (len1 <= *len)
 937             continue;
 938         if (strncmp(str, s, len1) == 0) {
 939             *len = len1;
 940             *pos = i;
 941         }
 942     }
 943     if (*len == -1) {
 944         *len = 0;
 945     }
 946 }
 947
 948 static int
 949 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
 950 {
 951     unsigned int h, v;
 952     unsigned int mask = code_size-1;
 953     unsigned int i, incr;
 954
 955     /* Check for hangul syllables. */
 956     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
 957         int len, L = -1, V = -1, T = -1;
 958         const char *pos = name + 16;
 959         find_syllable(pos, &len, &L, LCount, 0);
 960         pos += len;
 961         find_syllable(pos, &len, &V, VCount, 1);
 962         pos += len;
 963         find_syllable(pos, &len, &T, TCount, 2);
 964         pos += len;
 965         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
 966             *code = SBase + (L*VCount+V)*TCount + T;
 967             return 1;
 968         }
 969         /* Otherwise, it's an illegal syllable name. */
 970         return 0;
 971     }
 972
 973     /* Check for unified ideographs. */
 974     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
 975         /* Four or five hexdigits must follow. */
 976         v = 0;
 977         name += 22;
 978         namelen -= 22;
 979         if (namelen != 4 && namelen != 5)
 980             return 0;
 981         while (namelen--) {
 982             v *= 16;
 983             if (*name >= '0' && *name <= '9')
 984                 v += *name - '0';
 985             else if (*name >= 'A' && *name <= 'F')
 986                 v += *name - 'A' + 10;
 987             else
 988                 return 0;
 989             name++;
 990         }
 991         if (!is_unified_ideograph(v))
 992             return 0;
 993         *code = v;
 994         return 1;
 995     }
 996
 997     /* the following is the same as python's dictionary lookup, with
 998        only minor changes.  see the makeunicodedata script for more
 999        details */
1000
1001     h = (unsigned int) _gethash(name, namelen, code_magic);
1002     i = (~h) & mask;
1003     v = code_hash[i];
1004     if (!v)
1005         return 0;
1006     if (_cmpname(self, v, name, namelen)) {
1007         *code = v;
1008         return 1;
1009     }
1010     incr = (h ^ (h >> 3)) & mask;
1011     if (!incr)
1012         incr = mask;
1013     for (;;) {
1014         i = (i + incr) & mask;
1015         v = code_hash[i];
1016         if (!v)
1017             return 0;
1018         if (_cmpname(self, v, name, namelen)) {
1019             *code = v;
1020             return 1;
1021         }
1022         incr = incr << 1;
1023         if (incr > mask)
1024             incr = incr ^ code_poly;
1025     }
1026 }
1027
/* Table of the name-lookup entry points of this module.  The module init
   function wraps a pointer to it in a PyCObject and publishes that as
   the module attribute "ucnhash_CAPI". */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),   /* size of this table */
    _getucname,                     /* code point -> character name */
    _getcode                        /* character name -> code point */
};
1034
1035 /* -------------------------------------------------------------------- */
1036 /* Python bindings */
1037
/* Docstring of name(); attached to the "name" entry of the
   unicodedata_functions method table. */
PyDoc_STRVAR(unicodedata_name__doc__,
"name(unichr[, default])\n\
Returns the name assigned to the Unicode character unichr as a\n\
string. If no name is defined, default is returned, or, if not\n\
given, ValueError is raised.");
1043
1044 static PyObject *
1045 unicodedata_name(PyObject* self, PyObject* args)
1046 {
1047     char name[NAME_MAXLEN];
1048     Py_UCS4 c;
1049
1050     PyUnicodeObject* v;
1051     PyObject* defobj = NULL;
1052     if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1053         return NULL;
1054
1055     c = getuchar(v);
1056     if (c == (Py_UCS4)-1)
1057         return NULL;
1058
1059     if (!_getucname(self, c, name, sizeof(name))) {
1060         if (defobj == NULL) {
1061             PyErr_SetString(PyExc_ValueError, "no such name");
1062             return NULL;
1063         }
1064         else {
1065             Py_INCREF(defobj);
1066             return defobj;
1067         }
1068     }
1069
1070     return Py_BuildValue("s", name);
1071 }
1072
/* Docstring of lookup(); attached to the "lookup" entry of the
   unicodedata_functions method table. */
PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name.  If a character with the\n\
given name is found, return the corresponding Unicode\n\
character.  If not found, KeyError is raised.");
1079
1080 static PyObject *
1081 unicodedata_lookup(PyObject* self, PyObject* args)
1082 {
1083     Py_UCS4 code;
1084     Py_UNICODE str[2];
1085
1086     char* name;
1087     int namelen;
1088     if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1089         return NULL;
1090
1091     if (!_getcode(self, name, namelen, &code)) {
1092         PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1093                      name);
1094         return NULL;
1095     }
1096
1097 #ifndef Py_UNICODE_WIDE
1098     if (code >= 0x10000) {
1099         str[0] = 0xd800 + ((code - 0x10000) >> 10);
1100         str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1101         return PyUnicode_FromUnicode(str, 2);
1102     }
1103 #endif
1104     str[0] = (Py_UNICODE) code;
1105     return PyUnicode_FromUnicode(str, 1);
1106 }
1107
/* Method table.  It is passed to Py_InitModule3() as the list of module
   functions and is also installed as tp_methods of UCD_Type.  Each entry
   is {Python name, C function, calling convention, docstring}. */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};
1132
/* Type object published by the module init function as "unicodedata.UCD".
   Instances are sized as PreviousDBVersion, expose the functions of
   unicodedata_functions as methods and the fields described by DB_members
   as members.  No tp_new is provided. */
static PyTypeObject UCD_Type = {
        /* The ob_type field must be initialized in the module init function
         * to be portable to Windows without using C++. */
        PyVarObject_HEAD_INIT(NULL, 0)
        "unicodedata.UCD",              /*tp_name*/
        sizeof(PreviousDBVersion),      /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        (destructor)PyObject_Del, /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_compare*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        PyObject_GenericGetAttr,/*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        unicodedata_functions,  /*tp_methods*/
        DB_members,             /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
1178
/* Module-level docstring; handed to Py_InitModule3() in the module init
   function. */
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
5.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 5.1.0 (see\n\
http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
1188
1189 PyMODINIT_FUNC
1190 initunicodedata(void)
1191 {
1192     PyObject *m, *v;
1193
1194     Py_TYPE(&UCD_Type) = &PyType_Type;
1195
1196     m = Py_InitModule3(
1197         "unicodedata", unicodedata_functions, unicodedata_docstring);
1198     if (!m)
1199         return;
1200
1201     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1202     Py_INCREF(&UCD_Type);
1203     PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1204
1205     /* Previous versions */
1206     v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1207     if (v != NULL)
1208         PyModule_AddObject(m, "ucd_3_2_0", v);
1209
1210     /* Export C API */
1211     v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
1212     if (v != NULL)
1213         PyModule_AddObject(m, "ucnhash_CAPI", v);
1214 }
1215
1216 /*
1217 Local variables:
1218 c-basic-offset: 4
1219 indent-tabs-mode: nil
1220 End:
1221 */