Modules/unicodedata.c

   1 /* ------------------------------------------------------------------------
   2
   3    unicodedata -- Provides access to the Unicode 5.1 data base.
   4
   5    Data was extracted from the Unicode 5.1 UnicodeData.txt file.
   6
   7    Written by Marc-Andre Lemburg (mal@lemburg.com).
   8    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   9    Modified by Martin v. Löwis (martin@v.loewis.de)
  10
  11    Copyright (c) Corporation for National Research Initiatives.
  12
  13    ------------------------------------------------------------------------ */
  14
  15 #include "Python.h"
  16 #include "ucnhash.h"
  17 #include "structmember.h"
  18
  19 /* character properties */
  20
  21 typedef struct {
  22     const unsigned char category;       /* index into
  23                                            _PyUnicode_CategoryNames */
  24     const unsigned char combining;      /* combining class value 0 - 255 */
  25     const unsigned char bidirectional;  /* index into
  26                                            _PyUnicode_BidirectionalNames */
  27     const unsigned char mirrored;       /* true if mirrored in bidir mode */
  28     const unsigned char east_asian_width;       /* index into
  29                                                    _PyUnicode_EastAsianWidth */
  30     const unsigned char normalization_quick_check; /* see is_normalized() */
  31 } _PyUnicode_DatabaseRecord;
  32
  33 typedef struct change_record {
  34     /* sequence of fields should be the same as in merge_old_version */
  35     const unsigned char bidir_changed;
  36     const unsigned char category_changed;
  37     const unsigned char decimal_changed;
  38     const unsigned char mirrored_changed;
  39     const int numeric_changed;
  40 } change_record;
  41
  42 /* data file generated by Tools/unicode/makeunicodedata.py */
  43 #include "unicodedata_db.h"
  44
  45 static const _PyUnicode_DatabaseRecord*
  46 _getrecord_ex(Py_UCS4 code)
  47 {
  48     int index;
  49     if (code >= 0x110000)
  50         index = 0;
  51     else {
  52         index = index1[(code>>SHIFT)];
  53         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
  54     }
  55
  56     return &_PyUnicode_Database_Records[index];
  57 }
  58
  59 /* ------------- Previous-version API ------------------------------------- */
  60 typedef struct previous_version {
  61     PyObject_HEAD
  62     const char *name;
  63     const change_record* (*getrecord)(Py_UCS4);
  64     Py_UCS4 (*normalization)(Py_UCS4);
  65 } PreviousDBVersion;
  66
  67 #define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))
  68
  69 static PyMemberDef DB_members[] = {
  70         {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
  71         {NULL}
  72 };
  73
  74 /* forward declaration */
  75 static PyTypeObject UCD_Type;
  76
  77 static PyObject*
  78 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
  79                      Py_UCS4 (*normalization)(Py_UCS4))
  80 {
  81         PreviousDBVersion *self;
  82         self = PyObject_New(PreviousDBVersion, &UCD_Type);
  83         if (self == NULL)
  84                 return NULL;
  85         self->name = name;
  86         self->getrecord = getrecord;
  87         self->normalization = normalization;
  88         return (PyObject*)self;
  89 }
  90
  91
  92 static Py_UCS4 getuchar(PyUnicodeObject *obj)
  93 {
  94     Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
  95
  96     if (PyUnicode_GET_SIZE(obj) == 1)
  97         return *v;
  98 #ifndef Py_UNICODE_WIDE
  99     else if ((PyUnicode_GET_SIZE(obj) == 2) &&
 100              (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
 101              (0xDC00 <= v[1] && v[1] <= 0xDFFF))
 102         return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
 103 #endif
 104     PyErr_SetString(PyExc_TypeError,
 105                     "need a single Unicode character as parameter");
 106     return (Py_UCS4)-1;
 107 }
 108
 109 /* --- Module API --------------------------------------------------------- */
 110
 111 PyDoc_STRVAR(unicodedata_decimal__doc__,
 112 "decimal(unichr[, default])\n\
 113 \n\
 114 Returns the decimal value assigned to the Unicode character unichr\n\
 115 as integer. If no such value is defined, default is returned, or, if\n\
 116 not given, ValueError is raised.");
 117
 118 static PyObject *
 119 unicodedata_decimal(PyObject *self, PyObject *args)
 120 {
 121     PyUnicodeObject *v;
 122     PyObject *defobj = NULL;
 123     int have_old = 0;
 124     long rc;
 125     Py_UCS4 c;
 126
 127     if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
 128         return NULL;
 129     c = getuchar(v);
 130     if (c == (Py_UCS4)-1)
 131         return NULL;
 132
 133     if (self) {
 134         const change_record *old = get_old_record(self, c);
 135         if (old->category_changed == 0) {
 136             /* unassigned */
 137             have_old = 1;
 138             rc = -1;
 139         }
 140         else if (old->decimal_changed != 0xFF) {
 141             have_old = 1;
 142             rc = old->decimal_changed;
 143         }
 144     }
 145
 146     if (!have_old)
 147         rc = Py_UNICODE_TODECIMAL(c);
 148     if (rc < 0) {
 149         if (defobj == NULL) {
 150             PyErr_SetString(PyExc_ValueError,
 151                             "not a decimal");
 152             return NULL;
 153         }
 154         else {
 155             Py_INCREF(defobj);
 156             return defobj;
 157         }
 158     }
 159     return PyInt_FromLong(rc);
 160 }
 161
 162 PyDoc_STRVAR(unicodedata_digit__doc__,
 163 "digit(unichr[, default])\n\
 164 \n\
 165 Returns the digit value assigned to the Unicode character unichr as\n\
 166 integer. If no such value is defined, default is returned, or, if\n\
 167 not given, ValueError is raised.");
 168
 169 static PyObject *
 170 unicodedata_digit(PyObject *self, PyObject *args)
 171 {
 172     PyUnicodeObject *v;
 173     PyObject *defobj = NULL;
 174     long rc;
 175     Py_UCS4 c;
 176
 177     if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
 178         return NULL;
 179     c = getuchar(v);
 180     if (c == (Py_UCS4)-1)
 181         return NULL;
 182     rc = Py_UNICODE_TODIGIT(c);
 183     if (rc < 0) {
 184         if (defobj == NULL) {
 185             PyErr_SetString(PyExc_ValueError, "not a digit");
 186             return NULL;
 187         }
 188         else {
 189             Py_INCREF(defobj);
 190             return defobj;
 191         }
 192     }
 193     return PyInt_FromLong(rc);
 194 }
 195
 196 PyDoc_STRVAR(unicodedata_numeric__doc__,
 197 "numeric(unichr[, default])\n\
 198 \n\
 199 Returns the numeric value assigned to the Unicode character unichr\n\
 200 as float. If no such value is defined, default is returned, or, if\n\
 201 not given, ValueError is raised.");
 202
 203 static PyObject *
 204 unicodedata_numeric(PyObject *self, PyObject *args)
 205 {
 206     PyUnicodeObject *v;
 207     PyObject *defobj = NULL;
 208     int have_old = 0;
 209     double rc;
 210     Py_UCS4 c;
 211
 212     if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
 213         return NULL;
 214     c = getuchar(v);
 215     if (c == (Py_UCS4)-1)
 216         return NULL;
 217
 218     if (self) {
 219         const change_record *old = get_old_record(self, c);
 220         if (old->category_changed == 0) {
 221             /* unassigned */
 222             have_old = 1;
 223             rc = -1.0;
 224         }
 225         else if (old->decimal_changed != 0xFF) {
 226             have_old = 1;
 227             rc = old->decimal_changed;
 228         }
 229     }
 230
 231     if (!have_old)
 232         rc = Py_UNICODE_TONUMERIC(c);
 233     if (rc == -1.0) {
 234         if (defobj == NULL) {
 235             PyErr_SetString(PyExc_ValueError, "not a numeric character");
 236             return NULL;
 237         }
 238         else {
 239             Py_INCREF(defobj);
 240             return defobj;
 241         }
 242     }
 243     return PyFloat_FromDouble(rc);
 244 }
 245
 246 PyDoc_STRVAR(unicodedata_category__doc__,
 247 "category(unichr)\n\
 248 \n\
 249 Returns the general category assigned to the Unicode character\n\
 250 unichr as string.");
 251
 252 static PyObject *
 253 unicodedata_category(PyObject *self, PyObject *args)
 254 {
 255     PyUnicodeObject *v;
 256     int index;
 257     Py_UCS4 c;
 258
 259     if (!PyArg_ParseTuple(args, "O!:category",
 260                           &PyUnicode_Type, &v))
 261         return NULL;
 262     c = getuchar(v);
 263     if (c == (Py_UCS4)-1)
 264         return NULL;
 265     index = (int) _getrecord_ex(c)->category;
 266     if (self) {
 267         const change_record *old = get_old_record(self, c);
 268         if (old->category_changed != 0xFF)
 269             index = old->category_changed;
 270     }
 271     return PyString_FromString(_PyUnicode_CategoryNames[index]);
 272 }
 273
 274 PyDoc_STRVAR(unicodedata_bidirectional__doc__,
 275 "bidirectional(unichr)\n\
 276 \n\
 277 Returns the bidirectional category assigned to the Unicode character\n\
 278 unichr as string. If no such value is defined, an empty string is\n\
 279 returned.");
 280
 281 static PyObject *
 282 unicodedata_bidirectional(PyObject *self, PyObject *args)
 283 {
 284     PyUnicodeObject *v;
 285     int index;
 286     Py_UCS4 c;
 287
 288     if (!PyArg_ParseTuple(args, "O!:bidirectional",
 289                           &PyUnicode_Type, &v))
 290         return NULL;
 291     c = getuchar(v);
 292     if (c == (Py_UCS4)-1)
 293         return NULL;
 294     index = (int) _getrecord_ex(c)->bidirectional;
 295     if (self) {
 296         const change_record *old = get_old_record(self, c);
 297         if (old->category_changed == 0)
 298             index = 0; /* unassigned */
 299         else if (old->bidir_changed != 0xFF)
 300             index = old->bidir_changed;
 301     }
 302     return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
 303 }
 304
 305 PyDoc_STRVAR(unicodedata_combining__doc__,
 306 "combining(unichr)\n\
 307 \n\
 308 Returns the canonical combining class assigned to the Unicode\n\
 309 character unichr as integer. Returns 0 if no combining class is\n\
 310 defined.");
 311
 312 static PyObject *
 313 unicodedata_combining(PyObject *self, PyObject *args)
 314 {
 315     PyUnicodeObject *v;
 316     int index;
 317     Py_UCS4 c;
 318
 319     if (!PyArg_ParseTuple(args, "O!:combining",
 320                           &PyUnicode_Type, &v))
 321         return NULL;
 322     c = getuchar(v);
 323     if (c == (Py_UCS4)-1)
 324         return NULL;
 325     index = (int) _getrecord_ex(c)->combining;
 326     if (self) {
 327         const change_record *old = get_old_record(self, c);
 328         if (old->category_changed == 0)
 329             index = 0; /* unassigned */
 330     }
 331     return PyInt_FromLong(index);
 332 }
 333
 334 PyDoc_STRVAR(unicodedata_mirrored__doc__,
 335 "mirrored(unichr)\n\
 336 \n\
 337 Returns the mirrored property assigned to the Unicode character\n\
 338 unichr as integer. Returns 1 if the character has been identified as\n\
 339 a \"mirrored\" character in bidirectional text, 0 otherwise.");
 340
 341 static PyObject *
 342 unicodedata_mirrored(PyObject *self, PyObject *args)
 343 {
 344     PyUnicodeObject *v;
 345     int index;
 346     Py_UCS4 c;
 347
 348     if (!PyArg_ParseTuple(args, "O!:mirrored",
 349                           &PyUnicode_Type, &v))
 350         return NULL;
 351     c = getuchar(v);
 352     if (c == (Py_UCS4)-1)
 353         return NULL;
 354     index = (int) _getrecord_ex(c)->mirrored;
 355     if (self) {
 356         const change_record *old = get_old_record(self, c);
 357         if (old->category_changed == 0)
 358             index = 0; /* unassigned */
 359         else if (old->mirrored_changed != 0xFF)
 360             index = old->mirrored_changed;
 361     }
 362     return PyInt_FromLong(index);
 363 }
 364
 365 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
 366 "east_asian_width(unichr)\n\
 367 \n\
 368 Returns the east asian width assigned to the Unicode character\n\
 369 unichr as string.");
 370
 371 static PyObject *
 372 unicodedata_east_asian_width(PyObject *self, PyObject *args)
 373 {
 374     PyUnicodeObject *v;
 375     int index;
 376     Py_UCS4 c;
 377
 378     if (!PyArg_ParseTuple(args, "O!:east_asian_width",
 379                           &PyUnicode_Type, &v))
 380         return NULL;
 381     c = getuchar(v);
 382     if (c == (Py_UCS4)-1)
 383         return NULL;
 384     index = (int) _getrecord_ex(c)->east_asian_width;
 385     if (self) {
 386         const change_record *old = get_old_record(self, c);
 387         if (old->category_changed == 0)
 388             index = 0; /* unassigned */
 389     }
 390     return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
 391 }
 392
 393 PyDoc_STRVAR(unicodedata_decomposition__doc__,
 394 "decomposition(unichr)\n\
 395 \n\
 396 Returns the character decomposition mapping assigned to the Unicode\n\
 397 character unichr as string. An empty string is returned in case no\n\
 398 such mapping is defined.");
 399
 400 static PyObject *
 401 unicodedata_decomposition(PyObject *self, PyObject *args)
 402 {
 403     PyUnicodeObject *v;
 404     char decomp[256];
 405     int code, index, count, i;
 406     unsigned int prefix_index;
 407     Py_UCS4 c;
 408
 409     if (!PyArg_ParseTuple(args, "O!:decomposition",
 410                           &PyUnicode_Type, &v))
 411         return NULL;
 412     c = getuchar(v);
 413     if (c == (Py_UCS4)-1)
 414         return NULL;
 415
 416     code = (int)c;
 417
 418     if (self) {
 419         const change_record *old = get_old_record(self, c);
 420         if (old->category_changed == 0)
 421             return PyString_FromString(""); /* unassigned */
 422     }
 423
 424     if (code < 0 || code >= 0x110000)
 425         index = 0;
 426     else {
 427         index = decomp_index1[(code>>DECOMP_SHIFT)];
 428         index = decomp_index2[(index<<DECOMP_SHIFT)+
 429                              (code&((1<<DECOMP_SHIFT)-1))];
 430     }
 431
 432     /* high byte is number of hex bytes (usually one or two), low byte
 433        is prefix code (from*/
 434     count = decomp_data[index] >> 8;
 435
 436     /* XXX: could allocate the PyString up front instead
 437        (strlen(prefix) + 5 * count + 1 bytes) */
 438
 439     /* Based on how index is calculated above and decomp_data is generated
 440        from Tools/unicode/makeunicodedata.py, it should not be possible
 441        to overflow decomp_prefix. */
 442     prefix_index = decomp_data[index] & 255;
 443     assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
 444
 445     /* copy prefix */
 446     i = strlen(decomp_prefix[prefix_index]);
 447     memcpy(decomp, decomp_prefix[prefix_index], i);
 448
 449     while (count-- > 0) {
 450         if (i)
 451             decomp[i++] = ' ';
 452         assert((size_t)i < sizeof(decomp));
 453         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
 454                       decomp_data[++index]);
 455         i += strlen(decomp + i);
 456     }
 457
 458     decomp[i] = '\0';
 459
 460     return PyString_FromString(decomp);
 461 }
 462
 463 static void
 464 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
 465 {
 466     if (code >= 0x110000) {
 467         *index = 0;
 468     } else if (self && get_old_record(self, code)->category_changed==0) {
 469         /* unassigned in old version */
 470         *index = 0;
 471     }
 472     else {
 473         *index = decomp_index1[(code>>DECOMP_SHIFT)];
 474         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
 475                                (code&((1<<DECOMP_SHIFT)-1))];
 476     }
 477
 478     /* high byte is number of hex bytes (usually one or two), low byte
 479        is prefix code (from*/
 480     *count = decomp_data[*index] >> 8;
 481     *prefix = decomp_data[*index] & 255;
 482
 483     (*index)++;
 484 }
 485
 486 #define SBase   0xAC00
 487 #define LBase   0x1100
 488 #define VBase   0x1161
 489 #define TBase   0x11A7
 490 #define LCount  19
 491 #define VCount  21
 492 #define TCount  28
 493 #define NCount  (VCount*TCount)
 494 #define SCount  (LCount*NCount)
 495
 496 static PyObject*
 497 nfd_nfkd(PyObject *self, PyObject *input, int k)
 498 {
 499     PyObject *result;
 500     Py_UNICODE *i, *end, *o;
 501     /* Longest decomposition in Unicode 3.2: U+FDFA */
 502     Py_UNICODE stack[20];
 503     Py_ssize_t space, isize;
 504     int index, prefix, count, stackptr;
 505     unsigned char prev, cur;
 506
 507     stackptr = 0;
 508     isize = PyUnicode_GET_SIZE(input);
 509     /* Overallocate atmost 10 characters. */
 510     space = (isize > 10 ? 10 : isize) + isize;
 511     result = PyUnicode_FromUnicode(NULL, space);
 512     if (!result)
 513         return NULL;
 514     i = PyUnicode_AS_UNICODE(input);
 515     end = i + isize;
 516     o = PyUnicode_AS_UNICODE(result);
 517
 518     while (i < end) {
 519         stack[stackptr++] = *i++;
 520         while(stackptr) {
 521             Py_UNICODE code = stack[--stackptr];
 522             /* Hangul Decomposition adds three characters in
 523                a single step, so we need atleast that much room. */
 524             if (space < 3) {
 525                 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
 526                 space += 10;
 527                 if (PyUnicode_Resize(&result, newsize) == -1)
 528                     return NULL;
 529                 o = PyUnicode_AS_UNICODE(result) + newsize - space;
 530             }
 531             /* Hangul Decomposition. */
 532             if (SBase <= code && code < (SBase+SCount)) {
 533                 int SIndex = code - SBase;
 534                 int L = LBase + SIndex / NCount;
 535                 int V = VBase + (SIndex % NCount) / TCount;
 536                 int T = TBase + SIndex % TCount;
 537                 *o++ = L;
 538                 *o++ = V;
 539                 space -= 2;
 540                 if (T != TBase) {
 541                     *o++ = T;
 542                     space --;
 543                 }
 544                 continue;
 545             }
 546             /* normalization changes */
 547             if (self) {
 548                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
 549                 if (value != 0) {
 550                     stack[stackptr++] = value;
 551                     continue;
 552                 }
 553             }
 554
 555             /* Other decompositions. */
 556             get_decomp_record(self, code, &index, &prefix, &count);
 557
 558             /* Copy character if it is not decomposable, or has a
 559                compatibility decomposition, but we do NFD. */
 560             if (!count || (prefix && !k)) {
 561                 *o++ = code;
 562                 space--;
 563                 continue;
 564             }
 565             /* Copy decomposition onto the stack, in reverse
 566                order.  */
 567             while(count) {
 568                 code = decomp_data[index + (--count)];
 569                 stack[stackptr++] = code;
 570             }
 571         }
 572     }
 573
 574     /* Drop overallocation. Cannot fail. */
 575     PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
 576
 577     /* Sort canonically. */
 578     i = PyUnicode_AS_UNICODE(result);
 579     prev = _getrecord_ex(*i)->combining;
 580     end = i + PyUnicode_GET_SIZE(result);
 581     for (i++; i < end; i++) {
 582         cur = _getrecord_ex(*i)->combining;
 583         if (prev == 0 || cur == 0 || prev <= cur) {
 584             prev = cur;
 585             continue;
 586         }
 587         /* Non-canonical order. Need to switch *i with previous. */
 588         o = i - 1;
 589         while (1) {
 590             Py_UNICODE tmp = o[1];
 591             o[1] = o[0];
 592             o[0] = tmp;
 593             o--;
 594             if (o < PyUnicode_AS_UNICODE(result))
 595                 break;
 596             prev = _getrecord_ex(*o)->combining;
 597             if (prev == 0 || prev <= cur)
 598                 break;
 599         }
 600         prev = _getrecord_ex(*i)->combining;
 601     }
 602     return result;
 603 }
 604
 605 static int
 606 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
 607 {
 608     int index;
 609     for (index = 0; nfc[index].start; index++) {
 610         int start = nfc[index].start;
 611         if (code < start)
 612             return -1;
 613         if (code <= start + nfc[index].count) {
 614             int delta = code - start;
 615             return nfc[index].index + delta;
 616         }
 617     }
 618     return -1;
 619 }
 620
 621 static PyObject*
 622 nfc_nfkc(PyObject *self, PyObject *input, int k)
 623 {
 624     PyObject *result;
 625     Py_UNICODE *i, *i1, *o, *end;
 626     int f,l,index,index1,comb;
 627     Py_UNICODE code;
 628     Py_UNICODE *skipped[20];
 629     int cskipped = 0;
 630
 631     result = nfd_nfkd(self, input, k);
 632     if (!result)
 633         return NULL;
 634
 635     /* We are going to modify result in-place.
 636        If nfd_nfkd is changed to sometimes return the input,
 637        this code needs to be reviewed. */
 638     assert(result != input);
 639
 640     i = PyUnicode_AS_UNICODE(result);
 641     end = i + PyUnicode_GET_SIZE(result);
 642     o = PyUnicode_AS_UNICODE(result);
 643
 644   again:
 645     while (i < end) {
 646       for (index = 0; index < cskipped; index++) {
 647           if (skipped[index] == i) {
 648               /* *i character is skipped.
 649                  Remove from list. */
 650               skipped[index] = skipped[cskipped-1];
 651               cskipped--;
 652               i++;
 653               goto again; /* continue while */
 654           }
 655       }
 656       /* Hangul Composition. We don't need to check for <LV,T>
 657          pairs, since we always have decomposed data. */
 658       if (LBase <= *i && *i < (LBase+LCount) &&
 659           i + 1 < end &&
 660           VBase <= i[1] && i[1] <= (VBase+VCount)) {
 661           int LIndex, VIndex;
 662           LIndex = i[0] - LBase;
 663           VIndex = i[1] - VBase;
 664           code = SBase + (LIndex*VCount+VIndex)*TCount;
 665           i+=2;
 666           if (i < end &&
 667               TBase <= *i && *i <= (TBase+TCount)) {
 668               code += *i-TBase;
 669               i++;
 670           }
 671           *o++ = code;
 672           continue;
 673       }
 674
 675       f = find_nfc_index(self, nfc_first, *i);
 676       if (f == -1) {
 677           *o++ = *i++;
 678           continue;
 679       }
 680       /* Find next unblocked character. */
 681       i1 = i+1;
 682       comb = 0;
 683       while (i1 < end) {
 684           int comb1 = _getrecord_ex(*i1)->combining;
 685           if (comb1 && comb == comb1) {
 686               /* Character is blocked. */
 687               i1++;
 688               continue;
 689           }
 690           l = find_nfc_index(self, nfc_last, *i1);
 691           /* *i1 cannot be combined with *i. If *i1
 692              is a starter, we don't need to look further.
 693              Otherwise, record the combining class. */
 694           if (l == -1) {
 695             not_combinable:
 696               if (comb1 == 0)
 697                   break;
 698               comb = comb1;
 699               i1++;
 700               continue;
 701           }
 702           index = f*TOTAL_LAST + l;
 703           index1 = comp_index[index >> COMP_SHIFT];
 704           code = comp_data[(index1<<COMP_SHIFT)+
 705                            (index&((1<<COMP_SHIFT)-1))];
 706           if (code == 0)
 707               goto not_combinable;
 708
 709           /* Replace the original character. */
 710           *i = code;
 711           /* Mark the second character unused. */
 712           skipped[cskipped++] = i1;
 713           i1++;
 714           f = find_nfc_index(self, nfc_first, *i);
 715           if (f == -1)
 716               break;
 717       }
 718       *o++ = *i++;
 719     }
 720     if (o != end)
 721         PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
 722     return result;
 723 }
 724
 725 /* Return 1 if the input is certainly normalized, 0 if it might not be. */
 726 static int
 727 is_normalized(PyObject *self, PyObject *input, int nfc, int k)
 728 {
 729     Py_UNICODE *i, *end;
 730     unsigned char prev_combining = 0, quickcheck_mask;
 731
 732     /* An older version of the database is requested, quickchecks must be
 733        disabled. */
 734     if (self != NULL)
 735         return 0;
 736
 737     /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
 738        as described in http://unicode.org/reports/tr15/#Annex8. */
 739     quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
 740
 741     i = PyUnicode_AS_UNICODE(input);
 742     end = i + PyUnicode_GET_SIZE(input);
 743     while (i < end) {
 744         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
 745         unsigned char combining = record->combining;
 746         unsigned char quickcheck = record->normalization_quick_check;
 747
 748         if (quickcheck & quickcheck_mask)
 749             return 0; /* this string might need normalization */
 750         if (combining && prev_combining > combining)
 751             return 0; /* non-canonical sort order, not normalized */
 752         prev_combining = combining;
 753     }
 754     return 1; /* certainly normalized */
 755 }
 756
 757 PyDoc_STRVAR(unicodedata_normalize__doc__,
 758 "normalize(form, unistr)\n\
 759 \n\
 760 Return the normal form 'form' for the Unicode string unistr.  Valid\n\
 761 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
 762
 763 static PyObject*
 764 unicodedata_normalize(PyObject *self, PyObject *args)
 765 {
 766     char *form;
 767     PyObject *input;
 768
 769     if(!PyArg_ParseTuple(args, "sO!:normalize",
 770                          &form, &PyUnicode_Type, &input))
 771         return NULL;
 772
 773     if (PyUnicode_GetSize(input) == 0) {
 774         /* Special case empty input strings, since resizing
 775            them  later would cause internal errors. */
 776         Py_INCREF(input);
 777         return input;
 778     }
 779
 780     if (strcmp(form, "NFC") == 0) {
 781         if (is_normalized(self, input, 1, 0)) {
 782             Py_INCREF(input);
 783             return input;
 784         }
 785         return nfc_nfkc(self, input, 0);
 786     }
 787     if (strcmp(form, "NFKC") == 0) {
 788         if (is_normalized(self, input, 1, 1)) {
 789             Py_INCREF(input);
 790             return input;
 791         }
 792         return nfc_nfkc(self, input, 1);
 793     }
 794     if (strcmp(form, "NFD") == 0) {
 795         if (is_normalized(self, input, 0, 0)) {
 796             Py_INCREF(input);
 797             return input;
 798         }
 799         return nfd_nfkd(self, input, 0);
 800     }
 801     if (strcmp(form, "NFKD") == 0) {
 802         if (is_normalized(self, input, 0, 1)) {
 803             Py_INCREF(input);
 804             return input;
 805         }
 806         return nfd_nfkd(self, input, 1);
 807     }
 808     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
 809     return NULL;
 810 }
 811
 812 /* -------------------------------------------------------------------- */
 813 /* unicode character name tables */
 814
 815 /* data file generated by Tools/unicode/makeunicodedata.py */
 816 #include "unicodename_db.h"
 817
 818 /* -------------------------------------------------------------------- */
 819 /* database code (cut and pasted from the unidb package) */
 820
 821 static unsigned long
 822 _gethash(const char *s, int len, int scale)
 823 {
 824     int i;
 825     unsigned long h = 0;
 826     unsigned long ix;
 827     for (i = 0; i < len; i++) {
 828         h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
 829         ix = h & 0xff000000;
 830         if (ix)
 831             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
 832     }
 833     return h;
 834 }
 835
 836 static char *hangul_syllables[][3] = {
 837     { "G",  "A",   ""   },
 838     { "GG", "AE",  "G"  },
 839     { "N",  "YA",  "GG" },
 840     { "D",  "YAE", "GS" },
 841     { "DD", "EO",  "N", },
 842     { "R",  "E",   "NJ" },
 843     { "M",  "YEO", "NH" },
 844     { "B",  "YE",  "D"  },
 845     { "BB", "O",   "L"  },
 846     { "S",  "WA",  "LG" },
 847     { "SS", "WAE", "LM" },
 848     { "",   "OE",  "LB" },
 849     { "J",  "YO",  "LS" },
 850     { "JJ", "U",   "LT" },
 851     { "C",  "WEO", "LP" },
 852     { "K",  "WE",  "LH" },
 853     { "T",  "WI",  "M"  },
 854     { "P",  "YU",  "B"  },
 855     { "H",  "EU",  "BS" },
 856     { 0,    "YI",  "S"  },
 857     { 0,    "I",   "SS" },
 858     { 0,    0,     "NG" },
 859     { 0,    0,     "J"  },
 860     { 0,    0,     "C"  },
 861     { 0,    0,     "K"  },
 862     { 0,    0,     "T"  },
 863     { 0,    0,     "P"  },
 864     { 0,    0,     "H"  }
 865 };
 866
 867 static int
 868 is_unified_ideograph(Py_UCS4 code)
 869 {
 870     return (
 871         (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
 872         (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
 873         (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
 874 }
 875
 876 static int
 877 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
 878 {
 879     int offset;
 880     int i;
 881     int word;
 882     unsigned char* w;
 883
 884     if (code >= 0x110000)
 885         return 0;
 886
 887     if (self) {
 888         const change_record *old = get_old_record(self, code);
 889         if (old->category_changed == 0) {
 890             /* unassigned */
 891             return 0;
 892         }
 893     }
 894
 895     if (SBase <= code && code < SBase+SCount) {
 896         /* Hangul syllable. */
 897         int SIndex = code - SBase;
 898         int L = SIndex / NCount;
 899         int V = (SIndex % NCount) / TCount;
 900         int T = SIndex % TCount;
 901
 902         if (buflen < 27)
 903             /* Worst case: HANGUL SYLLABLE <10chars>. */
 904             return 0;
 905         strcpy(buffer, "HANGUL SYLLABLE ");
 906         buffer += 16;
 907         strcpy(buffer, hangul_syllables[L][0]);
 908         buffer += strlen(hangul_syllables[L][0]);
 909         strcpy(buffer, hangul_syllables[V][1]);
 910         buffer += strlen(hangul_syllables[V][1]);
 911         strcpy(buffer, hangul_syllables[T][2]);
 912         buffer += strlen(hangul_syllables[T][2]);
 913         *buffer = '\0';
 914         return 1;
 915     }
 916
 917     if (is_unified_ideograph(code)) {
 918         if (buflen < 28)
 919             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
 920             return 0;
 921         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
 922         return 1;
 923     }
 924
 925     /* get offset into phrasebook */
 926     offset = phrasebook_offset1[(code>>phrasebook_shift)];
 927     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
 928                                (code&((1<<phrasebook_shift)-1))];
 929     if (!offset)
 930         return 0;
 931
 932     i = 0;
 933
 934     for (;;) {
 935         /* get word index */
 936         word = phrasebook[offset] - phrasebook_short;
 937         if (word >= 0) {
 938             word = (word << 8) + phrasebook[offset+1];
 939             offset += 2;
 940         } else
 941             word = phrasebook[offset++];
 942         if (i) {
 943             if (i > buflen)
 944                 return 0; /* buffer overflow */
 945             buffer[i++] = ' ';
 946         }
 947         /* copy word string from lexicon.  the last character in the
 948            word has bit 7 set.  the last word in a string ends with
 949            0x80 */
 950         w = lexicon + lexicon_offset[word];
 951         while (*w < 128) {
 952             if (i >= buflen)
 953                 return 0; /* buffer overflow */
 954             buffer[i++] = *w++;
 955         }
 956         if (i >= buflen)
 957             return 0; /* buffer overflow */
 958         buffer[i++] = *w & 127;
 959         if (*w == 128)
 960             break; /* end of word */
 961     }
 962
 963     return 1;
 964 }
 965
 966 static int
 967 _cmpname(PyObject *self, int code, const char* name, int namelen)
 968 {
 969     /* check if code corresponds to the given name */
 970     int i;
 971     char buffer[NAME_MAXLEN];
 972     if (!_getucname(self, code, buffer, sizeof(buffer)))
 973         return 0;
 974     for (i = 0; i < namelen; i++) {
 975         if (toupper(Py_CHARMASK(name[i])) != buffer[i])
 976             return 0;
 977     }
 978     return buffer[namelen] == '\0';
 979 }
 980
 981 static void
 982 find_syllable(const char *str, int *len, int *pos, int count, int column)
 983 {
 984     int i, len1;
 985     *len = -1;
 986     for (i = 0; i < count; i++) {
 987         char *s = hangul_syllables[i][column];
 988         len1 = strlen(s);
 989         if (len1 <= *len)
 990             continue;
 991         if (strncmp(str, s, len1) == 0) {
 992             *len = len1;
 993             *pos = i;
 994         }
 995     }
 996     if (*len == -1) {
 997         *len = 0;
 998     }
 999 }
1000
1001 static int
1002 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
1003 {
1004     unsigned int h, v;
1005     unsigned int mask = code_size-1;
1006     unsigned int i, incr;
1007
1008     /* Check for hangul syllables. */
1009     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1010         int len, L = -1, V = -1, T = -1;
1011         const char *pos = name + 16;
1012         find_syllable(pos, &len, &L, LCount, 0);
1013         pos += len;
1014         find_syllable(pos, &len, &V, VCount, 1);
1015         pos += len;
1016         find_syllable(pos, &len, &T, TCount, 2);
1017         pos += len;
1018         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1019             *code = SBase + (L*VCount+V)*TCount + T;
1020             return 1;
1021         }
1022         /* Otherwise, it's an illegal syllable name. */
1023         return 0;
1024     }
1025
1026     /* Check for unified ideographs. */
1027     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1028         /* Four or five hexdigits must follow. */
1029         v = 0;
1030         name += 22;
1031         namelen -= 22;
1032         if (namelen != 4 && namelen != 5)
1033             return 0;
1034         while (namelen--) {
1035             v *= 16;
1036             if (*name >= '0' && *name <= '9')
1037                 v += *name - '0';
1038             else if (*name >= 'A' && *name <= 'F')
1039                 v += *name - 'A' + 10;
1040             else
1041                 return 0;
1042             name++;
1043         }
1044         if (!is_unified_ideograph(v))
1045             return 0;
1046         *code = v;
1047         return 1;
1048     }
1049
1050     /* the following is the same as python's dictionary lookup, with
1051        only minor changes.  see the makeunicodedata script for more
1052        details */
1053
1054     h = (unsigned int) _gethash(name, namelen, code_magic);
1055     i = (~h) & mask;
1056     v = code_hash[i];
1057     if (!v)
1058         return 0;
1059     if (_cmpname(self, v, name, namelen)) {
1060         *code = v;
1061         return 1;
1062     }
1063     incr = (h ^ (h >> 3)) & mask;
1064     if (!incr)
1065         incr = mask;
1066     for (;;) {
1067         i = (i + incr) & mask;
1068         v = code_hash[i];
1069         if (!v)
1070             return 0;
1071         if (_cmpname(self, v, name, namelen)) {
1072             *code = v;
1073             return 1;
1074         }
1075         incr = incr << 1;
1076         if (incr > mask)
1077             incr = incr ^ code_poly;
1078     }
1079 }
1080
/* Table bundling the two name-lookup routines of this file.
   initunicodedata() wraps a pointer to it in a CObject and stores it
   in the module as "ucnhash_CAPI".  Entries, in order: the size of the
   struct (presumably so that users of the table can check its layout
   -- confirm against ucnhash.h), the code point -> name function and
   the name -> code point function. */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
1087
1088 /* -------------------------------------------------------------------- */
1089 /* Python bindings */
1090
1091 PyDoc_STRVAR(unicodedata_name__doc__,
1092 "name(unichr[, default])\n\
1093 Returns the name assigned to the Unicode character unichr as a\n\
1094 string. If no name is defined, default is returned, or, if not\n\
1095 given, ValueError is raised.");
1096
1097 static PyObject *
1098 unicodedata_name(PyObject* self, PyObject* args)
1099 {
1100     char name[NAME_MAXLEN];
1101     Py_UCS4 c;
1102
1103     PyUnicodeObject* v;
1104     PyObject* defobj = NULL;
1105     if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1106         return NULL;
1107
1108     c = getuchar(v);
1109     if (c == (Py_UCS4)-1)
1110         return NULL;
1111
1112     if (!_getucname(self, c, name, sizeof(name))) {
1113         if (defobj == NULL) {
1114             PyErr_SetString(PyExc_ValueError, "no such name");
1115             return NULL;
1116         }
1117         else {
1118             Py_INCREF(defobj);
1119             return defobj;
1120         }
1121     }
1122
1123     return Py_BuildValue("s", name);
1124 }
1125
1126 PyDoc_STRVAR(unicodedata_lookup__doc__,
1127 "lookup(name)\n\
1128 \n\
1129 Look up character by name.  If a character with the\n\
1130 given name is found, return the corresponding Unicode\n\
1131 character.  If not found, KeyError is raised.");
1132
1133 static PyObject *
1134 unicodedata_lookup(PyObject* self, PyObject* args)
1135 {
1136     Py_UCS4 code;
1137     Py_UNICODE str[2];
1138
1139     char* name;
1140     int namelen;
1141     if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1142         return NULL;
1143
1144     if (!_getcode(self, name, namelen, &code)) {
1145         PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1146                      name);
1147         return NULL;
1148     }
1149
1150 #ifndef Py_UNICODE_WIDE
1151     if (code >= 0x10000) {
1152         str[0] = 0xd800 + ((code - 0x10000) >> 10);
1153         str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1154         return PyUnicode_FromUnicode(str, 2);
1155     }
1156 #endif
1157     str[0] = (Py_UNICODE) code;
1158     return PyUnicode_FromUnicode(str, 1);
1159 }
1160
1161 /* XXX Add doc strings. */
1162
/* Method table of the module.  It is used twice: initunicodedata()
   passes it to Py_InitModule3() for the module-level functions, and
   UCD_Type lists it as tp_methods, so the same entries are also the
   methods of UCD objects.  Every entry takes positional arguments
   (METH_VARARGS) and has its own doc string. */
static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};
1185
/* Type object published by initunicodedata() as unicodedata.UCD.
   Instances have the size of a PreviousDBVersion struct and are
   released with PyObject_Del.  Attribute access goes through
   PyObject_GenericGetAttr, with the module's function table as
   tp_methods and DB_members as tp_members.  All other slots, including
   tp_doc, tp_init and tp_new, are left at 0. */
static PyTypeObject UCD_Type = {
        /* The ob_type field must be initialized in the module init function
         * to be portable to Windows without using C++. */
        PyVarObject_HEAD_INIT(NULL, 0)
        "unicodedata.UCD",              /*tp_name*/
        sizeof(PreviousDBVersion),      /*tp_basicsize*/
        0,                      /*tp_itemsize*/
        /* methods */
        (destructor)PyObject_Del, /*tp_dealloc*/
        0,                      /*tp_print*/
        0,                      /*tp_getattr*/
        0,                      /*tp_setattr*/
        0,                      /*tp_compare*/
        0,                      /*tp_repr*/
        0,                      /*tp_as_number*/
        0,                      /*tp_as_sequence*/
        0,                      /*tp_as_mapping*/
        0,                      /*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        PyObject_GenericGetAttr,/*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        unicodedata_functions,  /*tp_methods*/
        DB_members,             /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
1231
/* Doc string of the module itself; initunicodedata() hands it to
   Py_InitModule3(). */
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
5.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 5.1.0 (see\n\
http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
1241
1242 PyMODINIT_FUNC
1243 initunicodedata(void)
1244 {
1245     PyObject *m, *v;
1246
1247     Py_TYPE(&UCD_Type) = &PyType_Type;
1248
1249     m = Py_InitModule3(
1250         "unicodedata", unicodedata_functions, unicodedata_docstring);
1251     if (!m)
1252         return;
1253
1254     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1255     Py_INCREF(&UCD_Type);
1256     PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1257
1258     /* Previous versions */
1259     v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1260     if (v != NULL)
1261         PyModule_AddObject(m, "ucd_3_2_0", v);
1262
1263     /* Export C API */
1264     v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
1265     if (v != NULL)
1266         PyModule_AddObject(m, "ucnhash_CAPI", v);
1267 }
1268
1269 /*
1270 Local variables:
1271 c-basic-offset: 4
1272 indent-tabs-mode: nil
1273 End:
1274 */