Modules/unicodedata.c
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 3.2 data base.

   Data was extracted from the Unicode 3.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"
/* character properties */

typedef struct {
    const unsigned char category;           /* index into
                                               _PyUnicode_CategoryNames */
    const unsigned char combining;          /* combining class value 0 - 255 */
    const unsigned char bidirectional;      /* index into
                                               _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;           /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;   /* index into
                                               _PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord;
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const int numeric_changed;
} change_record;
/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"
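/* Each code point maps to a property record through a two-level table
   built by makeunicodedata.py: the high bits of the code point select an
   entry in index1, which picks the block of index2 that holds the record
   number for the low bits.  Code points outside the database fall back
   to record 0 (the "unassigned" record). */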
static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}
static const _PyUnicode_DatabaseRecord*
_getrecord(PyUnicodeObject* v)
{
    return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
}
/* ------------- Previous-version API ------------------------------------- */

typedef struct previous_version {
    PyObject_HEAD
    const char *name;
    const change_record* (*getrecord)(Py_UCS4);
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;

#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))
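/* A PreviousDBVersion object (for example the module's ucd_3_2_0,
   created in initunicodedata below) shares the module's method table;
   the methods check for a non-NULL self and, if present, consult
   getrecord/normalization so that changes made after that database
   version are undone. */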
static PyMemberDef DB_members[] = {
    {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};
/* forward declaration */
static PyTypeObject UCD_Type;
static PyObject*
new_previous_version(const char *name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *self;
    self = PyObject_New(PreviousDBVersion, &UCD_Type);
    if (self == NULL)
        return NULL;
    self->name = name;
    self->getrecord = getrecord;
    self->normalization = normalization;
    return (PyObject*)self;
}
/* --- Module API --------------------------------------------------------- */

PyDoc_STRVAR(unicodedata_decimal__doc__,
"decimal(unichr[, default])\n\
\n\
Returns the decimal value assigned to the Unicode character unichr\n\
as integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");
static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    long rc;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}
PyDoc_STRVAR(unicodedata_digit__doc__,
"digit(unichr[, default])\n\
\n\
Returns the digit value assigned to the Unicode character unichr as\n\
integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");
static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    long rc;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a digit");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}
PyDoc_STRVAR(unicodedata_numeric__doc__,
"numeric(unichr[, default])\n\
\n\
Returns the numeric value assigned to the Unicode character unichr\n\
as float. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");
static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    double rc;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}
PyDoc_STRVAR(unicodedata_category__doc__,
"category(unichr)\n\
\n\
Returns the general category assigned to the Unicode character\n\
unichr as string.");
static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:category",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->category;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed != 0xFF)
            index = old->category_changed;
    }
    return PyString_FromString(_PyUnicode_CategoryNames[index]);
}
PyDoc_STRVAR(unicodedata_bidirectional__doc__,
"bidirectional(unichr)\n\
\n\
Returns the bidirectional category assigned to the Unicode character\n\
unichr as string. If no such value is defined, an empty string is\n\
returned.");
static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->bidirectional;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            index = old->bidir_changed;
    }
    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
}
PyDoc_STRVAR(unicodedata_combining__doc__,
"combining(unichr)\n\
\n\
Returns the canonical combining class assigned to the Unicode\n\
character unichr as integer. Returns 0 if no combining class is\n\
defined.");
static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->combining;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}
PyDoc_STRVAR(unicodedata_mirrored__doc__,
"mirrored(unichr)\n\
\n\
Returns the mirrored property assigned to the Unicode character\n\
unichr as integer. Returns 1 if the character has been identified as\n\
a \"mirrored\" character in bidirectional text, 0 otherwise.");
static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->mirrored;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}
PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
"east_asian_width(unichr)\n\
\n\
Returns the east asian width assigned to the Unicode character\n\
unichr as string.");
static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->east_asian_width;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
}
PyDoc_STRVAR(unicodedata_decomposition__doc__,
"decomposition(unichr)\n\
\n\
Returns the character decomposition mapping assigned to the Unicode\n\
character unichr as string. An empty string is returned in case no\n\
such mapping is defined.");
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count, i;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    code = (int) *PyUnicode_AS_UNICODE(v);

    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            return PyString_FromString(""); /* unassigned */
    }

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                              (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is the number of decomposition code points (usually one
       or two), low byte is the prefix code (an index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* copy prefix */
    i = strlen(decomp_prefix[decomp_data[index] & 255]);
    memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);

    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert((size_t)i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);
}
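/* For reference, the strings produced above follow UnicodeData.txt: a
   canonical decomposition such as U+00C0 (LATIN CAPITAL LETTER A WITH
   GRAVE) comes out as "0041 0300", while compatibility decompositions
   keep their tag as a prefix, e.g. "<fraction> 0031 2044 0034" for
   U+00BC (VULGAR FRACTION ONE QUARTER). */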
void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    } else if (self && get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is the number of decomposition code points (usually one
       or two), low byte is the prefix code (an index into decomp_prefix) */
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    (*index)++;
}
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
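/* These constants drive the algorithmic Hangul handling below: for a
   precomposed syllable S, SIndex = S - SBase splits into a leading
   consonant L = LBase + SIndex/NCount, a vowel V = VBase +
   (SIndex%NCount)/TCount and an optional trailing consonant
   T = TBase + SIndex%TCount.  For example, U+AC01 (SIndex = 1)
   decomposes into U+1100, U+1161, U+11A8. */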
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *end, *o;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UNICODE stack[20];
    int space, stackptr, isize;
    int index, prefix, count;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_SIZE(input);
    /* Overallocate at most 10 characters. */
    space = (isize > 10 ? 10 : isize) + isize;
    result = PyUnicode_FromUnicode(NULL, space);
    if (!result)
        return NULL;
    i = PyUnicode_AS_UNICODE(input);
    end = i + isize;
    o = PyUnicode_AS_UNICODE(result);

    while (i < end) {
        stack[stackptr++] = *i++;
        while(stackptr) {
            Py_UNICODE code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                int newsize = PyString_GET_SIZE(result) + 10;
                space += 10;
                if (PyUnicode_Resize(&result, newsize) == -1)
                    return NULL;
                o = PyUnicode_AS_UNICODE(result) + newsize - space;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                *o++ = L;
                *o++ = V;
                space -= 2;
                if (T != TBase) {
                    *o++ = T;
                    space --;
                }
                continue;
            }
            /* normalization changes */
            if (self) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                *o++ = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    /* Drop overallocation. Cannot fail. */
    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);

    /* Sort canonically. */
    i = PyUnicode_AS_UNICODE(result);
    prev = _getrecord_ex(*i)->combining;
    end = i + PyUnicode_GET_SIZE(result);
    for (i++; i < end; i++) {
        cur = _getrecord_ex(*i)->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UNICODE tmp = o[1];
            o[1] = o[0];
            o[0] = tmp;
            o--;
            if (o < PyUnicode_AS_UNICODE(result))
                break;
            prev = _getrecord_ex(*o)->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(*i)->combining;
    }
    return result;
}
static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
{
    int index;
    for (index = 0; nfc[index].start; index++) {
        int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}
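/* find_nfc_index maps a code point to its compact index in the nfc_first
   table (characters that can start a canonical composition) or nfc_last
   (characters that can follow one); nfc_nfkc below combines the two
   indices into a comp_index/comp_data lookup that yields the precomposed
   character, or 0 if the pair does not compose. */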
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
    int f,l,index,index1,comb;
    Py_UNICODE code;
    Py_UNICODE *skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;

    /* We are going to modify result in-place.
       If nfd_nfkd is changed to sometimes return the input,
       this code needs to be reviewed. */
    assert(result != input);

    i = PyUnicode_AS_UNICODE(result);
    end = i + PyUnicode_GET_SIZE(result);
    o = PyUnicode_AS_UNICODE(result);

  again:
    while (i < end) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        if (LBase <= *i && *i < (LBase+LCount) &&
            i + 1 < end &&
            VBase <= i[1] && i[1] <= (VBase+VCount)) {
            int LIndex, VIndex;
            LIndex = i[0] - LBase;
            VIndex = i[1] - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            if (i < end &&
                TBase <= *i && *i <= (TBase+TCount)) {
                code += *i-TBase;
                i++;
            }
            *o++ = code;
            continue;
        }

        f = find_nfc_index(self, nfc_first, *i);
        if (f == -1) {
            *o++ = *i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        while (i1 < end) {
            int comb1 = _getrecord_ex(*i1)->combining;
            if (comb1 && comb == comb1) {
                /* Character is blocked. */
                i1++;
                continue;
            }
            l = find_nfc_index(self, nfc_last, *i1);
            /* *i1 cannot be combined with *i. If *i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            *i = code;
            /* Mark the second character unused. */
            skipped[cskipped++] = i1;
            i1++;
            f = find_nfc_index(self, nfc_first, *i);
            if (f == -1)
                break;
        }
        *o++ = *i++;
    }
    if (o != end)
        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
    return result;
}
PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
Return the normal form 'form' for the Unicode string unistr. Valid\n\
values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
static PyObject*
unicodedata_normalize(PyObject *self, PyObject *args)
{
    char *form;
    PyObject *input;

    if(!PyArg_ParseTuple(args, "sO!:normalize",
                         &form, &PyUnicode_Type, &input))
        return NULL;

    if (PyUnicode_GetSize(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    if (strcmp(form, "NFC") == 0)
        return nfc_nfkc(self, input, 0);
    if (strcmp(form, "NFKC") == 0)
        return nfc_nfkc(self, input, 1);
    if (strcmp(form, "NFD") == 0)
        return nfd_nfkd(self, input, 0);
    if (strcmp(form, "NFKD") == 0)
        return nfd_nfkd(self, input, 1);
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}
/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"
/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
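/* _gethash keeps its running value within 24 bits by folding the top
   byte back in whenever it becomes non-zero.  It has to produce the
   same values as the hash used by Tools/unicode/makeunicodedata.py when
   code_hash was generated, since _getcode below replays the same probe
   sequence over that table. */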
static char *hangul_syllables[][3] = {
    { "G",  "A",   "" },
    { "GG", "AE",  "G" },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N", },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D" },
    { "BB", "O",   "L" },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M" },
    { "P",  "YU",  "B" },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S" },
    { 0,    "I",   "SS" },
    { 0, 0,        "NG" },
    { 0, 0,        "J" },
    { 0, 0,        "C" },
    { 0, 0,        "K" },
    { 0, 0,        "T" },
    { 0, 0,        "P" },
    { 0, 0,        "H" }
};
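/* Columns of hangul_syllables: romanized jamo names for the 19 leading
   consonants, 21 vowels, and 27 trailing consonants (plus the empty
   "no trailing consonant" entry), which _getucname and _getcode use to
   build and parse "HANGUL SYLLABLE ..." names. */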
static int
is_unified_ideograph(Py_UCS4 code)
{
    return (
        (0x3400 <= code && code <= 0x4DB5) ||  /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FBB) ||  /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6)); /* CJK Ideograph Extension B */
}
static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    if (self) {
        const change_record *old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon.  the last character in the
           word has bit 7 set.  the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}
static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
    if (!_getucname(self, code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (toupper(name[i]) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}
static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        char *s = hangul_syllables[i][column];
        len1 = strlen(s);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1) {
        *len = 0;
    }
}
static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen)) {
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
/* -------------------------------------------------------------------- */
/* Python bindings */

PyDoc_STRVAR(unicodedata_name__doc__,
"name(unichr[, default])\n\
Returns the name assigned to the Unicode character unichr as a\n\
string. If no name is defined, default is returned, or, if not\n\
given, ValueError is raised.");
static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char name[NAME_MAXLEN];

    PyUnicodeObject* v;
    PyObject* defobj = NULL;
    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
        return NULL;

    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
                    name, sizeof(name))) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }

    return Py_BuildValue("s", name);
}
PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name.  If a character with the\n\
given name is found, return the corresponding Unicode\n\
character.  If not found, KeyError is raised.");
static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[1];

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(self, name, namelen, &code)) {
        char fmt[] = "undefined character name '%s'";
        char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
        sprintf(buf, fmt, name);
        PyErr_SetString(PyExc_KeyError, buf);
        PyMem_FREE(buf);
        return NULL;
    }

    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}
/* XXX Add doc strings. */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};
static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyObject_HEAD_INIT(NULL)
    0,                          /*ob_size*/
    "unicodedata.UCD",          /*tp_name*/
    sizeof(PreviousDBVersion),  /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del,   /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_compare*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    PyObject_GenericGetAttr,    /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    unicodedata_functions,      /*tp_methods*/
    DB_members,                 /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
3.2.0 which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 3.2.0 (see\n\
http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html).");
PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    UCD_Type.ob_type = &PyType_Type;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    Py_INCREF(&UCD_Type);
    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "ucd_3_2_0", v);

    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
}
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/