Minor fix for currentframe (SF #1652788).
[python.git] / Modules / unicodedata.c
blob a30d30c8eb7291e26d95f06ab31ec1c67540727f
1 /* ------------------------------------------------------------------------
3 unicodedata -- Provides access to the Unicode 4.1 data base.
5 Data was extracted from the Unicode 4.1 UnicodeData.txt file.
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
11 Copyright (c) Corporation for National Research Initiatives.
13 ------------------------------------------------------------------------ */
15 #include "Python.h"
16 #include "ucnhash.h"
17 #include "structmember.h"
19 /* character properties */
/* One entry of the character property table. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidth */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
/* Per-character differences recorded for a previous database version.
   The sequence of fields should be the same as in merge_old_version. */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const int numeric_changed;
} change_record;
40 /* data file generated by Tools/unicode/makeunicodedata.py */
41 #include "unicodedata_db.h"
43 static const _PyUnicode_DatabaseRecord*
44 _getrecord_ex(Py_UCS4 code)
46 int index;
47 if (code >= 0x110000)
48 index = 0;
49 else {
50 index = index1[(code>>SHIFT)];
51 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
54 return &_PyUnicode_Database_Records[index];
57 static const _PyUnicode_DatabaseRecord*
58 _getrecord(PyUnicodeObject* v)
60 return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
63 /* ------------- Previous-version API ------------------------------------- */
64 typedef struct previous_version {
65 PyObject_HEAD
66 const char *name;
67 const change_record* (*getrecord)(Py_UCS4);
68 Py_UCS4 (*normalization)(Py_UCS4);
69 } PreviousDBVersion;
71 #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
73 static PyMemberDef DB_members[] = {
74 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
75 {NULL}
78 /* forward declaration */
79 static PyTypeObject UCD_Type;
81 static PyObject*
82 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
83 Py_UCS4 (*normalization)(Py_UCS4))
85 PreviousDBVersion *self;
86 self = PyObject_New(PreviousDBVersion, &UCD_Type);
87 if (self == NULL)
88 return NULL;
89 self->name = name;
90 self->getrecord = getrecord;
91 self->normalization = normalization;
92 return (PyObject*)self;
95 /* --- Module API --------------------------------------------------------- */
97 PyDoc_STRVAR(unicodedata_decimal__doc__,
98 "decimal(unichr[, default])\n\
99 \n\
100 Returns the decimal value assigned to the Unicode character unichr\n\
101 as integer. If no such value is defined, default is returned, or, if\n\
102 not given, ValueError is raised.");
104 static PyObject *
105 unicodedata_decimal(PyObject *self, PyObject *args)
107 PyUnicodeObject *v;
108 PyObject *defobj = NULL;
109 int have_old = 0;
110 long rc;
112 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
113 return NULL;
114 if (PyUnicode_GET_SIZE(v) != 1) {
115 PyErr_SetString(PyExc_TypeError,
116 "need a single Unicode character as parameter");
117 return NULL;
120 if (self) {
121 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
122 if (old->category_changed == 0) {
123 /* unassigned */
124 have_old = 1;
125 rc = -1;
127 else if (old->decimal_changed != 0xFF) {
128 have_old = 1;
129 rc = old->decimal_changed;
133 if (!have_old)
134 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
135 if (rc < 0) {
136 if (defobj == NULL) {
137 PyErr_SetString(PyExc_ValueError,
138 "not a decimal");
139 return NULL;
141 else {
142 Py_INCREF(defobj);
143 return defobj;
146 return PyInt_FromLong(rc);
149 PyDoc_STRVAR(unicodedata_digit__doc__,
150 "digit(unichr[, default])\n\
152 Returns the digit value assigned to the Unicode character unichr as\n\
153 integer. If no such value is defined, default is returned, or, if\n\
154 not given, ValueError is raised.");
156 static PyObject *
157 unicodedata_digit(PyObject *self, PyObject *args)
159 PyUnicodeObject *v;
160 PyObject *defobj = NULL;
161 long rc;
163 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
164 return NULL;
165 if (PyUnicode_GET_SIZE(v) != 1) {
166 PyErr_SetString(PyExc_TypeError,
167 "need a single Unicode character as parameter");
168 return NULL;
170 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
171 if (rc < 0) {
172 if (defobj == NULL) {
173 PyErr_SetString(PyExc_ValueError, "not a digit");
174 return NULL;
176 else {
177 Py_INCREF(defobj);
178 return defobj;
181 return PyInt_FromLong(rc);
184 PyDoc_STRVAR(unicodedata_numeric__doc__,
185 "numeric(unichr[, default])\n\
187 Returns the numeric value assigned to the Unicode character unichr\n\
188 as float. If no such value is defined, default is returned, or, if\n\
189 not given, ValueError is raised.");
191 static PyObject *
192 unicodedata_numeric(PyObject *self, PyObject *args)
194 PyUnicodeObject *v;
195 PyObject *defobj = NULL;
196 int have_old = 0;
197 double rc;
199 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
200 return NULL;
201 if (PyUnicode_GET_SIZE(v) != 1) {
202 PyErr_SetString(PyExc_TypeError,
203 "need a single Unicode character as parameter");
204 return NULL;
207 if (self) {
208 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
209 if (old->category_changed == 0) {
210 /* unassigned */
211 have_old = 1;
212 rc = -1.0;
214 else if (old->decimal_changed != 0xFF) {
215 have_old = 1;
216 rc = old->decimal_changed;
220 if (!have_old)
221 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
222 if (rc == -1.0) {
223 if (defobj == NULL) {
224 PyErr_SetString(PyExc_ValueError, "not a numeric character");
225 return NULL;
227 else {
228 Py_INCREF(defobj);
229 return defobj;
232 return PyFloat_FromDouble(rc);
235 PyDoc_STRVAR(unicodedata_category__doc__,
236 "category(unichr)\n\
238 Returns the general category assigned to the Unicode character\n\
239 unichr as string.");
241 static PyObject *
242 unicodedata_category(PyObject *self, PyObject *args)
244 PyUnicodeObject *v;
245 int index;
247 if (!PyArg_ParseTuple(args, "O!:category",
248 &PyUnicode_Type, &v))
249 return NULL;
250 if (PyUnicode_GET_SIZE(v) != 1) {
251 PyErr_SetString(PyExc_TypeError,
252 "need a single Unicode character as parameter");
253 return NULL;
255 index = (int) _getrecord(v)->category;
256 if (self) {
257 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
258 if (old->category_changed != 0xFF)
259 index = old->category_changed;
261 return PyString_FromString(_PyUnicode_CategoryNames[index]);
264 PyDoc_STRVAR(unicodedata_bidirectional__doc__,
265 "bidirectional(unichr)\n\
267 Returns the bidirectional category assigned to the Unicode character\n\
268 unichr as string. If no such value is defined, an empty string is\n\
269 returned.");
271 static PyObject *
272 unicodedata_bidirectional(PyObject *self, PyObject *args)
274 PyUnicodeObject *v;
275 int index;
277 if (!PyArg_ParseTuple(args, "O!:bidirectional",
278 &PyUnicode_Type, &v))
279 return NULL;
280 if (PyUnicode_GET_SIZE(v) != 1) {
281 PyErr_SetString(PyExc_TypeError,
282 "need a single Unicode character as parameter");
283 return NULL;
285 index = (int) _getrecord(v)->bidirectional;
286 if (self) {
287 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
288 if (old->category_changed == 0)
289 index = 0; /* unassigned */
290 else if (old->bidir_changed != 0xFF)
291 index = old->bidir_changed;
293 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
296 PyDoc_STRVAR(unicodedata_combining__doc__,
297 "combining(unichr)\n\
299 Returns the canonical combining class assigned to the Unicode\n\
300 character unichr as integer. Returns 0 if no combining class is\n\
301 defined.");
303 static PyObject *
304 unicodedata_combining(PyObject *self, PyObject *args)
306 PyUnicodeObject *v;
307 int index;
309 if (!PyArg_ParseTuple(args, "O!:combining",
310 &PyUnicode_Type, &v))
311 return NULL;
312 if (PyUnicode_GET_SIZE(v) != 1) {
313 PyErr_SetString(PyExc_TypeError,
314 "need a single Unicode character as parameter");
315 return NULL;
317 index = (int) _getrecord(v)->combining;
318 if (self) {
319 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
320 if (old->category_changed == 0)
321 index = 0; /* unassigned */
323 return PyInt_FromLong(index);
326 PyDoc_STRVAR(unicodedata_mirrored__doc__,
327 "mirrored(unichr)\n\
329 Returns the mirrored property assigned to the Unicode character\n\
330 unichr as integer. Returns 1 if the character has been identified as\n\
331 a \"mirrored\" character in bidirectional text, 0 otherwise.");
333 static PyObject *
334 unicodedata_mirrored(PyObject *self, PyObject *args)
336 PyUnicodeObject *v;
337 int index;
339 if (!PyArg_ParseTuple(args, "O!:mirrored",
340 &PyUnicode_Type, &v))
341 return NULL;
342 if (PyUnicode_GET_SIZE(v) != 1) {
343 PyErr_SetString(PyExc_TypeError,
344 "need a single Unicode character as parameter");
345 return NULL;
347 index = (int) _getrecord(v)->mirrored;
348 if (self) {
349 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
350 if (old->category_changed == 0)
351 index = 0; /* unassigned */
353 return PyInt_FromLong(index);
356 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
357 "east_asian_width(unichr)\n\
359 Returns the east asian width assigned to the Unicode character\n\
360 unichr as string.");
362 static PyObject *
363 unicodedata_east_asian_width(PyObject *self, PyObject *args)
365 PyUnicodeObject *v;
366 int index;
368 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
369 &PyUnicode_Type, &v))
370 return NULL;
371 if (PyUnicode_GET_SIZE(v) != 1) {
372 PyErr_SetString(PyExc_TypeError,
373 "need a single Unicode character as parameter");
374 return NULL;
376 index = (int) _getrecord(v)->east_asian_width;
377 if (self) {
378 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
379 if (old->category_changed == 0)
380 index = 0; /* unassigned */
382 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
385 PyDoc_STRVAR(unicodedata_decomposition__doc__,
386 "decomposition(unichr)\n\
388 Returns the character decomposition mapping assigned to the Unicode\n\
389 character unichr as string. An empty string is returned in case no\n\
390 such mapping is defined.");
392 static PyObject *
393 unicodedata_decomposition(PyObject *self, PyObject *args)
395 PyUnicodeObject *v;
396 char decomp[256];
397 int code, index, count, i;
398 unsigned int prefix_index;
400 if (!PyArg_ParseTuple(args, "O!:decomposition",
401 &PyUnicode_Type, &v))
402 return NULL;
403 if (PyUnicode_GET_SIZE(v) != 1) {
404 PyErr_SetString(PyExc_TypeError,
405 "need a single Unicode character as parameter");
406 return NULL;
409 code = (int) *PyUnicode_AS_UNICODE(v);
411 if (self) {
412 const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
413 if (old->category_changed == 0)
414 return PyString_FromString(""); /* unassigned */
417 if (code < 0 || code >= 0x110000)
418 index = 0;
419 else {
420 index = decomp_index1[(code>>DECOMP_SHIFT)];
421 index = decomp_index2[(index<<DECOMP_SHIFT)+
422 (code&((1<<DECOMP_SHIFT)-1))];
425 /* high byte is number of hex bytes (usually one or two), low byte
426 is prefix code (from*/
427 count = decomp_data[index] >> 8;
429 /* XXX: could allocate the PyString up front instead
430 (strlen(prefix) + 5 * count + 1 bytes) */
432 /* Based on how index is calculated above and decomp_data is generated
433 from Tools/unicode/makeunicodedata.py, it should not be possible
434 to overflow decomp_prefix. */
435 prefix_index = decomp_data[index] & 255;
436 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
438 /* copy prefix */
439 i = strlen(decomp_prefix[prefix_index]);
440 memcpy(decomp, decomp_prefix[prefix_index], i);
442 while (count-- > 0) {
443 if (i)
444 decomp[i++] = ' ';
445 assert((size_t)i < sizeof(decomp));
446 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
447 decomp_data[++index]);
448 i += strlen(decomp + i);
451 decomp[i] = '\0';
453 return PyString_FromString(decomp);
456 static void
457 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
459 if (code >= 0x110000) {
460 *index = 0;
461 } else if (self && get_old_record(self, code)->category_changed==0) {
462 /* unassigned in old version */
463 *index = 0;
465 else {
466 *index = decomp_index1[(code>>DECOMP_SHIFT)];
467 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
468 (code&((1<<DECOMP_SHIFT)-1))];
471 /* high byte is number of hex bytes (usually one or two), low byte
472 is prefix code (from*/
473 *count = decomp_data[*index] >> 8;
474 *prefix = decomp_data[*index] & 255;
476 (*index)++;
/* Constants of the arithmetic Hangul syllable (de)composition used by the
   normalization and name code below: base code points and counts for the
   syllable (S), leading (L), vowel (V) and trailing (T) ranges. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
489 static PyObject*
490 nfd_nfkd(PyObject *self, PyObject *input, int k)
492 PyObject *result;
493 Py_UNICODE *i, *end, *o;
494 /* Longest decomposition in Unicode 3.2: U+FDFA */
495 Py_UNICODE stack[20];
496 Py_ssize_t space, isize;
497 int index, prefix, count, stackptr;
498 unsigned char prev, cur;
500 stackptr = 0;
501 isize = PyUnicode_GET_SIZE(input);
502 /* Overallocate atmost 10 characters. */
503 space = (isize > 10 ? 10 : isize) + isize;
504 result = PyUnicode_FromUnicode(NULL, space);
505 if (!result)
506 return NULL;
507 i = PyUnicode_AS_UNICODE(input);
508 end = i + isize;
509 o = PyUnicode_AS_UNICODE(result);
511 while (i < end) {
512 stack[stackptr++] = *i++;
513 while(stackptr) {
514 Py_UNICODE code = stack[--stackptr];
515 /* Hangul Decomposition adds three characters in
516 a single step, so we need atleast that much room. */
517 if (space < 3) {
518 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
519 space += 10;
520 if (PyUnicode_Resize(&result, newsize) == -1)
521 return NULL;
522 o = PyUnicode_AS_UNICODE(result) + newsize - space;
524 /* Hangul Decomposition. */
525 if (SBase <= code && code < (SBase+SCount)) {
526 int SIndex = code - SBase;
527 int L = LBase + SIndex / NCount;
528 int V = VBase + (SIndex % NCount) / TCount;
529 int T = TBase + SIndex % TCount;
530 *o++ = L;
531 *o++ = V;
532 space -= 2;
533 if (T != TBase) {
534 *o++ = T;
535 space --;
537 continue;
539 /* normalization changes */
540 if (self) {
541 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
542 if (value != 0) {
543 stack[stackptr++] = value;
544 continue;
548 /* Other decompositions. */
549 get_decomp_record(self, code, &index, &prefix, &count);
551 /* Copy character if it is not decomposable, or has a
552 compatibility decomposition, but we do NFD. */
553 if (!count || (prefix && !k)) {
554 *o++ = code;
555 space--;
556 continue;
558 /* Copy decomposition onto the stack, in reverse
559 order. */
560 while(count) {
561 code = decomp_data[index + (--count)];
562 stack[stackptr++] = code;
567 /* Drop overallocation. Cannot fail. */
568 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
570 /* Sort canonically. */
571 i = PyUnicode_AS_UNICODE(result);
572 prev = _getrecord_ex(*i)->combining;
573 end = i + PyUnicode_GET_SIZE(result);
574 for (i++; i < end; i++) {
575 cur = _getrecord_ex(*i)->combining;
576 if (prev == 0 || cur == 0 || prev <= cur) {
577 prev = cur;
578 continue;
580 /* Non-canonical order. Need to switch *i with previous. */
581 o = i - 1;
582 while (1) {
583 Py_UNICODE tmp = o[1];
584 o[1] = o[0];
585 o[0] = tmp;
586 o--;
587 if (o < PyUnicode_AS_UNICODE(result))
588 break;
589 prev = _getrecord_ex(*o)->combining;
590 if (prev == 0 || prev <= cur)
591 break;
593 prev = _getrecord_ex(*i)->combining;
595 return result;
598 static int
599 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
601 int index;
602 for (index = 0; nfc[index].start; index++) {
603 int start = nfc[index].start;
604 if (code < start)
605 return -1;
606 if (code <= start + nfc[index].count) {
607 int delta = code - start;
608 return nfc[index].index + delta;
611 return -1;
614 static PyObject*
615 nfc_nfkc(PyObject *self, PyObject *input, int k)
617 PyObject *result;
618 Py_UNICODE *i, *i1, *o, *end;
619 int f,l,index,index1,comb;
620 Py_UNICODE code;
621 Py_UNICODE *skipped[20];
622 int cskipped = 0;
624 result = nfd_nfkd(self, input, k);
625 if (!result)
626 return NULL;
628 /* We are going to modify result in-place.
629 If nfd_nfkd is changed to sometimes return the input,
630 this code needs to be reviewed. */
631 assert(result != input);
633 i = PyUnicode_AS_UNICODE(result);
634 end = i + PyUnicode_GET_SIZE(result);
635 o = PyUnicode_AS_UNICODE(result);
637 again:
638 while (i < end) {
639 for (index = 0; index < cskipped; index++) {
640 if (skipped[index] == i) {
641 /* *i character is skipped.
642 Remove from list. */
643 skipped[index] = skipped[cskipped-1];
644 cskipped--;
645 i++;
646 goto again; /* continue while */
649 /* Hangul Composition. We don't need to check for <LV,T>
650 pairs, since we always have decomposed data. */
651 if (LBase <= *i && *i < (LBase+LCount) &&
652 i + 1 < end &&
653 VBase <= i[1] && i[1] <= (VBase+VCount)) {
654 int LIndex, VIndex;
655 LIndex = i[0] - LBase;
656 VIndex = i[1] - VBase;
657 code = SBase + (LIndex*VCount+VIndex)*TCount;
658 i+=2;
659 if (i < end &&
660 TBase <= *i && *i <= (TBase+TCount)) {
661 code += *i-TBase;
662 i++;
664 *o++ = code;
665 continue;
668 f = find_nfc_index(self, nfc_first, *i);
669 if (f == -1) {
670 *o++ = *i++;
671 continue;
673 /* Find next unblocked character. */
674 i1 = i+1;
675 comb = 0;
676 while (i1 < end) {
677 int comb1 = _getrecord_ex(*i1)->combining;
678 if (comb1 && comb == comb1) {
679 /* Character is blocked. */
680 i1++;
681 continue;
683 l = find_nfc_index(self, nfc_last, *i1);
684 /* *i1 cannot be combined with *i. If *i1
685 is a starter, we don't need to look further.
686 Otherwise, record the combining class. */
687 if (l == -1) {
688 not_combinable:
689 if (comb1 == 0)
690 break;
691 comb = comb1;
692 i1++;
693 continue;
695 index = f*TOTAL_LAST + l;
696 index1 = comp_index[index >> COMP_SHIFT];
697 code = comp_data[(index1<<COMP_SHIFT)+
698 (index&((1<<COMP_SHIFT)-1))];
699 if (code == 0)
700 goto not_combinable;
702 /* Replace the original character. */
703 *i = code;
704 /* Mark the second character unused. */
705 skipped[cskipped++] = i1;
706 i1++;
707 f = find_nfc_index(self, nfc_first, *i);
708 if (f == -1)
709 break;
711 *o++ = *i++;
713 if (o != end)
714 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
715 return result;
718 PyDoc_STRVAR(unicodedata_normalize__doc__,
719 "normalize(form, unistr)\n\
721 Return the normal form 'form' for the Unicode string unistr. Valid\n\
722 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
724 static PyObject*
725 unicodedata_normalize(PyObject *self, PyObject *args)
727 char *form;
728 PyObject *input;
730 if(!PyArg_ParseTuple(args, "sO!:normalize",
731 &form, &PyUnicode_Type, &input))
732 return NULL;
734 if (PyUnicode_GetSize(input) == 0) {
735 /* Special case empty input strings, since resizing
736 them later would cause internal errors. */
737 Py_INCREF(input);
738 return input;
741 if (strcmp(form, "NFC") == 0)
742 return nfc_nfkc(self, input, 0);
743 if (strcmp(form, "NFKC") == 0)
744 return nfc_nfkc(self, input, 1);
745 if (strcmp(form, "NFD") == 0)
746 return nfd_nfkd(self, input, 0);
747 if (strcmp(form, "NFKD") == 0)
748 return nfd_nfkd(self, input, 1);
749 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
750 return NULL;
753 /* -------------------------------------------------------------------- */
754 /* unicode character name tables */
756 /* data file generated by Tools/unicode/makeunicodedata.py */
757 #include "unicodename_db.h"
759 /* -------------------------------------------------------------------- */
760 /* database code (cut and pasted from the unidb package) */
/* Case-insensitive hash of the first len bytes of s.  Each step multiplies
   by scale and adds the upper-cased byte; whenever bits 24-31 become set
   they are folded into the low byte and the hash is cut back to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long high;

        hash = (hash * scale) + (unsigned char) toupper(Py_CHARMASK(s[pos]));
        high = hash & 0xff000000;
        if (high != 0)
            hash = (hash ^ ((high >> 24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
/* Name fragments for Hangul syllables, one row per index.  Column 0 is
   indexed by L, column 1 by V and column 2 by T (see _getucname); columns
   are shorter than the table, so the unused tail cells are null. */
static char *hangul_syllables[][3] = {
    { "G",   "A",   ""   },
    { "GG",  "AE",  "G"  },
    { "N",   "YA",  "GG" },
    { "D",   "YAE", "GS" },
    { "DD",  "EO",  "N"  },
    { "R",   "E",   "NJ" },
    { "M",   "YEO", "NH" },
    { "B",   "YE",  "D"  },
    { "BB",  "O",   "L"  },
    { "S",   "WA",  "LG" },
    { "SS",  "WAE", "LM" },
    { "",    "OE",  "LB" },
    { "J",   "YO",  "LS" },
    { "JJ",  "U",   "LT" },
    { "C",   "WEO", "LP" },
    { "K",   "WE",  "LH" },
    { "T",   "WI",  "M"  },
    { "P",   "YU",  "B"  },
    { "H",   "EU",  "BS" },
    { NULL,  "YI",  "S"  },
    { NULL,  "I",   "SS" },
    { NULL,  NULL,  "NG" },
    { NULL,  NULL,  "J"  },
    { NULL,  NULL,  "C"  },
    { NULL,  NULL,  "K"  },
    { NULL,  NULL,  "T"  },
    { NULL,  NULL,  "P"  },
    { NULL,  NULL,  "H"  }
};
808 static int
809 is_unified_ideograph(Py_UCS4 code)
811 return (
812 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
813 (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
814 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
817 static int
818 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
820 int offset;
821 int i;
822 int word;
823 unsigned char* w;
825 if (code >= 0x110000)
826 return 0;
828 if (self) {
829 const change_record *old = get_old_record(self, code);
830 if (old->category_changed == 0) {
831 /* unassigned */
832 return 0;
836 if (SBase <= code && code < SBase+SCount) {
837 /* Hangul syllable. */
838 int SIndex = code - SBase;
839 int L = SIndex / NCount;
840 int V = (SIndex % NCount) / TCount;
841 int T = SIndex % TCount;
843 if (buflen < 27)
844 /* Worst case: HANGUL SYLLABLE <10chars>. */
845 return 0;
846 strcpy(buffer, "HANGUL SYLLABLE ");
847 buffer += 16;
848 strcpy(buffer, hangul_syllables[L][0]);
849 buffer += strlen(hangul_syllables[L][0]);
850 strcpy(buffer, hangul_syllables[V][1]);
851 buffer += strlen(hangul_syllables[V][1]);
852 strcpy(buffer, hangul_syllables[T][2]);
853 buffer += strlen(hangul_syllables[T][2]);
854 *buffer = '\0';
855 return 1;
858 if (is_unified_ideograph(code)) {
859 if (buflen < 28)
860 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
861 return 0;
862 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
863 return 1;
866 /* get offset into phrasebook */
867 offset = phrasebook_offset1[(code>>phrasebook_shift)];
868 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
869 (code&((1<<phrasebook_shift)-1))];
870 if (!offset)
871 return 0;
873 i = 0;
875 for (;;) {
876 /* get word index */
877 word = phrasebook[offset] - phrasebook_short;
878 if (word >= 0) {
879 word = (word << 8) + phrasebook[offset+1];
880 offset += 2;
881 } else
882 word = phrasebook[offset++];
883 if (i) {
884 if (i > buflen)
885 return 0; /* buffer overflow */
886 buffer[i++] = ' ';
888 /* copy word string from lexicon. the last character in the
889 word has bit 7 set. the last word in a string ends with
890 0x80 */
891 w = lexicon + lexicon_offset[word];
892 while (*w < 128) {
893 if (i >= buflen)
894 return 0; /* buffer overflow */
895 buffer[i++] = *w++;
897 if (i >= buflen)
898 return 0; /* buffer overflow */
899 buffer[i++] = *w & 127;
900 if (*w == 128)
901 break; /* end of word */
904 return 1;
907 static int
908 _cmpname(PyObject *self, int code, const char* name, int namelen)
910 /* check if code corresponds to the given name */
911 int i;
912 char buffer[NAME_MAXLEN];
913 if (!_getucname(self, code, buffer, sizeof(buffer)))
914 return 0;
915 for (i = 0; i < namelen; i++) {
916 if (toupper(Py_CHARMASK(name[i])) != buffer[i])
917 return 0;
919 return buffer[namelen] == '\0';
922 static void
923 find_syllable(const char *str, int *len, int *pos, int count, int column)
925 int i, len1;
926 *len = -1;
927 for (i = 0; i < count; i++) {
928 char *s = hangul_syllables[i][column];
929 len1 = strlen(s);
930 if (len1 <= *len)
931 continue;
932 if (strncmp(str, s, len1) == 0) {
933 *len = len1;
934 *pos = i;
937 if (*len == -1) {
938 *len = 0;
942 static int
943 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
945 unsigned int h, v;
946 unsigned int mask = code_size-1;
947 unsigned int i, incr;
949 /* Check for hangul syllables. */
950 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
951 int len, L = -1, V = -1, T = -1;
952 const char *pos = name + 16;
953 find_syllable(pos, &len, &L, LCount, 0);
954 pos += len;
955 find_syllable(pos, &len, &V, VCount, 1);
956 pos += len;
957 find_syllable(pos, &len, &T, TCount, 2);
958 pos += len;
959 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
960 *code = SBase + (L*VCount+V)*TCount + T;
961 return 1;
963 /* Otherwise, it's an illegal syllable name. */
964 return 0;
967 /* Check for unified ideographs. */
968 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
969 /* Four or five hexdigits must follow. */
970 v = 0;
971 name += 22;
972 namelen -= 22;
973 if (namelen != 4 && namelen != 5)
974 return 0;
975 while (namelen--) {
976 v *= 16;
977 if (*name >= '0' && *name <= '9')
978 v += *name - '0';
979 else if (*name >= 'A' && *name <= 'F')
980 v += *name - 'A' + 10;
981 else
982 return 0;
983 name++;
985 if (!is_unified_ideograph(v))
986 return 0;
987 *code = v;
988 return 1;
991 /* the following is the same as python's dictionary lookup, with
992 only minor changes. see the makeunicodedata script for more
993 details */
995 h = (unsigned int) _gethash(name, namelen, code_magic);
996 i = (~h) & mask;
997 v = code_hash[i];
998 if (!v)
999 return 0;
1000 if (_cmpname(self, v, name, namelen)) {
1001 *code = v;
1002 return 1;
1004 incr = (h ^ (h >> 3)) & mask;
1005 if (!incr)
1006 incr = mask;
1007 for (;;) {
1008 i = (i + incr) & mask;
1009 v = code_hash[i];
1010 if (!v)
1011 return 0;
1012 if (_cmpname(self, v, name, namelen)) {
1013 *code = v;
1014 return 1;
1016 incr = incr << 1;
1017 if (incr > mask)
1018 incr = incr ^ code_poly;
1022 static const _PyUnicode_Name_CAPI hashAPI =
1024 sizeof(_PyUnicode_Name_CAPI),
1025 _getucname,
1026 _getcode
1029 /* -------------------------------------------------------------------- */
1030 /* Python bindings */
1032 PyDoc_STRVAR(unicodedata_name__doc__,
1033 "name(unichr[, default])\n\
1034 Returns the name assigned to the Unicode character unichr as a\n\
1035 string. If no name is defined, default is returned, or, if not\n\
1036 given, ValueError is raised.");
1038 static PyObject *
1039 unicodedata_name(PyObject* self, PyObject* args)
1041 char name[NAME_MAXLEN];
1043 PyUnicodeObject* v;
1044 PyObject* defobj = NULL;
1045 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1046 return NULL;
1048 if (PyUnicode_GET_SIZE(v) != 1) {
1049 PyErr_SetString(PyExc_TypeError,
1050 "need a single Unicode character as parameter");
1051 return NULL;
1054 if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
1055 name, sizeof(name))) {
1056 if (defobj == NULL) {
1057 PyErr_SetString(PyExc_ValueError, "no such name");
1058 return NULL;
1060 else {
1061 Py_INCREF(defobj);
1062 return defobj;
1066 return Py_BuildValue("s", name);
1069 PyDoc_STRVAR(unicodedata_lookup__doc__,
1070 "lookup(name)\n\
1072 Look up character by name. If a character with the\n\
1073 given name is found, return the corresponding Unicode\n\
1074 character. If not found, KeyError is raised.");
1076 static PyObject *
1077 unicodedata_lookup(PyObject* self, PyObject* args)
1079 Py_UCS4 code;
1080 Py_UNICODE str[1];
1081 char errbuf[256];
1083 char* name;
1084 int namelen;
1085 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1086 return NULL;
1088 if (!_getcode(self, name, namelen, &code)) {
1089 /* XXX(nnorwitz): why are we allocating for the error msg?
1090 Why not always use snprintf? */
1091 char fmt[] = "undefined character name '%s'";
1092 char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
1093 if (buf)
1094 sprintf(buf, fmt, name);
1095 else {
1096 buf = errbuf;
1097 PyOS_snprintf(buf, sizeof(errbuf), fmt, name);
1099 PyErr_SetString(PyExc_KeyError, buf);
1100 if (buf != errbuf)
1101 PyMem_FREE(buf);
1102 return NULL;
1105 str[0] = (Py_UNICODE) code;
1106 return PyUnicode_FromUnicode(str, 1);
1109 /* XXX Add doc strings. */
1111 static PyMethodDef unicodedata_functions[] = {
1112 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1113 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1114 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1115 {"category", unicodedata_category, METH_VARARGS,
1116 unicodedata_category__doc__},
1117 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1118 unicodedata_bidirectional__doc__},
1119 {"combining", unicodedata_combining, METH_VARARGS,
1120 unicodedata_combining__doc__},
1121 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1122 unicodedata_mirrored__doc__},
1123 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1124 unicodedata_east_asian_width__doc__},
1125 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1126 unicodedata_decomposition__doc__},
1127 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1128 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1129 {"normalize", unicodedata_normalize, METH_VARARGS,
1130 unicodedata_normalize__doc__},
1131 {NULL, NULL} /* sentinel */
1134 static PyTypeObject UCD_Type = {
1135 /* The ob_type field must be initialized in the module init function
1136 * to be portable to Windows without using C++. */
1137 PyObject_HEAD_INIT(NULL)
1138 0, /*ob_size*/
1139 "unicodedata.UCD", /*tp_name*/
1140 sizeof(PreviousDBVersion), /*tp_basicsize*/
1141 0, /*tp_itemsize*/
1142 /* methods */
1143 (destructor)PyObject_Del, /*tp_dealloc*/
1144 0, /*tp_print*/
1145 0, /*tp_getattr*/
1146 0, /*tp_setattr*/
1147 0, /*tp_compare*/
1148 0, /*tp_repr*/
1149 0, /*tp_as_number*/
1150 0, /*tp_as_sequence*/
1151 0, /*tp_as_mapping*/
1152 0, /*tp_hash*/
1153 0, /*tp_call*/
1154 0, /*tp_str*/
1155 PyObject_GenericGetAttr,/*tp_getattro*/
1156 0, /*tp_setattro*/
1157 0, /*tp_as_buffer*/
1158 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1159 0, /*tp_doc*/
1160 0, /*tp_traverse*/
1161 0, /*tp_clear*/
1162 0, /*tp_richcompare*/
1163 0, /*tp_weaklistoffset*/
1164 0, /*tp_iter*/
1165 0, /*tp_iternext*/
1166 unicodedata_functions, /*tp_methods*/
1167 DB_members, /*tp_members*/
1168 0, /*tp_getset*/
1169 0, /*tp_base*/
1170 0, /*tp_dict*/
1171 0, /*tp_descr_get*/
1172 0, /*tp_descr_set*/
1173 0, /*tp_dictoffset*/
1174 0, /*tp_init*/
1175 0, /*tp_alloc*/
1176 0, /*tp_new*/
1177 0, /*tp_free*/
1178 0, /*tp_is_gc*/
1181 PyDoc_STRVAR(unicodedata_docstring,
1182 "This module provides access to the Unicode Character Database which\n\
1183 defines character properties for all Unicode characters. The data in\n\
1184 this database is based on the UnicodeData.txt file version\n\
1185 4.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
1187 The module uses the same names and symbols as defined by the\n\
1188 UnicodeData File Format 4.1.0 (see\n\
1189 http://www.unicode.org/Public/4.1.0/ucd/UCD.html).");
1191 PyMODINIT_FUNC
1192 initunicodedata(void)
1194 PyObject *m, *v;
1196 UCD_Type.ob_type = &PyType_Type;
1198 m = Py_InitModule3(
1199 "unicodedata", unicodedata_functions, unicodedata_docstring);
1200 if (!m)
1201 return;
1203 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1204 Py_INCREF(&UCD_Type);
1205 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1207 /* Previous versions */
1208 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1209 if (v != NULL)
1210 PyModule_AddObject(m, "ucd_3_2_0", v);
1212 /* Export C API */
1213 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
1214 if (v != NULL)
1215 PyModule_AddObject(m, "ucnhash_CAPI", v);
1219 Local variables:
1220 c-basic-offset: 4
1221 indent-tabs-mode: nil
1222 End: