Modules/unicodedata.c
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 3.2 data base.

   Data was extracted from the Unicode 3.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"
/* character properties */

typedef struct {
    const unsigned char category;           /* index into
                                               _PyUnicode_CategoryNames */
    const unsigned char combining;          /* combining class value 0 - 255 */
    const unsigned char bidirectional;      /* index into
                                               _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;           /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;   /* index into
                                               _PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord;
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const int numeric_changed;
} change_record;
/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"
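/* Each code point maps to a property record through a two-level table
   built by makeunicodedata.py: the high bits of the code point select an
   entry in index1, which picks the block of index2 that holds the record
   number for the low bits.  Code points outside the database fall back
   to record 0 (the "unassigned" record). */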
static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}
static const _PyUnicode_DatabaseRecord*
_getrecord(PyUnicodeObject* v)
{
    return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
}
/* ------------- Previous-version API ------------------------------------- */

typedef struct previous_version {
    PyObject_HEAD
    const char *name;
    const change_record* (*getrecord)(Py_UCS4);
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;

#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))
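/* A PreviousDBVersion object (for example the module's ucd_3_2_0,
   created in initunicodedata below) shares the module's method table;
   the methods check for a non-NULL self and, if present, consult
   getrecord/normalization so that changes made after that database
   version are undone. */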
static PyMemberDef DB_members[] = {
    {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};
/* forward declaration */
static PyTypeObject UCD_Type;
static PyObject*
new_previous_version(const char *name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *self;
    self = PyObject_New(PreviousDBVersion, &UCD_Type);
    if (self == NULL)
        return NULL;
    self->name = name;
    self->getrecord = getrecord;
    self->normalization = normalization;
    return (PyObject*)self;
}
/* --- Module API --------------------------------------------------------- */

PyDoc_STRVAR(unicodedata_decimal__doc__,
"decimal(unichr[, default])\n\
\n\
Returns the decimal value assigned to the Unicode character unichr\n\
as integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");
static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    long rc;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}
PyDoc_STRVAR(unicodedata_digit__doc__,
"digit(unichr[, default])\n\
\n\
Returns the digit value assigned to the Unicode character unichr as\n\
integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");
static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    long rc;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a digit");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}
PyDoc_STRVAR(unicodedata_numeric__doc__,
"numeric(unichr[, default])\n\
\n\
Returns the numeric value assigned to the Unicode character unichr\n\
as float. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");
static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    double rc;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}
PyDoc_STRVAR(unicodedata_category__doc__,
"category(unichr)\n\
\n\
Returns the general category assigned to the Unicode character\n\
unichr as string.");
static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:category",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->category;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed != 0xFF)
            index = old->category_changed;
    }
    return PyString_FromString(_PyUnicode_CategoryNames[index]);
}
PyDoc_STRVAR(unicodedata_bidirectional__doc__,
"bidirectional(unichr)\n\
\n\
Returns the bidirectional category assigned to the Unicode character\n\
unichr as string. If no such value is defined, an empty string is\n\
returned.");
static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->bidirectional;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            index = old->bidir_changed;
    }
    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
}
PyDoc_STRVAR(unicodedata_combining__doc__,
"combining(unichr)\n\
\n\
Returns the canonical combining class assigned to the Unicode\n\
character unichr as integer. Returns 0 if no combining class is\n\
defined.");
static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->combining;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}
PyDoc_STRVAR(unicodedata_mirrored__doc__,
"mirrored(unichr)\n\
\n\
Returns the mirrored property assigned to the Unicode character\n\
unichr as integer. Returns 1 if the character has been identified as\n\
a \"mirrored\" character in bidirectional text, 0 otherwise.");
static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->mirrored;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}
PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
"east_asian_width(unichr)\n\
\n\
Returns the east asian width assigned to the Unicode character\n\
unichr as string.");
static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->east_asian_width;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
}
PyDoc_STRVAR(unicodedata_decomposition__doc__,
"decomposition(unichr)\n\
\n\
Returns the character decomposition mapping assigned to the Unicode\n\
character unichr as string. An empty string is returned in case no\n\
such mapping is defined.");
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count, i;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    code = (int) *PyUnicode_AS_UNICODE(v);

    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            return PyString_FromString(""); /* unassigned */
    }

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                              (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is the number of decomposition code points (usually one
       or two), low byte is the prefix code (an index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* copy prefix */
    i = strlen(decomp_prefix[decomp_data[index] & 255]);
    memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);

    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert((size_t)i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);
}
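/* For reference, the strings produced above follow UnicodeData.txt: a
   canonical decomposition such as U+00C0 (LATIN CAPITAL LETTER A WITH
   GRAVE) comes out as "0041 0300", while compatibility decompositions
   keep their tag as a prefix, e.g. "<fraction> 0031 2044 0034" for
   U+00BC (VULGAR FRACTION ONE QUARTER). */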
void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    } else if (self && get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is the number of decomposition code points (usually one
       or two), low byte is the prefix code (an index into decomp_prefix) */
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    (*index)++;
}
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
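/* These constants drive the algorithmic Hangul handling below: for a
   precomposed syllable S, SIndex = S - SBase splits into a leading
   consonant L = LBase + SIndex/NCount, a vowel V = VBase +
   (SIndex%NCount)/TCount and an optional trailing consonant
   T = TBase + SIndex%TCount.  For example, U+AC01 (SIndex = 1)
   decomposes into U+1100, U+1161, U+11A8. */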
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *end, *o;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UNICODE stack[20];
    int space, stackptr, isize;
    int index, prefix, count;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_SIZE(input);
    /* Overallocate at most 10 characters. */
    space = (isize > 10 ? 10 : isize) + isize;
    result = PyUnicode_FromUnicode(NULL, space);
    if (!result)
        return NULL;
    i = PyUnicode_AS_UNICODE(input);
    end = i + isize;
    o = PyUnicode_AS_UNICODE(result);

    while (i < end) {
        stack[stackptr++] = *i++;
        while(stackptr) {
            Py_UNICODE code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                int newsize = PyString_GET_SIZE(result) + 10;
                space += 10;
                if (PyUnicode_Resize(&result, newsize) == -1)
                    return NULL;
                o = PyUnicode_AS_UNICODE(result) + newsize - space;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                *o++ = L;
                *o++ = V;
                space -= 2;
                if (T != TBase) {
                    *o++ = T;
                    space --;
                }
                continue;
            }
            /* normalization changes */
            if (self) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                *o++ = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    /* Drop overallocation. Cannot fail. */
    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);

    /* Sort canonically. */
    i = PyUnicode_AS_UNICODE(result);
    prev = _getrecord_ex(*i)->combining;
    end = i + PyUnicode_GET_SIZE(result);
    for (i++; i < end; i++) {
        cur = _getrecord_ex(*i)->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UNICODE tmp = o[1];
            o[1] = o[0];
            o[0] = tmp;
            o--;
            if (o < PyUnicode_AS_UNICODE(result))
                break;
            prev = _getrecord_ex(*o)->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(*i)->combining;
    }
    return result;
}
static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
{
    int index;
    for (index = 0; nfc[index].start; index++) {
        int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}
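/* find_nfc_index maps a code point to its compact index in the nfc_first
   table (characters that can start a canonical composition) or nfc_last
   (characters that can follow one); nfc_nfkc below combines the two
   indices into a comp_index/comp_data lookup that yields the precomposed
   character, or 0 if the pair does not compose. */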
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
    int f,l,index,index1,comb;
    Py_UNICODE code;
    Py_UNICODE *skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;

    /* We are going to modify result in-place.
       If nfd_nfkd is changed to sometimes return the input,
       this code needs to be reviewed. */
    assert(result != input);

    i = PyUnicode_AS_UNICODE(result);
    end = i + PyUnicode_GET_SIZE(result);
    o = PyUnicode_AS_UNICODE(result);

  again:
    while (i < end) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        if (LBase <= *i && *i < (LBase+LCount) &&
            i + 1 < end &&
            VBase <= i[1] && i[1] <= (VBase+VCount)) {
            int LIndex, VIndex;
            LIndex = i[0] - LBase;
            VIndex = i[1] - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            if (i < end &&
                TBase <= *i && *i <= (TBase+TCount)) {
                code += *i-TBase;
                i++;
            }
            *o++ = code;
            continue;
        }

        f = find_nfc_index(self, nfc_first, *i);
        if (f == -1) {
            *o++ = *i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        while (i1 < end) {
            int comb1 = _getrecord_ex(*i1)->combining;
            if (comb1 && comb == comb1) {
                /* Character is blocked. */
                i1++;
                continue;
            }
            l = find_nfc_index(self, nfc_last, *i1);
            /* *i1 cannot be combined with *i. If *i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            *i = code;
            /* Mark the second character unused. */
            skipped[cskipped++] = i1;
            i1++;
            f = find_nfc_index(self, nfc_first, *i);
            if (f == -1)
                break;
        }
        *o++ = *i++;
    }
    if (o != end)
        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
    return result;
}
PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
Return the normal form 'form' for the Unicode string unistr. Valid\n\
values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
static PyObject*
unicodedata_normalize(PyObject *self, PyObject *args)
{
    char *form;
    PyObject *input;

    if(!PyArg_ParseTuple(args, "sO!:normalize",
                         &form, &PyUnicode_Type, &input))
        return NULL;

    if (PyUnicode_GetSize(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    if (strcmp(form, "NFC") == 0)
        return nfc_nfkc(self, input, 0);
    if (strcmp(form, "NFKC") == 0)
        return nfc_nfkc(self, input, 1);
    if (strcmp(form, "NFD") == 0)
        return nfd_nfkd(self, input, 0);
    if (strcmp(form, "NFKD") == 0)
        return nfd_nfkd(self, input, 1);
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}
/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"
/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
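/* _gethash keeps its running value within 24 bits by folding the top
   byte back in whenever it becomes non-zero.  It has to produce the
   same values as the hash used by Tools/unicode/makeunicodedata.py when
   code_hash was generated, since _getcode below replays the same probe
   sequence over that table. */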
static char *hangul_syllables[][3] = {
    { "G",  "A",   "" },
    { "GG", "AE",  "G" },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N", },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D" },
    { "BB", "O",   "L" },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M" },
    { "P",  "YU",  "B" },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S" },
    { 0,    "I",   "SS" },
    { 0, 0,        "NG" },
    { 0, 0,        "J" },
    { 0, 0,        "C" },
    { 0, 0,        "K" },
    { 0, 0,        "T" },
    { 0, 0,        "P" },
    { 0, 0,        "H" }
};
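/* Columns of hangul_syllables: romanized jamo names for the 19 leading
   consonants, 21 vowels, and 27 trailing consonants (plus the empty
   "no trailing consonant" entry), which _getucname and _getcode use to
   build and parse "HANGUL SYLLABLE ..." names. */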
static int
is_unified_ideograph(Py_UCS4 code)
{
    return (
        (0x3400 <= code && code <= 0x4DB5) ||  /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FBB) ||  /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6)); /* CJK Ideograph Extension B */
}
static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    if (self) {
        const change_record *old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon.  the last character in the
           word has bit 7 set.  the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}
static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
    if (!_getucname(self, code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (toupper(name[i]) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}
static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        char *s = hangul_syllables[i][column];
        len1 = strlen(s);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1) {
        *len = 0;
    }
}
static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen)) {
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
/* -------------------------------------------------------------------- */
/* Python bindings */

PyDoc_STRVAR(unicodedata_name__doc__,
"name(unichr[, default])\n\
Returns the name assigned to the Unicode character unichr as a\n\
string. If no name is defined, default is returned, or, if not\n\
given, ValueError is raised.");
static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char name[NAME_MAXLEN];

    PyUnicodeObject* v;
    PyObject* defobj = NULL;
    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
        return NULL;

    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
                    name, sizeof(name))) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }

    return Py_BuildValue("s", name);
}
PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name.  If a character with the\n\
given name is found, return the corresponding Unicode\n\
character.  If not found, KeyError is raised.");
static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[1];

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(self, name, namelen, &code)) {
        char fmt[] = "undefined character name '%s'";
        char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
        sprintf(buf, fmt, name);
        PyErr_SetString(PyExc_KeyError, buf);
        PyMem_FREE(buf);
        return NULL;
    }

    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}
/* XXX Add doc strings. */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};
static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyObject_HEAD_INIT(NULL)
    0,                          /*ob_size*/
    "unicodedata.UCD",          /*tp_name*/
    sizeof(PreviousDBVersion),  /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del,   /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_compare*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    PyObject_GenericGetAttr,    /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    unicodedata_functions,      /*tp_methods*/
    DB_members,                 /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
3.2.0 which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 3.2.0 (see\n\
http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html).");
PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    UCD_Type.ob_type = &PyType_Type;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    Py_INCREF(&UCD_Type);
    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "ucd_3_2_0", v);

    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
}
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/