Added more cross-reference targets and tidied up list of useful handlers.
[python.git] / Modules / unicodedata.c
blobad77651a26c4fce17f918b61d4eaac990071017a
1 /* ------------------------------------------------------------------------
3 unicodedata -- Provides access to the Unicode 5.1 data base.
5 Data was extracted from the Unicode 5.1 UnicodeData.txt file.
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
11 Copyright (c) Corporation for National Research Initiatives.
13 ------------------------------------------------------------------------ */
15 #include "Python.h"
16 #include "ucnhash.h"
17 #include "structmember.h"
19 /* character properties */
/* Per-character property record. All fields are single bytes; three of
   them are indices into name tables rather than the values themselves. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
/* Differences of one code point relative to an older database version.
   The functions in this file treat 0xFF in a byte field as "not changed"
   and category_changed == 0 as "unassigned in the old version".
   NOTE: the sequence of fields should be the same as in
   merge_old_version (see Tools/unicode/makeunicodedata.py). */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const int numeric_changed;
} change_record;
41 /* data file generated by Tools/unicode/makeunicodedata.py */
42 #include "unicodedata_db.h"
44 static const _PyUnicode_DatabaseRecord*
45 _getrecord_ex(Py_UCS4 code)
47 int index;
48 if (code >= 0x110000)
49 index = 0;
50 else {
51 index = index1[(code>>SHIFT)];
52 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
55 return &_PyUnicode_Database_Records[index];
58 /* ------------- Previous-version API ------------------------------------- */
59 typedef struct previous_version {
60 PyObject_HEAD
61 const char *name;
62 const change_record* (*getrecord)(Py_UCS4);
63 Py_UCS4 (*normalization)(Py_UCS4);
64 } PreviousDBVersion;
66 #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
68 static PyMemberDef DB_members[] = {
69 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
70 {NULL}
73 /* forward declaration */
74 static PyTypeObject UCD_Type;
76 static PyObject*
77 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
78 Py_UCS4 (*normalization)(Py_UCS4))
80 PreviousDBVersion *self;
81 self = PyObject_New(PreviousDBVersion, &UCD_Type);
82 if (self == NULL)
83 return NULL;
84 self->name = name;
85 self->getrecord = getrecord;
86 self->normalization = normalization;
87 return (PyObject*)self;
91 static Py_UCS4 getuchar(PyUnicodeObject *obj)
93 Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
95 if (PyUnicode_GET_SIZE(obj) == 1)
96 return *v;
97 #ifndef Py_UNICODE_WIDE
98 else if ((PyUnicode_GET_SIZE(obj) == 2) &&
99 (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
100 (0xDC00 <= v[1] && v[1] <= 0xDFFF))
101 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
102 #endif
103 PyErr_SetString(PyExc_TypeError,
104 "need a single Unicode character as parameter");
105 return (Py_UCS4)-1;
108 /* --- Module API --------------------------------------------------------- */
110 PyDoc_STRVAR(unicodedata_decimal__doc__,
111 "decimal(unichr[, default])\n\
113 Returns the decimal value assigned to the Unicode character unichr\n\
114 as integer. If no such value is defined, default is returned, or, if\n\
115 not given, ValueError is raised.");
117 static PyObject *
118 unicodedata_decimal(PyObject *self, PyObject *args)
120 PyUnicodeObject *v;
121 PyObject *defobj = NULL;
122 int have_old = 0;
123 long rc;
124 Py_UCS4 c;
126 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
127 return NULL;
128 c = getuchar(v);
129 if (c == (Py_UCS4)-1)
130 return NULL;
132 if (self) {
133 const change_record *old = get_old_record(self, c);
134 if (old->category_changed == 0) {
135 /* unassigned */
136 have_old = 1;
137 rc = -1;
139 else if (old->decimal_changed != 0xFF) {
140 have_old = 1;
141 rc = old->decimal_changed;
145 if (!have_old)
146 rc = Py_UNICODE_TODECIMAL(c);
147 if (rc < 0) {
148 if (defobj == NULL) {
149 PyErr_SetString(PyExc_ValueError,
150 "not a decimal");
151 return NULL;
153 else {
154 Py_INCREF(defobj);
155 return defobj;
158 return PyInt_FromLong(rc);
161 PyDoc_STRVAR(unicodedata_digit__doc__,
162 "digit(unichr[, default])\n\
164 Returns the digit value assigned to the Unicode character unichr as\n\
165 integer. If no such value is defined, default is returned, or, if\n\
166 not given, ValueError is raised.");
168 static PyObject *
169 unicodedata_digit(PyObject *self, PyObject *args)
171 PyUnicodeObject *v;
172 PyObject *defobj = NULL;
173 long rc;
174 Py_UCS4 c;
176 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
177 return NULL;
178 c = getuchar(v);
179 if (c == (Py_UCS4)-1)
180 return NULL;
181 rc = Py_UNICODE_TODIGIT(c);
182 if (rc < 0) {
183 if (defobj == NULL) {
184 PyErr_SetString(PyExc_ValueError, "not a digit");
185 return NULL;
187 else {
188 Py_INCREF(defobj);
189 return defobj;
192 return PyInt_FromLong(rc);
195 PyDoc_STRVAR(unicodedata_numeric__doc__,
196 "numeric(unichr[, default])\n\
198 Returns the numeric value assigned to the Unicode character unichr\n\
199 as float. If no such value is defined, default is returned, or, if\n\
200 not given, ValueError is raised.");
202 static PyObject *
203 unicodedata_numeric(PyObject *self, PyObject *args)
205 PyUnicodeObject *v;
206 PyObject *defobj = NULL;
207 int have_old = 0;
208 double rc;
209 Py_UCS4 c;
211 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
212 return NULL;
213 c = getuchar(v);
214 if (c == (Py_UCS4)-1)
215 return NULL;
217 if (self) {
218 const change_record *old = get_old_record(self, c);
219 if (old->category_changed == 0) {
220 /* unassigned */
221 have_old = 1;
222 rc = -1.0;
224 else if (old->decimal_changed != 0xFF) {
225 have_old = 1;
226 rc = old->decimal_changed;
230 if (!have_old)
231 rc = Py_UNICODE_TONUMERIC(c);
232 if (rc == -1.0) {
233 if (defobj == NULL) {
234 PyErr_SetString(PyExc_ValueError, "not a numeric character");
235 return NULL;
237 else {
238 Py_INCREF(defobj);
239 return defobj;
242 return PyFloat_FromDouble(rc);
245 PyDoc_STRVAR(unicodedata_category__doc__,
246 "category(unichr)\n\
248 Returns the general category assigned to the Unicode character\n\
249 unichr as string.");
251 static PyObject *
252 unicodedata_category(PyObject *self, PyObject *args)
254 PyUnicodeObject *v;
255 int index;
256 Py_UCS4 c;
258 if (!PyArg_ParseTuple(args, "O!:category",
259 &PyUnicode_Type, &v))
260 return NULL;
261 c = getuchar(v);
262 if (c == (Py_UCS4)-1)
263 return NULL;
264 index = (int) _getrecord_ex(c)->category;
265 if (self) {
266 const change_record *old = get_old_record(self, c);
267 if (old->category_changed != 0xFF)
268 index = old->category_changed;
270 return PyString_FromString(_PyUnicode_CategoryNames[index]);
273 PyDoc_STRVAR(unicodedata_bidirectional__doc__,
274 "bidirectional(unichr)\n\
276 Returns the bidirectional category assigned to the Unicode character\n\
277 unichr as string. If no such value is defined, an empty string is\n\
278 returned.");
280 static PyObject *
281 unicodedata_bidirectional(PyObject *self, PyObject *args)
283 PyUnicodeObject *v;
284 int index;
285 Py_UCS4 c;
287 if (!PyArg_ParseTuple(args, "O!:bidirectional",
288 &PyUnicode_Type, &v))
289 return NULL;
290 c = getuchar(v);
291 if (c == (Py_UCS4)-1)
292 return NULL;
293 index = (int) _getrecord_ex(c)->bidirectional;
294 if (self) {
295 const change_record *old = get_old_record(self, c);
296 if (old->category_changed == 0)
297 index = 0; /* unassigned */
298 else if (old->bidir_changed != 0xFF)
299 index = old->bidir_changed;
301 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
304 PyDoc_STRVAR(unicodedata_combining__doc__,
305 "combining(unichr)\n\
307 Returns the canonical combining class assigned to the Unicode\n\
308 character unichr as integer. Returns 0 if no combining class is\n\
309 defined.");
311 static PyObject *
312 unicodedata_combining(PyObject *self, PyObject *args)
314 PyUnicodeObject *v;
315 int index;
316 Py_UCS4 c;
318 if (!PyArg_ParseTuple(args, "O!:combining",
319 &PyUnicode_Type, &v))
320 return NULL;
321 c = getuchar(v);
322 if (c == (Py_UCS4)-1)
323 return NULL;
324 index = (int) _getrecord_ex(c)->combining;
325 if (self) {
326 const change_record *old = get_old_record(self, c);
327 if (old->category_changed == 0)
328 index = 0; /* unassigned */
330 return PyInt_FromLong(index);
333 PyDoc_STRVAR(unicodedata_mirrored__doc__,
334 "mirrored(unichr)\n\
336 Returns the mirrored property assigned to the Unicode character\n\
337 unichr as integer. Returns 1 if the character has been identified as\n\
338 a \"mirrored\" character in bidirectional text, 0 otherwise.");
340 static PyObject *
341 unicodedata_mirrored(PyObject *self, PyObject *args)
343 PyUnicodeObject *v;
344 int index;
345 Py_UCS4 c;
347 if (!PyArg_ParseTuple(args, "O!:mirrored",
348 &PyUnicode_Type, &v))
349 return NULL;
350 c = getuchar(v);
351 if (c == (Py_UCS4)-1)
352 return NULL;
353 index = (int) _getrecord_ex(c)->mirrored;
354 if (self) {
355 const change_record *old = get_old_record(self, c);
356 if (old->category_changed == 0)
357 index = 0; /* unassigned */
358 else if (old->mirrored_changed != 0xFF)
359 index = old->mirrored_changed;
361 return PyInt_FromLong(index);
364 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
365 "east_asian_width(unichr)\n\
367 Returns the east asian width assigned to the Unicode character\n\
368 unichr as string.");
370 static PyObject *
371 unicodedata_east_asian_width(PyObject *self, PyObject *args)
373 PyUnicodeObject *v;
374 int index;
375 Py_UCS4 c;
377 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
378 &PyUnicode_Type, &v))
379 return NULL;
380 c = getuchar(v);
381 if (c == (Py_UCS4)-1)
382 return NULL;
383 index = (int) _getrecord_ex(c)->east_asian_width;
384 if (self) {
385 const change_record *old = get_old_record(self, c);
386 if (old->category_changed == 0)
387 index = 0; /* unassigned */
389 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
392 PyDoc_STRVAR(unicodedata_decomposition__doc__,
393 "decomposition(unichr)\n\
395 Returns the character decomposition mapping assigned to the Unicode\n\
396 character unichr as string. An empty string is returned in case no\n\
397 such mapping is defined.");
399 static PyObject *
400 unicodedata_decomposition(PyObject *self, PyObject *args)
402 PyUnicodeObject *v;
403 char decomp[256];
404 int code, index, count, i;
405 unsigned int prefix_index;
406 Py_UCS4 c;
408 if (!PyArg_ParseTuple(args, "O!:decomposition",
409 &PyUnicode_Type, &v))
410 return NULL;
411 c = getuchar(v);
412 if (c == (Py_UCS4)-1)
413 return NULL;
415 code = (int)c;
417 if (self) {
418 const change_record *old = get_old_record(self, c);
419 if (old->category_changed == 0)
420 return PyString_FromString(""); /* unassigned */
423 if (code < 0 || code >= 0x110000)
424 index = 0;
425 else {
426 index = decomp_index1[(code>>DECOMP_SHIFT)];
427 index = decomp_index2[(index<<DECOMP_SHIFT)+
428 (code&((1<<DECOMP_SHIFT)-1))];
431 /* high byte is number of hex bytes (usually one or two), low byte
432 is prefix code (from*/
433 count = decomp_data[index] >> 8;
435 /* XXX: could allocate the PyString up front instead
436 (strlen(prefix) + 5 * count + 1 bytes) */
438 /* Based on how index is calculated above and decomp_data is generated
439 from Tools/unicode/makeunicodedata.py, it should not be possible
440 to overflow decomp_prefix. */
441 prefix_index = decomp_data[index] & 255;
442 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
444 /* copy prefix */
445 i = strlen(decomp_prefix[prefix_index]);
446 memcpy(decomp, decomp_prefix[prefix_index], i);
448 while (count-- > 0) {
449 if (i)
450 decomp[i++] = ' ';
451 assert((size_t)i < sizeof(decomp));
452 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
453 decomp_data[++index]);
454 i += strlen(decomp + i);
457 decomp[i] = '\0';
459 return PyString_FromString(decomp);
462 static void
463 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
465 if (code >= 0x110000) {
466 *index = 0;
467 } else if (self && get_old_record(self, code)->category_changed==0) {
468 /* unassigned in old version */
469 *index = 0;
471 else {
472 *index = decomp_index1[(code>>DECOMP_SHIFT)];
473 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
474 (code&((1<<DECOMP_SHIFT)-1))];
477 /* high byte is number of hex bytes (usually one or two), low byte
478 is prefix code (from*/
479 *count = decomp_data[*index] >> 8;
480 *prefix = decomp_data[*index] & 255;
482 (*index)++;
485 #define SBase 0xAC00
486 #define LBase 0x1100
487 #define VBase 0x1161
488 #define TBase 0x11A7
489 #define LCount 19
490 #define VCount 21
491 #define TCount 28
492 #define NCount (VCount*TCount)
493 #define SCount (LCount*NCount)
495 static PyObject*
496 nfd_nfkd(PyObject *self, PyObject *input, int k)
498 PyObject *result;
499 Py_UNICODE *i, *end, *o;
500 /* Longest decomposition in Unicode 3.2: U+FDFA */
501 Py_UNICODE stack[20];
502 Py_ssize_t space, isize;
503 int index, prefix, count, stackptr;
504 unsigned char prev, cur;
506 stackptr = 0;
507 isize = PyUnicode_GET_SIZE(input);
508 /* Overallocate atmost 10 characters. */
509 space = (isize > 10 ? 10 : isize) + isize;
510 result = PyUnicode_FromUnicode(NULL, space);
511 if (!result)
512 return NULL;
513 i = PyUnicode_AS_UNICODE(input);
514 end = i + isize;
515 o = PyUnicode_AS_UNICODE(result);
517 while (i < end) {
518 stack[stackptr++] = *i++;
519 while(stackptr) {
520 Py_UNICODE code = stack[--stackptr];
521 /* Hangul Decomposition adds three characters in
522 a single step, so we need atleast that much room. */
523 if (space < 3) {
524 Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
525 space += 10;
526 if (PyUnicode_Resize(&result, newsize) == -1)
527 return NULL;
528 o = PyUnicode_AS_UNICODE(result) + newsize - space;
530 /* Hangul Decomposition. */
531 if (SBase <= code && code < (SBase+SCount)) {
532 int SIndex = code - SBase;
533 int L = LBase + SIndex / NCount;
534 int V = VBase + (SIndex % NCount) / TCount;
535 int T = TBase + SIndex % TCount;
536 *o++ = L;
537 *o++ = V;
538 space -= 2;
539 if (T != TBase) {
540 *o++ = T;
541 space --;
543 continue;
545 /* normalization changes */
546 if (self) {
547 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
548 if (value != 0) {
549 stack[stackptr++] = value;
550 continue;
554 /* Other decompositions. */
555 get_decomp_record(self, code, &index, &prefix, &count);
557 /* Copy character if it is not decomposable, or has a
558 compatibility decomposition, but we do NFD. */
559 if (!count || (prefix && !k)) {
560 *o++ = code;
561 space--;
562 continue;
564 /* Copy decomposition onto the stack, in reverse
565 order. */
566 while(count) {
567 code = decomp_data[index + (--count)];
568 stack[stackptr++] = code;
573 /* Drop overallocation. Cannot fail. */
574 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
576 /* Sort canonically. */
577 i = PyUnicode_AS_UNICODE(result);
578 prev = _getrecord_ex(*i)->combining;
579 end = i + PyUnicode_GET_SIZE(result);
580 for (i++; i < end; i++) {
581 cur = _getrecord_ex(*i)->combining;
582 if (prev == 0 || cur == 0 || prev <= cur) {
583 prev = cur;
584 continue;
586 /* Non-canonical order. Need to switch *i with previous. */
587 o = i - 1;
588 while (1) {
589 Py_UNICODE tmp = o[1];
590 o[1] = o[0];
591 o[0] = tmp;
592 o--;
593 if (o < PyUnicode_AS_UNICODE(result))
594 break;
595 prev = _getrecord_ex(*o)->combining;
596 if (prev == 0 || prev <= cur)
597 break;
599 prev = _getrecord_ex(*i)->combining;
601 return result;
604 static int
605 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
607 int index;
608 for (index = 0; nfc[index].start; index++) {
609 int start = nfc[index].start;
610 if (code < start)
611 return -1;
612 if (code <= start + nfc[index].count) {
613 int delta = code - start;
614 return nfc[index].index + delta;
617 return -1;
620 static PyObject*
621 nfc_nfkc(PyObject *self, PyObject *input, int k)
623 PyObject *result;
624 Py_UNICODE *i, *i1, *o, *end;
625 int f,l,index,index1,comb;
626 Py_UNICODE code;
627 Py_UNICODE *skipped[20];
628 int cskipped = 0;
630 result = nfd_nfkd(self, input, k);
631 if (!result)
632 return NULL;
634 /* We are going to modify result in-place.
635 If nfd_nfkd is changed to sometimes return the input,
636 this code needs to be reviewed. */
637 assert(result != input);
639 i = PyUnicode_AS_UNICODE(result);
640 end = i + PyUnicode_GET_SIZE(result);
641 o = PyUnicode_AS_UNICODE(result);
643 again:
644 while (i < end) {
645 for (index = 0; index < cskipped; index++) {
646 if (skipped[index] == i) {
647 /* *i character is skipped.
648 Remove from list. */
649 skipped[index] = skipped[cskipped-1];
650 cskipped--;
651 i++;
652 goto again; /* continue while */
655 /* Hangul Composition. We don't need to check for <LV,T>
656 pairs, since we always have decomposed data. */
657 if (LBase <= *i && *i < (LBase+LCount) &&
658 i + 1 < end &&
659 VBase <= i[1] && i[1] <= (VBase+VCount)) {
660 int LIndex, VIndex;
661 LIndex = i[0] - LBase;
662 VIndex = i[1] - VBase;
663 code = SBase + (LIndex*VCount+VIndex)*TCount;
664 i+=2;
665 if (i < end &&
666 TBase <= *i && *i <= (TBase+TCount)) {
667 code += *i-TBase;
668 i++;
670 *o++ = code;
671 continue;
674 f = find_nfc_index(self, nfc_first, *i);
675 if (f == -1) {
676 *o++ = *i++;
677 continue;
679 /* Find next unblocked character. */
680 i1 = i+1;
681 comb = 0;
682 while (i1 < end) {
683 int comb1 = _getrecord_ex(*i1)->combining;
684 if (comb1 && comb == comb1) {
685 /* Character is blocked. */
686 i1++;
687 continue;
689 l = find_nfc_index(self, nfc_last, *i1);
690 /* *i1 cannot be combined with *i. If *i1
691 is a starter, we don't need to look further.
692 Otherwise, record the combining class. */
693 if (l == -1) {
694 not_combinable:
695 if (comb1 == 0)
696 break;
697 comb = comb1;
698 i1++;
699 continue;
701 index = f*TOTAL_LAST + l;
702 index1 = comp_index[index >> COMP_SHIFT];
703 code = comp_data[(index1<<COMP_SHIFT)+
704 (index&((1<<COMP_SHIFT)-1))];
705 if (code == 0)
706 goto not_combinable;
708 /* Replace the original character. */
709 *i = code;
710 /* Mark the second character unused. */
711 skipped[cskipped++] = i1;
712 i1++;
713 f = find_nfc_index(self, nfc_first, *i);
714 if (f == -1)
715 break;
717 *o++ = *i++;
719 if (o != end)
720 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
721 return result;
724 PyDoc_STRVAR(unicodedata_normalize__doc__,
725 "normalize(form, unistr)\n\
727 Return the normal form 'form' for the Unicode string unistr. Valid\n\
728 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
730 static PyObject*
731 unicodedata_normalize(PyObject *self, PyObject *args)
733 char *form;
734 PyObject *input;
736 if(!PyArg_ParseTuple(args, "sO!:normalize",
737 &form, &PyUnicode_Type, &input))
738 return NULL;
740 if (PyUnicode_GetSize(input) == 0) {
741 /* Special case empty input strings, since resizing
742 them later would cause internal errors. */
743 Py_INCREF(input);
744 return input;
747 if (strcmp(form, "NFC") == 0)
748 return nfc_nfkc(self, input, 0);
749 if (strcmp(form, "NFKC") == 0)
750 return nfc_nfkc(self, input, 1);
751 if (strcmp(form, "NFD") == 0)
752 return nfd_nfkd(self, input, 0);
753 if (strcmp(form, "NFKD") == 0)
754 return nfd_nfkd(self, input, 1);
755 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
756 return NULL;
759 /* -------------------------------------------------------------------- */
760 /* unicode character name tables */
762 /* data file generated by Tools/unicode/makeunicodedata.py */
763 #include "unicodename_db.h"
765 /* -------------------------------------------------------------------- */
766 /* database code (cut and pasted from the unidb package) */
/* Case-insensitive hash of the first `len` bytes of `s`. Each byte is
   upper-cased and added to the running hash multiplied by `scale`;
   whenever bits 24-31 become set they are folded back into the low
   bits and the hash is cut to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    const char *cursor = s;
    int remaining;

    for (remaining = len; remaining > 0; remaining--, cursor++) {
        unsigned long overflow;

        hash = (hash * scale) + (unsigned char) toupper(Py_CHARMASK(*cursor));
        overflow = hash & 0xff000000;
        if (overflow != 0)
            hash = (hash ^ ((overflow >> 24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Column 0 is the leading consonant (19 entries), column 1 the vowel
   (21 entries) and column 2 the trailing consonant (28 entries); the
   unused cells of the shorter columns are null pointers. */
static char *hangul_syllables[][3] = {
    /*  L      V      T   */
    { "G",   "A",   ""   },
    { "GG",  "AE",  "G"  },
    { "N",   "YA",  "GG" },
    { "D",   "YAE", "GS" },
    { "DD",  "EO",  "N"  },
    { "R",   "E",   "NJ" },
    { "M",   "YEO", "NH" },
    { "B",   "YE",  "D"  },
    { "BB",  "O",   "L"  },
    { "S",   "WA",  "LG" },
    { "SS",  "WAE", "LM" },
    { "",    "OE",  "LB" },
    { "J",   "YO",  "LS" },
    { "JJ",  "U",   "LT" },
    { "C",   "WEO", "LP" },
    { "K",   "WE",  "LH" },
    { "T",   "WI",  "M"  },
    { "P",   "YU",  "B"  },
    { "H",   "EU",  "BS" },
    { NULL,  "YI",  "S"  },
    { NULL,  "I",   "SS" },
    { NULL,  NULL,  "NG" },
    { NULL,  NULL,  "J"  },
    { NULL,  NULL,  "C"  },
    { NULL,  NULL,  "K"  },
    { NULL,  NULL,  "T"  },
    { NULL,  NULL,  "P"  },
    { NULL,  NULL,  "H"  }
};
814 static int
815 is_unified_ideograph(Py_UCS4 code)
817 return (
818 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
819 (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
820 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
823 static int
824 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
826 int offset;
827 int i;
828 int word;
829 unsigned char* w;
831 if (code >= 0x110000)
832 return 0;
834 if (self) {
835 const change_record *old = get_old_record(self, code);
836 if (old->category_changed == 0) {
837 /* unassigned */
838 return 0;
842 if (SBase <= code && code < SBase+SCount) {
843 /* Hangul syllable. */
844 int SIndex = code - SBase;
845 int L = SIndex / NCount;
846 int V = (SIndex % NCount) / TCount;
847 int T = SIndex % TCount;
849 if (buflen < 27)
850 /* Worst case: HANGUL SYLLABLE <10chars>. */
851 return 0;
852 strcpy(buffer, "HANGUL SYLLABLE ");
853 buffer += 16;
854 strcpy(buffer, hangul_syllables[L][0]);
855 buffer += strlen(hangul_syllables[L][0]);
856 strcpy(buffer, hangul_syllables[V][1]);
857 buffer += strlen(hangul_syllables[V][1]);
858 strcpy(buffer, hangul_syllables[T][2]);
859 buffer += strlen(hangul_syllables[T][2]);
860 *buffer = '\0';
861 return 1;
864 if (is_unified_ideograph(code)) {
865 if (buflen < 28)
866 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
867 return 0;
868 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
869 return 1;
872 /* get offset into phrasebook */
873 offset = phrasebook_offset1[(code>>phrasebook_shift)];
874 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
875 (code&((1<<phrasebook_shift)-1))];
876 if (!offset)
877 return 0;
879 i = 0;
881 for (;;) {
882 /* get word index */
883 word = phrasebook[offset] - phrasebook_short;
884 if (word >= 0) {
885 word = (word << 8) + phrasebook[offset+1];
886 offset += 2;
887 } else
888 word = phrasebook[offset++];
889 if (i) {
890 if (i > buflen)
891 return 0; /* buffer overflow */
892 buffer[i++] = ' ';
894 /* copy word string from lexicon. the last character in the
895 word has bit 7 set. the last word in a string ends with
896 0x80 */
897 w = lexicon + lexicon_offset[word];
898 while (*w < 128) {
899 if (i >= buflen)
900 return 0; /* buffer overflow */
901 buffer[i++] = *w++;
903 if (i >= buflen)
904 return 0; /* buffer overflow */
905 buffer[i++] = *w & 127;
906 if (*w == 128)
907 break; /* end of word */
910 return 1;
913 static int
914 _cmpname(PyObject *self, int code, const char* name, int namelen)
916 /* check if code corresponds to the given name */
917 int i;
918 char buffer[NAME_MAXLEN];
919 if (!_getucname(self, code, buffer, sizeof(buffer)))
920 return 0;
921 for (i = 0; i < namelen; i++) {
922 if (toupper(Py_CHARMASK(name[i])) != buffer[i])
923 return 0;
925 return buffer[namelen] == '\0';
928 static void
929 find_syllable(const char *str, int *len, int *pos, int count, int column)
931 int i, len1;
932 *len = -1;
933 for (i = 0; i < count; i++) {
934 char *s = hangul_syllables[i][column];
935 len1 = strlen(s);
936 if (len1 <= *len)
937 continue;
938 if (strncmp(str, s, len1) == 0) {
939 *len = len1;
940 *pos = i;
943 if (*len == -1) {
944 *len = 0;
948 static int
949 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
951 unsigned int h, v;
952 unsigned int mask = code_size-1;
953 unsigned int i, incr;
955 /* Check for hangul syllables. */
956 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
957 int len, L = -1, V = -1, T = -1;
958 const char *pos = name + 16;
959 find_syllable(pos, &len, &L, LCount, 0);
960 pos += len;
961 find_syllable(pos, &len, &V, VCount, 1);
962 pos += len;
963 find_syllable(pos, &len, &T, TCount, 2);
964 pos += len;
965 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
966 *code = SBase + (L*VCount+V)*TCount + T;
967 return 1;
969 /* Otherwise, it's an illegal syllable name. */
970 return 0;
973 /* Check for unified ideographs. */
974 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
975 /* Four or five hexdigits must follow. */
976 v = 0;
977 name += 22;
978 namelen -= 22;
979 if (namelen != 4 && namelen != 5)
980 return 0;
981 while (namelen--) {
982 v *= 16;
983 if (*name >= '0' && *name <= '9')
984 v += *name - '0';
985 else if (*name >= 'A' && *name <= 'F')
986 v += *name - 'A' + 10;
987 else
988 return 0;
989 name++;
991 if (!is_unified_ideograph(v))
992 return 0;
993 *code = v;
994 return 1;
997 /* the following is the same as python's dictionary lookup, with
998 only minor changes. see the makeunicodedata script for more
999 details */
1001 h = (unsigned int) _gethash(name, namelen, code_magic);
1002 i = (~h) & mask;
1003 v = code_hash[i];
1004 if (!v)
1005 return 0;
1006 if (_cmpname(self, v, name, namelen)) {
1007 *code = v;
1008 return 1;
1010 incr = (h ^ (h >> 3)) & mask;
1011 if (!incr)
1012 incr = mask;
1013 for (;;) {
1014 i = (i + incr) & mask;
1015 v = code_hash[i];
1016 if (!v)
1017 return 0;
1018 if (_cmpname(self, v, name, namelen)) {
1019 *code = v;
1020 return 1;
1022 incr = incr << 1;
1023 if (incr > mask)
1024 incr = incr ^ code_poly;
1028 static const _PyUnicode_Name_CAPI hashAPI =
1030 sizeof(_PyUnicode_Name_CAPI),
1031 _getucname,
1032 _getcode
1035 /* -------------------------------------------------------------------- */
1036 /* Python bindings */
1038 PyDoc_STRVAR(unicodedata_name__doc__,
1039 "name(unichr[, default])\n\
1040 Returns the name assigned to the Unicode character unichr as a\n\
1041 string. If no name is defined, default is returned, or, if not\n\
1042 given, ValueError is raised.");
1044 static PyObject *
1045 unicodedata_name(PyObject* self, PyObject* args)
1047 char name[NAME_MAXLEN];
1048 Py_UCS4 c;
1050 PyUnicodeObject* v;
1051 PyObject* defobj = NULL;
1052 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1053 return NULL;
1055 c = getuchar(v);
1056 if (c == (Py_UCS4)-1)
1057 return NULL;
1059 if (!_getucname(self, c, name, sizeof(name))) {
1060 if (defobj == NULL) {
1061 PyErr_SetString(PyExc_ValueError, "no such name");
1062 return NULL;
1064 else {
1065 Py_INCREF(defobj);
1066 return defobj;
1070 return Py_BuildValue("s", name);
1073 PyDoc_STRVAR(unicodedata_lookup__doc__,
1074 "lookup(name)\n\
1076 Look up character by name. If a character with the\n\
1077 given name is found, return the corresponding Unicode\n\
1078 character. If not found, KeyError is raised.");
1080 static PyObject *
1081 unicodedata_lookup(PyObject* self, PyObject* args)
1083 Py_UCS4 code;
1084 Py_UNICODE str[2];
1086 char* name;
1087 int namelen;
1088 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1089 return NULL;
1091 if (!_getcode(self, name, namelen, &code)) {
1092 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1093 name);
1094 return NULL;
1097 #ifndef Py_UNICODE_WIDE
1098 if (code >= 0x10000) {
1099 str[0] = 0xd800 + ((code - 0x10000) >> 10);
1100 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1101 return PyUnicode_FromUnicode(str, 2);
1103 #endif
1104 str[0] = (Py_UNICODE) code;
1105 return PyUnicode_FromUnicode(str, 1);
1108 /* XXX Add doc strings. */
1110 static PyMethodDef unicodedata_functions[] = {
1111 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1112 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1113 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1114 {"category", unicodedata_category, METH_VARARGS,
1115 unicodedata_category__doc__},
1116 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1117 unicodedata_bidirectional__doc__},
1118 {"combining", unicodedata_combining, METH_VARARGS,
1119 unicodedata_combining__doc__},
1120 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1121 unicodedata_mirrored__doc__},
1122 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1123 unicodedata_east_asian_width__doc__},
1124 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1125 unicodedata_decomposition__doc__},
1126 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1127 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1128 {"normalize", unicodedata_normalize, METH_VARARGS,
1129 unicodedata_normalize__doc__},
1130 {NULL, NULL} /* sentinel */
1133 static PyTypeObject UCD_Type = {
1134 /* The ob_type field must be initialized in the module init function
1135 * to be portable to Windows without using C++. */
1136 PyVarObject_HEAD_INIT(NULL, 0)
1137 "unicodedata.UCD", /*tp_name*/
1138 sizeof(PreviousDBVersion), /*tp_basicsize*/
1139 0, /*tp_itemsize*/
1140 /* methods */
1141 (destructor)PyObject_Del, /*tp_dealloc*/
1142 0, /*tp_print*/
1143 0, /*tp_getattr*/
1144 0, /*tp_setattr*/
1145 0, /*tp_compare*/
1146 0, /*tp_repr*/
1147 0, /*tp_as_number*/
1148 0, /*tp_as_sequence*/
1149 0, /*tp_as_mapping*/
1150 0, /*tp_hash*/
1151 0, /*tp_call*/
1152 0, /*tp_str*/
1153 PyObject_GenericGetAttr,/*tp_getattro*/
1154 0, /*tp_setattro*/
1155 0, /*tp_as_buffer*/
1156 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1157 0, /*tp_doc*/
1158 0, /*tp_traverse*/
1159 0, /*tp_clear*/
1160 0, /*tp_richcompare*/
1161 0, /*tp_weaklistoffset*/
1162 0, /*tp_iter*/
1163 0, /*tp_iternext*/
1164 unicodedata_functions, /*tp_methods*/
1165 DB_members, /*tp_members*/
1166 0, /*tp_getset*/
1167 0, /*tp_base*/
1168 0, /*tp_dict*/
1169 0, /*tp_descr_get*/
1170 0, /*tp_descr_set*/
1171 0, /*tp_dictoffset*/
1172 0, /*tp_init*/
1173 0, /*tp_alloc*/
1174 0, /*tp_new*/
1175 0, /*tp_free*/
1176 0, /*tp_is_gc*/
1179 PyDoc_STRVAR(unicodedata_docstring,
1180 "This module provides access to the Unicode Character Database which\n\
1181 defines character properties for all Unicode characters. The data in\n\
1182 this database is based on the UnicodeData.txt file version\n\
1183 5.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
1185 The module uses the same names and symbols as defined by the\n\
1186 UnicodeData File Format 5.1.0 (see\n\
1187 http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
1189 PyMODINIT_FUNC
1190 initunicodedata(void)
1192 PyObject *m, *v;
1194 Py_TYPE(&UCD_Type) = &PyType_Type;
1196 m = Py_InitModule3(
1197 "unicodedata", unicodedata_functions, unicodedata_docstring);
1198 if (!m)
1199 return;
1201 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1202 Py_INCREF(&UCD_Type);
1203 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1205 /* Previous versions */
1206 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1207 if (v != NULL)
1208 PyModule_AddObject(m, "ucd_3_2_0", v);
1210 /* Export C API */
1211 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
1212 if (v != NULL)
1213 PyModule_AddObject(m, "ucnhash_CAPI", v);
1217 Local variables:
1218 c-basic-offset: 4
1219 indent-tabs-mode: nil
1220 End: