Exceptions raised during renaming in rotating file handlers are now passed to handleE...
[python.git] / Modules / unicodedata.c
blobbe966f096450a88fcba091bbb7952c2c050faa0b
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 3.2 data base.

   Data was extracted from the Unicode 3.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
15 #include "Python.h"
16 #include "ucnhash.h"
18 /* character properties */
/* One row of per-character properties.  Several fields are not the
   property value itself but an index into a table of names. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
31 /* data file generated by Tools/unicode/makeunicodedata.py */
32 #include "unicodedata_db.h"
34 static const _PyUnicode_DatabaseRecord*
35 _getrecord_ex(Py_UCS4 code)
37 int index;
38 if (code >= 0x110000)
39 index = 0;
40 else {
41 index = index1[(code>>SHIFT)];
42 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
45 return &_PyUnicode_Database_Records[index];
48 static const _PyUnicode_DatabaseRecord*
49 _getrecord(PyUnicodeObject* v)
51 return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
54 /* --- Module API --------------------------------------------------------- */
56 PyDoc_STRVAR(unicodedata_decimal__doc__,
57 "decimal(unichr[, default])\n\
58 \n\
59 Returns the decimal value assigned to the Unicode character unichr\n\
60 as integer. If no such value is defined, default is returned, or, if\n\
61 not given, ValueError is raised.");
63 static PyObject *
64 unicodedata_decimal(PyObject *self, PyObject *args)
66 PyUnicodeObject *v;
67 PyObject *defobj = NULL;
68 long rc;
70 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
71 return NULL;
72 if (PyUnicode_GET_SIZE(v) != 1) {
73 PyErr_SetString(PyExc_TypeError,
74 "need a single Unicode character as parameter");
75 return NULL;
77 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
78 if (rc < 0) {
79 if (defobj == NULL) {
80 PyErr_SetString(PyExc_ValueError,
81 "not a decimal");
82 return NULL;
84 else {
85 Py_INCREF(defobj);
86 return defobj;
89 return PyInt_FromLong(rc);
92 PyDoc_STRVAR(unicodedata_digit__doc__,
93 "digit(unichr[, default])\n\
94 \n\
95 Returns the digit value assigned to the Unicode character unichr as\n\
96 integer. If no such value is defined, default is returned, or, if\n\
97 not given, ValueError is raised.");
99 static PyObject *
100 unicodedata_digit(PyObject *self, PyObject *args)
102 PyUnicodeObject *v;
103 PyObject *defobj = NULL;
104 long rc;
106 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
107 return NULL;
108 if (PyUnicode_GET_SIZE(v) != 1) {
109 PyErr_SetString(PyExc_TypeError,
110 "need a single Unicode character as parameter");
111 return NULL;
113 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
114 if (rc < 0) {
115 if (defobj == NULL) {
116 PyErr_SetString(PyExc_ValueError, "not a digit");
117 return NULL;
119 else {
120 Py_INCREF(defobj);
121 return defobj;
124 return PyInt_FromLong(rc);
127 PyDoc_STRVAR(unicodedata_numeric__doc__,
128 "numeric(unichr[, default])\n\
130 Returns the numeric value assigned to the Unicode character unichr\n\
131 as float. If no such value is defined, default is returned, or, if\n\
132 not given, ValueError is raised.");
134 static PyObject *
135 unicodedata_numeric(PyObject *self, PyObject *args)
137 PyUnicodeObject *v;
138 PyObject *defobj = NULL;
139 double rc;
141 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
142 return NULL;
143 if (PyUnicode_GET_SIZE(v) != 1) {
144 PyErr_SetString(PyExc_TypeError,
145 "need a single Unicode character as parameter");
146 return NULL;
148 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
149 if (rc < 0) {
150 if (defobj == NULL) {
151 PyErr_SetString(PyExc_ValueError, "not a numeric character");
152 return NULL;
154 else {
155 Py_INCREF(defobj);
156 return defobj;
159 return PyFloat_FromDouble(rc);
162 PyDoc_STRVAR(unicodedata_category__doc__,
163 "category(unichr)\n\
165 Returns the general category assigned to the Unicode character\n\
166 unichr as string.");
168 static PyObject *
169 unicodedata_category(PyObject *self, PyObject *args)
171 PyUnicodeObject *v;
172 int index;
174 if (!PyArg_ParseTuple(args, "O!:category",
175 &PyUnicode_Type, &v))
176 return NULL;
177 if (PyUnicode_GET_SIZE(v) != 1) {
178 PyErr_SetString(PyExc_TypeError,
179 "need a single Unicode character as parameter");
180 return NULL;
182 index = (int) _getrecord(v)->category;
183 return PyString_FromString(_PyUnicode_CategoryNames[index]);
186 PyDoc_STRVAR(unicodedata_bidirectional__doc__,
187 "bidirectional(unichr)\n\
189 Returns the bidirectional category assigned to the Unicode character\n\
190 unichr as string. If no such value is defined, an empty string is\n\
191 returned.");
193 static PyObject *
194 unicodedata_bidirectional(PyObject *self, PyObject *args)
196 PyUnicodeObject *v;
197 int index;
199 if (!PyArg_ParseTuple(args, "O!:bidirectional",
200 &PyUnicode_Type, &v))
201 return NULL;
202 if (PyUnicode_GET_SIZE(v) != 1) {
203 PyErr_SetString(PyExc_TypeError,
204 "need a single Unicode character as parameter");
205 return NULL;
207 index = (int) _getrecord(v)->bidirectional;
208 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
211 PyDoc_STRVAR(unicodedata_combining__doc__,
212 "combining(unichr)\n\
214 Returns the canonical combining class assigned to the Unicode\n\
215 character unichr as integer. Returns 0 if no combining class is\n\
216 defined.");
218 static PyObject *
219 unicodedata_combining(PyObject *self, PyObject *args)
221 PyUnicodeObject *v;
223 if (!PyArg_ParseTuple(args, "O!:combining",
224 &PyUnicode_Type, &v))
225 return NULL;
226 if (PyUnicode_GET_SIZE(v) != 1) {
227 PyErr_SetString(PyExc_TypeError,
228 "need a single Unicode character as parameter");
229 return NULL;
231 return PyInt_FromLong((int) _getrecord(v)->combining);
234 PyDoc_STRVAR(unicodedata_mirrored__doc__,
235 "mirrored(unichr)\n\
237 Returns the mirrored property assigned to the Unicode character\n\
238 unichr as integer. Returns 1 if the character has been identified as\n\
239 a \"mirrored\" character in bidirectional text, 0 otherwise.");
241 static PyObject *
242 unicodedata_mirrored(PyObject *self, PyObject *args)
244 PyUnicodeObject *v;
246 if (!PyArg_ParseTuple(args, "O!:mirrored",
247 &PyUnicode_Type, &v))
248 return NULL;
249 if (PyUnicode_GET_SIZE(v) != 1) {
250 PyErr_SetString(PyExc_TypeError,
251 "need a single Unicode character as parameter");
252 return NULL;
254 return PyInt_FromLong((int) _getrecord(v)->mirrored);
257 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
258 "east_asian_width(unichr)\n\
260 Returns the east asian width assigned to the Unicode character\n\
261 unichr as string.");
263 static PyObject *
264 unicodedata_east_asian_width(PyObject *self, PyObject *args)
266 PyUnicodeObject *v;
267 int index;
269 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
270 &PyUnicode_Type, &v))
271 return NULL;
272 if (PyUnicode_GET_SIZE(v) != 1) {
273 PyErr_SetString(PyExc_TypeError,
274 "need a single Unicode character as parameter");
275 return NULL;
277 index = (int) _getrecord(v)->east_asian_width;
278 return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
281 PyDoc_STRVAR(unicodedata_decomposition__doc__,
282 "decomposition(unichr)\n\
284 Returns the character decomposition mapping assigned to the Unicode\n\
285 character unichr as string. An empty string is returned in case no\n\
286 such mapping is defined.");
288 static PyObject *
289 unicodedata_decomposition(PyObject *self, PyObject *args)
291 PyUnicodeObject *v;
292 char decomp[256];
293 int code, index, count, i;
295 if (!PyArg_ParseTuple(args, "O!:decomposition",
296 &PyUnicode_Type, &v))
297 return NULL;
298 if (PyUnicode_GET_SIZE(v) != 1) {
299 PyErr_SetString(PyExc_TypeError,
300 "need a single Unicode character as parameter");
301 return NULL;
304 code = (int) *PyUnicode_AS_UNICODE(v);
306 if (code < 0 || code >= 0x110000)
307 index = 0;
308 else {
309 index = decomp_index1[(code>>DECOMP_SHIFT)];
310 index = decomp_index2[(index<<DECOMP_SHIFT)+
311 (code&((1<<DECOMP_SHIFT)-1))];
314 /* high byte is number of hex bytes (usually one or two), low byte
315 is prefix code (from*/
316 count = decomp_data[index] >> 8;
318 /* XXX: could allocate the PyString up front instead
319 (strlen(prefix) + 5 * count + 1 bytes) */
321 /* copy prefix */
322 i = strlen(decomp_prefix[decomp_data[index] & 255]);
323 memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
325 while (count-- > 0) {
326 if (i)
327 decomp[i++] = ' ';
328 assert((size_t)i < sizeof(decomp));
329 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
330 decomp_data[++index]);
331 i += strlen(decomp + i);
334 decomp[i] = '\0';
336 return PyString_FromString(decomp);
339 void
340 get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
342 if (code >= 0x110000) {
343 *index = 0;
345 else {
346 *index = decomp_index1[(code>>DECOMP_SHIFT)];
347 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
348 (code&((1<<DECOMP_SHIFT)-1))];
351 /* high byte is number of hex bytes (usually one or two), low byte
352 is prefix code (from*/
353 *count = decomp_data[*index] >> 8;
354 *prefix = decomp_data[*index] & 255;
356 (*index)++;
/* Constants for algorithmic Hangul syllable (de)composition. */

/* First code point of each range. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7

/* Number of entries in each range. */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
369 static PyObject*
370 nfd_nfkd(PyObject *input, int k)
372 PyObject *result;
373 Py_UNICODE *i, *end, *o;
374 /* Longest decomposition in Unicode 3.2: U+FDFA */
375 Py_UNICODE stack[20];
376 int space, stackptr, isize;
377 int index, prefix, count;
378 unsigned char prev, cur;
380 stackptr = 0;
381 isize = PyUnicode_GET_SIZE(input);
382 /* Overallocate atmost 10 characters. */
383 space = (isize > 10 ? 10 : isize) + isize;
384 result = PyUnicode_FromUnicode(NULL, space);
385 if (!result)
386 return NULL;
387 i = PyUnicode_AS_UNICODE(input);
388 end = i + isize;
389 o = PyUnicode_AS_UNICODE(result);
391 while (i < end) {
392 stack[stackptr++] = *i++;
393 while(stackptr) {
394 Py_UNICODE code = stack[--stackptr];
395 /* Hangul Decomposition adds three characters in
396 a single step, so we need atleast that much room. */
397 if (space < 3) {
398 int newsize = PyString_GET_SIZE(result) + 10;
399 space += 10;
400 if (PyUnicode_Resize(&result, newsize) == -1)
401 return NULL;
402 o = PyUnicode_AS_UNICODE(result) + newsize - space;
404 /* Hangul Decomposition. */
405 if (SBase <= code && code < (SBase+SCount)) {
406 int SIndex = code - SBase;
407 int L = LBase + SIndex / NCount;
408 int V = VBase + (SIndex % NCount) / TCount;
409 int T = TBase + SIndex % TCount;
410 *o++ = L;
411 *o++ = V;
412 space -= 2;
413 if (T != TBase) {
414 *o++ = T;
415 space --;
417 continue;
419 /* Other decompoistions. */
420 get_decomp_record(code, &index, &prefix, &count);
422 /* Copy character if it is not decomposable, or has a
423 compatibility decomposition, but we do NFD. */
424 if (!count || (prefix && !k)) {
425 *o++ = code;
426 space--;
427 continue;
429 /* Copy decomposition onto the stack, in reverse
430 order. */
431 while(count) {
432 code = decomp_data[index + (--count)];
433 stack[stackptr++] = code;
438 /* Drop overallocation. Cannot fail. */
439 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
441 /* Sort canonically. */
442 i = PyUnicode_AS_UNICODE(result);
443 prev = _getrecord_ex(*i)->combining;
444 end = i + PyUnicode_GET_SIZE(result);
445 for (i++; i < end; i++) {
446 cur = _getrecord_ex(*i)->combining;
447 if (prev == 0 || cur == 0 || prev <= cur) {
448 prev = cur;
449 continue;
451 /* Non-canonical order. Need to switch *i with previous. */
452 o = i - 1;
453 while (1) {
454 Py_UNICODE tmp = o[1];
455 o[1] = o[0];
456 o[0] = tmp;
457 o--;
458 if (o < PyUnicode_AS_UNICODE(result))
459 break;
460 prev = _getrecord_ex(*o)->combining;
461 if (prev == 0 || prev <= cur)
462 break;
464 prev = _getrecord_ex(*i)->combining;
466 return result;
469 static int
470 find_nfc_index(struct reindex* nfc, Py_UNICODE code)
472 int index;
473 for (index = 0; nfc[index].start; index++) {
474 int start = nfc[index].start;
475 if (code < start)
476 return -1;
477 if (code <= start + nfc[index].count) {
478 int delta = code - start;
479 return nfc[index].index + delta;
482 return -1;
485 static PyObject*
486 nfc_nfkc(PyObject *input, int k)
488 PyObject *result;
489 Py_UNICODE *i, *i1, *o, *end;
490 int f,l,index,index1,comb;
491 Py_UNICODE code;
492 Py_UNICODE *skipped[20];
493 int cskipped = 0;
495 result = nfd_nfkd(input, k);
496 if (!result)
497 return NULL;
499 /* We are going to modify result in-place.
500 If nfd_nfkd is changed to sometimes return the input,
501 this code needs to be reviewed. */
502 assert(result != input);
504 i = PyUnicode_AS_UNICODE(result);
505 end = i + PyUnicode_GET_SIZE(result);
506 o = PyUnicode_AS_UNICODE(result);
508 again:
509 while (i < end) {
510 for (index = 0; index < cskipped; index++) {
511 if (skipped[index] == i) {
512 /* *i character is skipped.
513 Remove from list. */
514 skipped[index] = skipped[cskipped-1];
515 cskipped--;
516 i++;
517 goto again; /* continue while */
520 /* Hangul Composition. We don't need to check for <LV,T>
521 pairs, since we always have decomposed data. */
522 if (LBase <= *i && *i < (LBase+LCount) &&
523 i + 1 < end &&
524 VBase <= i[1] && i[1] <= (VBase+VCount)) {
525 int LIndex, VIndex;
526 LIndex = i[0] - LBase;
527 VIndex = i[1] - VBase;
528 code = SBase + (LIndex*VCount+VIndex)*TCount;
529 i+=2;
530 if (i < end &&
531 TBase <= *i && *i <= (TBase+TCount)) {
532 code += *i-TBase;
533 i++;
535 *o++ = code;
536 continue;
539 f = find_nfc_index(nfc_first, *i);
540 if (f == -1) {
541 *o++ = *i++;
542 continue;
544 /* Find next unblocked character. */
545 i1 = i+1;
546 comb = 0;
547 while (i1 < end) {
548 int comb1 = _getrecord_ex(*i1)->combining;
549 if (comb1 && comb == comb1) {
550 /* Character is blocked. */
551 i1++;
552 continue;
554 l = find_nfc_index(nfc_last, *i1);
555 /* *i1 cannot be combined with *i. If *i1
556 is a starter, we don't need to look further.
557 Otherwise, record the combining class. */
558 if (l == -1) {
559 not_combinable:
560 if (comb1 == 0)
561 break;
562 comb = comb1;
563 i1++;
564 continue;
566 index = f*TOTAL_LAST + l;
567 index1 = comp_index[index >> COMP_SHIFT];
568 code = comp_data[(index1<<COMP_SHIFT)+
569 (index&((1<<COMP_SHIFT)-1))];
570 if (code == 0)
571 goto not_combinable;
573 /* Replace the original character. */
574 *i = code;
575 /* Mark the second character unused. */
576 skipped[cskipped++] = i1;
577 i1++;
578 f = find_nfc_index(nfc_first, *i);
579 if (f == -1)
580 break;
582 *o++ = *i++;
584 if (o != end)
585 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
586 return result;
589 PyDoc_STRVAR(unicodedata_normalize__doc__,
590 "normalize(form, unistr)\n\
592 Return the normal form 'form' for the Unicode string unistr. Valid\n\
593 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
595 static PyObject*
596 unicodedata_normalize(PyObject *self, PyObject *args)
598 char *form;
599 PyObject *input;
601 if(!PyArg_ParseTuple(args, "sO!:normalize",
602 &form, &PyUnicode_Type, &input))
603 return NULL;
605 if (PyUnicode_GetSize(input) == 0) {
606 /* Special case empty input strings, since resizing
607 them later would cause internal errors. */
608 Py_INCREF(input);
609 return input;
612 if (strcmp(form, "NFC") == 0)
613 return nfc_nfkc(input, 0);
614 if (strcmp(form, "NFKC") == 0)
615 return nfc_nfkc(input, 1);
616 if (strcmp(form, "NFD") == 0)
617 return nfd_nfkd(input, 0);
618 if (strcmp(form, "NFKD") == 0)
619 return nfd_nfkd(input, 1);
620 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
621 return NULL;
624 /* -------------------------------------------------------------------- */
625 /* unicode character name tables */
627 /* data file generated by Tools/unicode/makeunicodedata.py */
628 #include "unicodename_db.h"
630 /* -------------------------------------------------------------------- */
631 /* database code (cut and pasted from the unidb package) */
/* Case-insensitive hash of the first len bytes of s.

   Each byte is upper-cased and folded in as h = h*scale + byte; whenever
   bits 24-31 become non-zero they are XORed into the low byte and h is
   cut back to 24 bits.  The result is therefore always below 2**24. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        /* toupper() is only defined for unsigned char values and EOF, so
           the byte must be converted before the call, not after. */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
/* Name fragments for Hangul syllables.  Column 0 has LCount entries,
   column 1 has VCount entries and column 2 has TCount entries; rows past
   the end of a shorter column hold a null pointer. */
static char *hangul_syllables[][3] = {
    /* column 0, column 1, column 2 */
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
679 static int
680 is_unified_ideograph(Py_UCS4 code)
682 return (
683 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
684 (0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
685 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
688 static int
689 _getucname(Py_UCS4 code, char* buffer, int buflen)
691 int offset;
692 int i;
693 int word;
694 unsigned char* w;
696 if (SBase <= code && code < SBase+SCount) {
697 /* Hangul syllable. */
698 int SIndex = code - SBase;
699 int L = SIndex / NCount;
700 int V = (SIndex % NCount) / TCount;
701 int T = SIndex % TCount;
703 if (buflen < 27)
704 /* Worst case: HANGUL SYLLABLE <10chars>. */
705 return 0;
706 strcpy(buffer, "HANGUL SYLLABLE ");
707 buffer += 16;
708 strcpy(buffer, hangul_syllables[L][0]);
709 buffer += strlen(hangul_syllables[L][0]);
710 strcpy(buffer, hangul_syllables[V][1]);
711 buffer += strlen(hangul_syllables[V][1]);
712 strcpy(buffer, hangul_syllables[T][2]);
713 buffer += strlen(hangul_syllables[T][2]);
714 *buffer = '\0';
715 return 1;
718 if (is_unified_ideograph(code)) {
719 if (buflen < 28)
720 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
721 return 0;
722 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
723 return 1;
726 if (code >= 0x110000)
727 return 0;
729 /* get offset into phrasebook */
730 offset = phrasebook_offset1[(code>>phrasebook_shift)];
731 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
732 (code&((1<<phrasebook_shift)-1))];
733 if (!offset)
734 return 0;
736 i = 0;
738 for (;;) {
739 /* get word index */
740 word = phrasebook[offset] - phrasebook_short;
741 if (word >= 0) {
742 word = (word << 8) + phrasebook[offset+1];
743 offset += 2;
744 } else
745 word = phrasebook[offset++];
746 if (i) {
747 if (i > buflen)
748 return 0; /* buffer overflow */
749 buffer[i++] = ' ';
751 /* copy word string from lexicon. the last character in the
752 word has bit 7 set. the last word in a string ends with
753 0x80 */
754 w = lexicon + lexicon_offset[word];
755 while (*w < 128) {
756 if (i >= buflen)
757 return 0; /* buffer overflow */
758 buffer[i++] = *w++;
760 if (i >= buflen)
761 return 0; /* buffer overflow */
762 buffer[i++] = *w & 127;
763 if (*w == 128)
764 break; /* end of word */
767 return 1;
770 static int
771 _cmpname(int code, const char* name, int namelen)
773 /* check if code corresponds to the given name */
774 int i;
775 char buffer[NAME_MAXLEN];
776 if (!_getucname(code, buffer, sizeof(buffer)))
777 return 0;
778 for (i = 0; i < namelen; i++) {
779 if (toupper(name[i]) != buffer[i])
780 return 0;
782 return buffer[namelen] == '\0';
785 static void
786 find_syllable(const char *str, int *len, int *pos, int count, int column)
788 int i, len1;
789 *len = -1;
790 for (i = 0; i < count; i++) {
791 char *s = hangul_syllables[i][column];
792 len1 = strlen(s);
793 if (len1 <= *len)
794 continue;
795 if (strncmp(str, s, len1) == 0) {
796 *len = len1;
797 *pos = i;
800 if (*len == -1) {
801 *len = 0;
802 *pos = -1;
806 static int
807 _getcode(const char* name, int namelen, Py_UCS4* code)
809 unsigned int h, v;
810 unsigned int mask = code_size-1;
811 unsigned int i, incr;
813 /* Check for hangul syllables. */
814 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
815 int L, V, T, len;
816 const char *pos = name + 16;
817 find_syllable(pos, &len, &L, LCount, 0);
818 pos += len;
819 find_syllable(pos, &len, &V, VCount, 1);
820 pos += len;
821 find_syllable(pos, &len, &T, TCount, 2);
822 pos += len;
823 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
824 *code = SBase + (L*VCount+V)*TCount + T;
825 return 1;
827 /* Otherwise, it's an illegal syllable name. */
828 return 0;
831 /* Check for unified ideographs. */
832 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
833 /* Four or five hexdigits must follow. */
834 v = 0;
835 name += 22;
836 namelen -= 22;
837 if (namelen != 4 && namelen != 5)
838 return 0;
839 while (namelen--) {
840 v *= 16;
841 if (*name >= '0' && *name <= '9')
842 v += *name - '0';
843 else if (*name >= 'A' && *name <= 'F')
844 v += *name - 'A' + 10;
845 else
846 return 0;
847 name++;
849 if (!is_unified_ideograph(v))
850 return 0;
851 *code = v;
852 return 1;
855 /* the following is the same as python's dictionary lookup, with
856 only minor changes. see the makeunicodedata script for more
857 details */
859 h = (unsigned int) _gethash(name, namelen, code_magic);
860 i = (~h) & mask;
861 v = code_hash[i];
862 if (!v)
863 return 0;
864 if (_cmpname(v, name, namelen)) {
865 *code = v;
866 return 1;
868 incr = (h ^ (h >> 3)) & mask;
869 if (!incr)
870 incr = mask;
871 for (;;) {
872 i = (i + incr) & mask;
873 v = code_hash[i];
874 if (!v)
875 return 0;
876 if (_cmpname(v, name, namelen)) {
877 *code = v;
878 return 1;
880 incr = incr << 1;
881 if (incr > mask)
882 incr = incr ^ code_poly;
886 static const _PyUnicode_Name_CAPI hashAPI =
888 sizeof(_PyUnicode_Name_CAPI),
889 _getucname,
890 _getcode
893 /* -------------------------------------------------------------------- */
894 /* Python bindings */
896 PyDoc_STRVAR(unicodedata_name__doc__,
897 "name(unichr[, default])\n\
898 Returns the name assigned to the Unicode character unichr as a\n\
899 string. If no name is defined, default is returned, or, if not\n\
900 given, ValueError is raised.");
902 static PyObject *
903 unicodedata_name(PyObject* self, PyObject* args)
905 char name[NAME_MAXLEN];
907 PyUnicodeObject* v;
908 PyObject* defobj = NULL;
909 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
910 return NULL;
912 if (PyUnicode_GET_SIZE(v) != 1) {
913 PyErr_SetString(PyExc_TypeError,
914 "need a single Unicode character as parameter");
915 return NULL;
918 if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
919 name, sizeof(name))) {
920 if (defobj == NULL) {
921 PyErr_SetString(PyExc_ValueError, "no such name");
922 return NULL;
924 else {
925 Py_INCREF(defobj);
926 return defobj;
930 return Py_BuildValue("s", name);
933 PyDoc_STRVAR(unicodedata_lookup__doc__,
934 "lookup(name)\n\
936 Look up character by name. If a character with the\n\
937 given name is found, return the corresponding Unicode\n\
938 character. If not found, KeyError is raised.");
940 static PyObject *
941 unicodedata_lookup(PyObject* self, PyObject* args)
943 Py_UCS4 code;
944 Py_UNICODE str[1];
946 char* name;
947 int namelen;
948 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
949 return NULL;
951 if (!_getcode(name, namelen, &code)) {
952 char fmt[] = "undefined character name '%s'";
953 char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
954 sprintf(buf, fmt, name);
955 PyErr_SetString(PyExc_KeyError, buf);
956 PyMem_FREE(buf);
957 return NULL;
960 str[0] = (Py_UNICODE) code;
961 return PyUnicode_FromUnicode(str, 1);
964 /* XXX Add doc strings. */
966 static PyMethodDef unicodedata_functions[] = {
967 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
968 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
969 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
970 {"category", unicodedata_category, METH_VARARGS,
971 unicodedata_category__doc__},
972 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
973 unicodedata_bidirectional__doc__},
974 {"combining", unicodedata_combining, METH_VARARGS,
975 unicodedata_combining__doc__},
976 {"mirrored", unicodedata_mirrored, METH_VARARGS,
977 unicodedata_mirrored__doc__},
978 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
979 unicodedata_east_asian_width__doc__},
980 {"decomposition", unicodedata_decomposition, METH_VARARGS,
981 unicodedata_decomposition__doc__},
982 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
983 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
984 {"normalize", unicodedata_normalize, METH_VARARGS,
985 unicodedata_normalize__doc__},
986 {NULL, NULL} /* sentinel */
989 PyDoc_STRVAR(unicodedata_docstring,
990 "This module provides access to the Unicode Character Database which\n\
991 defines character properties for all Unicode characters. The data in\n\
992 this database is based on the UnicodeData.txt file version\n\
993 3.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
995 The module uses the same names and symbols as defined by the\n\
996 UnicodeData File Format 3.2.0 (see\n\
997 http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html).");
999 PyMODINIT_FUNC
1000 initunicodedata(void)
1002 PyObject *m, *v;
1004 m = Py_InitModule3(
1005 "unicodedata", unicodedata_functions, unicodedata_docstring);
1006 if (!m)
1007 return;
1009 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1011 /* Export C API */
1012 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
1013 if (v != NULL)
1014 PyModule_AddObject(m, "ucnhash_CAPI", v);
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/