/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 4.1 data base.

   Data was extracted from the Unicode 4.1 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"

/* character properties */

typedef struct {
    const unsigned char category;         /* index into
                                             _PyUnicode_CategoryNames */
    const unsigned char combining;        /* combining class value 0 - 255 */
    const unsigned char bidirectional;    /* index into
                                             _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;         /* true if mirrored in bidir mode */
    const unsigned char east_asian_width; /* index into
                                             _PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord;
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const int numeric_changed;
} change_record;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"
static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}
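
/* Illustrative note (editorial, not part of the original source): the two
   lookup tables form a two-level trie.  index1 maps the high bits of the
   code point to a block number, and index2 maps (block, low bits) to the
   index of the shared property record.  With the SHIFT value emitted by
   Tools/unicode/makeunicodedata.py, a code point c is resolved roughly as

       block  = index1[c >> SHIFT];
       record = index2[(block << SHIFT) + (c & ((1 << SHIFT) - 1))];

   so characters with identical properties can share a single record. */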
/* ------------- Previous-version API ------------------------------------- */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;
    const change_record* (*getrecord)(Py_UCS4);
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;

#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))

static PyMemberDef DB_members[] = {
    {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};

/* forward declaration */
static PyTypeObject UCD_Type;
static PyObject*
new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *self;
    self = PyObject_New(PreviousDBVersion, &UCD_Type);
    if (self == NULL)
        return NULL;
    self->name = name;
    self->getrecord = getrecord;
    self->normalization = normalization;
    return (PyObject*)self;
}
static Py_UCS4 getuchar(PyUnicodeObject *obj)
{
    Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);

    if (PyUnicode_GET_SIZE(obj) == 1)
        return *v;
#ifndef Py_UNICODE_WIDE
    else if ((PyUnicode_GET_SIZE(obj) == 2) &&
             (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
             (0xDC00 <= v[1] && v[1] <= 0xDFFF))
        return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
#endif
    PyErr_SetString(PyExc_TypeError,
                    "need a single Unicode character as parameter");
    return (Py_UCS4)-1;
}
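
/* Worked example (editorial): on a narrow (UTF-16) build, a character
   outside the BMP arrives as a surrogate pair.  For U+1D11E (MUSICAL
   SYMBOL G CLEF) the pair is 0xD834, 0xDD1E, and the expression above
   recombines it as ((0xD834 & 0x3FF) << 10 | (0xDD1E & 0x3FF)) + 0x10000
   = (0x34 << 10 | 0x11E) + 0x10000 = 0x1D11E. */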
/* --- Module API --------------------------------------------------------- */

PyDoc_STRVAR(unicodedata_decimal__doc__,
"decimal(unichr[, default])\n\
\n\
Returns the decimal value assigned to the Unicode character unichr\n\
as integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}
PyDoc_STRVAR(unicodedata_digit__doc__,
"digit(unichr[, default])\n\
\n\
Returns the digit value assigned to the Unicode character unichr as\n\
integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    rc = Py_UNICODE_TODIGIT(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a digit");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}
PyDoc_STRVAR(unicodedata_numeric__doc__,
"numeric(unichr[, default])\n\
\n\
Returns the numeric value assigned to the Unicode character unichr\n\
as float. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");

static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    double rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1.0;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(c);
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}
PyDoc_STRVAR(unicodedata_category__doc__,
"category(unichr)\n\
\n\
Returns the general category assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:category",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->category;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed != 0xFF)
            index = old->category_changed;
    }
    return PyString_FromString(_PyUnicode_CategoryNames[index]);
}
PyDoc_STRVAR(unicodedata_bidirectional__doc__,
"bidirectional(unichr)\n\
\n\
Returns the bidirectional category assigned to the Unicode character\n\
unichr as string. If no such value is defined, an empty string is\n\
returned.");

static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->bidirectional;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            index = old->bidir_changed;
    }
    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
}
PyDoc_STRVAR(unicodedata_combining__doc__,
"combining(unichr)\n\
\n\
Returns the canonical combining class assigned to the Unicode\n\
character unichr as integer. Returns 0 if no combining class is\n\
defined.");

static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->combining;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}
PyDoc_STRVAR(unicodedata_mirrored__doc__,
"mirrored(unichr)\n\
\n\
Returns the mirrored property assigned to the Unicode character\n\
unichr as integer. Returns 1 if the character has been identified as\n\
a \"mirrored\" character in bidirectional text, 0 otherwise.");

static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->mirrored;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}
PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
"east_asian_width(unichr)\n\
\n\
Returns the east asian width assigned to the Unicode character\n\
unichr as string.");

static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->east_asian_width;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
}
PyDoc_STRVAR(unicodedata_decomposition__doc__,
"decomposition(unichr)\n\
\n\
Returns the character decomposition mapping assigned to the Unicode\n\
character unichr as string. An empty string is returned in case no\n\
such mapping is defined.");

static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count, i;
    unsigned int prefix_index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    code = (int)c;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyString_FromString(""); /* unassigned */
    }

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                              (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));

    /* copy prefix */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert((size_t)i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);
}
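
/* Worked example (editorial): for U+00BD VULGAR FRACTION ONE HALF the
   packed entry yields count == 3 and a prefix index that points at
   "<fraction>", so the function returns the string
   "<fraction> 0031 2044 0032".  For a canonical decomposition such as
   U+00C5 LATIN CAPITAL LETTER A WITH RING ABOVE the prefix is empty and
   the result is "0041 030A". */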
static void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    } else if (self && get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (index into decomp_prefix) */
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    (*index)++;
}
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
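
/* Worked example (editorial): the arithmetic decomposition of a
   precomposed Hangul syllable follows directly from these constants.
   For U+AC01 (HANGUL SYLLABLE GAG), SIndex = 0xAC01 - SBase = 1, so the
   leading consonant is LBase + SIndex/NCount = 0x1100, the vowel is
   VBase + (SIndex % NCount)/TCount = 0x1161, and the trailing consonant
   is TBase + SIndex % TCount = 0x11A8. */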
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *end, *o;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UNICODE stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_SIZE(input);
    /* Overallocate at most 10 characters. */
    space = (isize > 10 ? 10 : isize) + isize;
    result = PyUnicode_FromUnicode(NULL, space);
    if (!result)
        return NULL;
    i = PyUnicode_AS_UNICODE(input);
    end = i + isize;
    o = PyUnicode_AS_UNICODE(result);

    while (i < end) {
        stack[stackptr++] = *i++;
        while(stackptr) {
            Py_UNICODE code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
                space += 10;
                if (PyUnicode_Resize(&result, newsize) == -1)
                    return NULL;
                o = PyUnicode_AS_UNICODE(result) + newsize - space;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                *o++ = L;
                *o++ = V;
                space -= 2;
                if (T != TBase) {
                    *o++ = T;
                    space--;
                }
                continue;
            }
            /* normalization changes */
            if (self) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                *o++ = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    /* Drop overallocation. Cannot fail. */
    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);

    /* Sort canonically. */
    i = PyUnicode_AS_UNICODE(result);
    prev = _getrecord_ex(*i)->combining;
    end = i + PyUnicode_GET_SIZE(result);
    for (i++; i < end; i++) {
        cur = _getrecord_ex(*i)->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UNICODE tmp = o[1];
            o[1] = o[0];
            o[0] = tmp;
            o--;
            if (o < PyUnicode_AS_UNICODE(result))
                break;
            prev = _getrecord_ex(*o)->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(*i)->combining;
    }
    return result;
}
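
/* Worked example (editorial): the trailing pass is an insertion sort by
   canonical combining class, and it never reorders across a starter
   (combining class 0).  Given the decomposed sequence
   U+0071 U+0301 U+0323 (q, COMBINING ACUTE ACCENT with class 230,
   COMBINING DOT BELOW with class 220) the two marks are swapped so the
   lower class comes first: U+0071 U+0323 U+0301. */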
static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
{
    int index;
    for (index = 0; nfc[index].start; index++) {
        int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
    int f,l,index,index1,comb;
    Py_UNICODE code;
    Py_UNICODE *skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;

    /* We are going to modify result in-place.
       If nfd_nfkd is changed to sometimes return the input,
       this code needs to be reviewed. */
    assert(result != input);

    i = PyUnicode_AS_UNICODE(result);
    end = i + PyUnicode_GET_SIZE(result);
    o = PyUnicode_AS_UNICODE(result);

  again:
    while (i < end) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        if (LBase <= *i && *i < (LBase+LCount) &&
            i + 1 < end &&
            VBase <= i[1] && i[1] <= (VBase+VCount)) {
            int LIndex, VIndex;
            LIndex = i[0] - LBase;
            VIndex = i[1] - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            if (i < end &&
                TBase <= *i && *i <= (TBase+TCount)) {
                code += *i-TBase;
                i++;
            }
            *o++ = code;
            continue;
        }

        f = find_nfc_index(self, nfc_first, *i);
        if (f == -1) {
            *o++ = *i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        while (i1 < end) {
            int comb1 = _getrecord_ex(*i1)->combining;
            if (comb1 && comb == comb1) {
                /* Character is blocked. */
                i1++;
                continue;
            }
            l = find_nfc_index(self, nfc_last, *i1);
            /* *i1 cannot be combined with *i. If *i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            *i = code;
            /* Mark the second character unused. */
            skipped[cskipped++] = i1;
            i1++;
            f = find_nfc_index(self, nfc_first, *i);
            if (f == -1)
                break;
        }
        *o++ = *i++;
    }
    if (o != end)
        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
    return result;
}
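
/* Worked example (editorial): Hangul composition is the inverse of the
   arithmetic decomposition above.  Given the jamo sequence
   U+1100 U+1161 U+11A8, LIndex = 0 and VIndex = 0, so
   code = SBase + (0*VCount + 0)*TCount = 0xAC00, and the trailing jamo
   adds 0x11A8 - TBase = 1, producing U+AC01 (HANGUL SYLLABLE GAG). */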
PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
Return the normal form 'form' for the Unicode string unistr. Valid\n\
values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");

static PyObject*
unicodedata_normalize(PyObject *self, PyObject *args)
{
    char *form;
    PyObject *input;

    if(!PyArg_ParseTuple(args, "sO!:normalize",
                         &form, &PyUnicode_Type, &input))
        return NULL;

    if (PyUnicode_GetSize(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    if (strcmp(form, "NFC") == 0)
        return nfc_nfkc(self, input, 0);
    if (strcmp(form, "NFKC") == 0)
        return nfc_nfkc(self, input, 1);
    if (strcmp(form, "NFD") == 0)
        return nfd_nfkd(self, input, 0);
    if (strcmp(form, "NFKD") == 0)
        return nfd_nfkd(self, input, 1);
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}
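
/* Editorial sketch (not part of the original module): one way to drive
   normalize() from other C code, using only public CPython 2.x APIs.
   Error handling is abbreviated; the function name is hypothetical, but
   the module and method names are the real ones defined in this file. */
#if 0
static PyObject *
normalize_nfc_example(PyObject *text)
{
    PyObject *mod, *res;
    mod = PyImport_ImportModule("unicodedata");
    if (mod == NULL)
        return NULL;
    /* Equivalent to unicodedata.normalize('NFC', text) in Python. */
    res = PyObject_CallMethod(mod, "normalize", "sO", "NFC", text);
    Py_DECREF(mod);
    return res;
}
#endif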
/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
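
/* Editorial note: the three columns are the romanized jamo names for the
   19 leading consonants, 21 vowels and 28 trailing consonants (including
   the empty trailing jamo), indexed in the same order as LCount, VCount
   and TCount above.  _getucname() below concatenates one entry from each
   column, so U+AC01 becomes "HANGUL SYLLABLE " + "G" + "A" + "G" =
   "HANGUL SYLLABLE GAG". */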
static int
is_unified_ideograph(Py_UCS4 code)
{
    return (
        (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
}
static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    if (self) {
        const change_record *old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon. the last character in the
           word has bit 7 set. the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}
static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
    if (!_getucname(self, code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (toupper(Py_CHARMASK(name[i])) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}
static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        char *s = hangul_syllables[i][column];
        len1 = strlen(s);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1) {
        *len = 0;
    }
}
static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen)) {
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}
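
/* Editorial note: code_hash is a fixed open-addressing table generated by
   makeunicodedata.py.  A lookup hashes the (upper-cased) name with
   _gethash(), probes slot (~h) & mask first, and on a miss keeps stepping
   by a perturbed increment.  Because every probed candidate is verified
   with _cmpname() against the real name from _getucname(), a hash
   collision can never return the wrong character, only cost extra probes;
   an empty slot (value 0) means the name does not exist. */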
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
/* -------------------------------------------------------------------- */
/* Python bindings */

PyDoc_STRVAR(unicodedata_name__doc__,
"name(unichr[, default])\n\
Returns the name assigned to the Unicode character unichr as a\n\
string. If no name is defined, default is returned, or, if not\n\
given, ValueError is raised.");

static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char name[NAME_MAXLEN];
    Py_UCS4 c;

    PyUnicodeObject* v;
    PyObject* defobj = NULL;
    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
        return NULL;

    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (!_getucname(self, c, name, sizeof(name))) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }

    return Py_BuildValue("s", name);
}
PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name.  If a character with the\n\
given name is found, return the corresponding Unicode\n\
character.  If not found, KeyError is raised.");

static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[2];

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(self, name, namelen, &code)) {
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
                     name);
        return NULL;
    }

#ifndef Py_UNICODE_WIDE
    if (code >= 0x10000) {
        str[0] = 0xd800 + ((code - 0x10000) >> 10);
        str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
        return PyUnicode_FromUnicode(str, 2);
    }
#endif
    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}
/* XXX Add doc strings. */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};
static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyVarObject_HEAD_INIT(NULL, 0)
    "unicodedata.UCD",          /*tp_name*/
    sizeof(PreviousDBVersion),  /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del,   /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_compare*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    PyObject_GenericGetAttr,    /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    unicodedata_functions,      /*tp_methods*/
    DB_members,                 /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
4.1.0 which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 4.1.0 (see\n\
http://www.unicode.org/Public/4.1.0/ucd/UCD.html).");
PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    Py_TYPE(&UCD_Type) = &PyType_Type;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    Py_INCREF(&UCD_Type);
    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "ucd_3_2_0", v);

    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
}
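
/* Editorial sketch (not part of the original module): other C code can
   pick up the exported name-lookup API through the "ucnhash_CAPI"
   CObject; the \N{...} escape support in the core consumes it this way.
   The function name is hypothetical, the member signatures follow the
   hashAPI initializer above, and error handling is abbreviated. */
#if 0
static int
lookup_by_name_example(const char *name, Py_UCS4 *code)
{
    _PyUnicode_Name_CAPI *api;
    api = (_PyUnicode_Name_CAPI *)PyCObject_Import("unicodedata",
                                                   "ucnhash_CAPI");
    if (api == NULL)
        return 0;
    /* Passing NULL as self consults the current Unicode version. */
    return api->getcode(NULL, name, (int)strlen(name), code);
}
#endif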
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/