Blocked revisions 73580-73582 via svnmerge
[python/dscho.git] / Modules / unicodedata.c
blob2dddc488cfa18b44787aef5479c7043610109b9b
1 /* ------------------------------------------------------------------------
3 unicodedata -- Provides access to the Unicode 5.1 data base.
5 Data was extracted from the Unicode 5.1 UnicodeData.txt file.
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
11 Copyright (c) Corporation for National Research Initiatives.
13 ------------------------------------------------------------------------ */
15 #include "Python.h"
16 #include "ucnhash.h"
17 #include "structmember.h"
19 /* character properties */
/* Property record for a single code point.  The compressed two-level
   tables in unicodedata_db.h resolve a code point to an index into an
   array of these records. */
typedef struct {
    const unsigned char category;          /* index into
                                              _PyUnicode_CategoryNames */
    const unsigned char combining;         /* combining class value 0 - 255 */
    const unsigned char bidirectional;     /* index into
                                              _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;          /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;  /* index into
                                              _PyUnicode_EastAsianWidth */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
/* Delta describing how a code point's properties differed in an older
   Unicode version; 0xFF in a *_changed byte means "unchanged". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;   /* 0 == unassigned in old version */
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const int numeric_changed;
} change_record;
42 /* data file generated by Tools/unicode/makeunicodedata.py */
43 #include "unicodedata_db.h"
45 static const _PyUnicode_DatabaseRecord*
46 _getrecord_ex(Py_UCS4 code)
48 int index;
49 if (code >= 0x110000)
50 index = 0;
51 else {
52 index = index1[(code>>SHIFT)];
53 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
56 return &_PyUnicode_Database_Records[index];
59 /* ------------- Previous-version API ------------------------------------- */
60 typedef struct previous_version {
61 PyObject_HEAD
62 const char *name;
63 const change_record* (*getrecord)(Py_UCS4);
64 Py_UCS4 (*normalization)(Py_UCS4);
65 } PreviousDBVersion;
67 #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
69 static PyMemberDef DB_members[] = {
70 {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
71 {NULL}
74 /* forward declaration */
75 static PyTypeObject UCD_Type;
76 #define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
78 static PyObject*
79 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
80 Py_UCS4 (*normalization)(Py_UCS4))
82 PreviousDBVersion *self;
83 self = PyObject_New(PreviousDBVersion, &UCD_Type);
84 if (self == NULL)
85 return NULL;
86 self->name = name;
87 self->getrecord = getrecord;
88 self->normalization = normalization;
89 return (PyObject*)self;
93 static Py_UCS4 getuchar(PyUnicodeObject *obj)
95 Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
97 if (PyUnicode_GET_SIZE(obj) == 1)
98 return *v;
99 #ifndef Py_UNICODE_WIDE
100 else if ((PyUnicode_GET_SIZE(obj) == 2) &&
101 (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
102 (0xDC00 <= v[1] && v[1] <= 0xDFFF))
103 return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
104 #endif
105 PyErr_SetString(PyExc_TypeError,
106 "need a single Unicode character as parameter");
107 return (Py_UCS4)-1;
110 /* --- Module API --------------------------------------------------------- */
112 PyDoc_STRVAR(unicodedata_decimal__doc__,
113 "decimal(unichr[, default])\n\
115 Returns the decimal value assigned to the Unicode character unichr\n\
116 as integer. If no such value is defined, default is returned, or, if\n\
117 not given, ValueError is raised.");
119 static PyObject *
120 unicodedata_decimal(PyObject *self, PyObject *args)
122 PyUnicodeObject *v;
123 PyObject *defobj = NULL;
124 int have_old = 0;
125 long rc;
126 Py_UCS4 c;
128 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
129 return NULL;
130 c = getuchar(v);
131 if (c == (Py_UCS4)-1)
132 return NULL;
134 if (self && UCD_Check(self)) {
135 const change_record *old = get_old_record(self, c);
136 if (old->category_changed == 0) {
137 /* unassigned */
138 have_old = 1;
139 rc = -1;
141 else if (old->decimal_changed != 0xFF) {
142 have_old = 1;
143 rc = old->decimal_changed;
147 if (!have_old)
148 rc = Py_UNICODE_TODECIMAL(c);
149 if (rc < 0) {
150 if (defobj == NULL) {
151 PyErr_SetString(PyExc_ValueError,
152 "not a decimal");
153 return NULL;
155 else {
156 Py_INCREF(defobj);
157 return defobj;
160 return PyLong_FromLong(rc);
163 PyDoc_STRVAR(unicodedata_digit__doc__,
164 "digit(unichr[, default])\n\
166 Returns the digit value assigned to the Unicode character unichr as\n\
167 integer. If no such value is defined, default is returned, or, if\n\
168 not given, ValueError is raised.");
170 static PyObject *
171 unicodedata_digit(PyObject *self, PyObject *args)
173 PyUnicodeObject *v;
174 PyObject *defobj = NULL;
175 long rc;
176 Py_UCS4 c;
178 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
179 return NULL;
180 c = getuchar(v);
181 if (c == (Py_UCS4)-1)
182 return NULL;
183 rc = Py_UNICODE_TODIGIT(c);
184 if (rc < 0) {
185 if (defobj == NULL) {
186 PyErr_SetString(PyExc_ValueError, "not a digit");
187 return NULL;
189 else {
190 Py_INCREF(defobj);
191 return defobj;
194 return PyLong_FromLong(rc);
197 PyDoc_STRVAR(unicodedata_numeric__doc__,
198 "numeric(unichr[, default])\n\
200 Returns the numeric value assigned to the Unicode character unichr\n\
201 as float. If no such value is defined, default is returned, or, if\n\
202 not given, ValueError is raised.");
204 static PyObject *
205 unicodedata_numeric(PyObject *self, PyObject *args)
207 PyUnicodeObject *v;
208 PyObject *defobj = NULL;
209 int have_old = 0;
210 double rc;
211 Py_UCS4 c;
213 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
214 return NULL;
215 c = getuchar(v);
216 if (c == (Py_UCS4)-1)
217 return NULL;
219 if (self && UCD_Check(self)) {
220 const change_record *old = get_old_record(self, c);
221 if (old->category_changed == 0) {
222 /* unassigned */
223 have_old = 1;
224 rc = -1.0;
226 else if (old->decimal_changed != 0xFF) {
227 have_old = 1;
228 rc = old->decimal_changed;
232 if (!have_old)
233 rc = Py_UNICODE_TONUMERIC(c);
234 if (rc == -1.0) {
235 if (defobj == NULL) {
236 PyErr_SetString(PyExc_ValueError, "not a numeric character");
237 return NULL;
239 else {
240 Py_INCREF(defobj);
241 return defobj;
244 return PyFloat_FromDouble(rc);
247 PyDoc_STRVAR(unicodedata_category__doc__,
248 "category(unichr)\n\
250 Returns the general category assigned to the Unicode character\n\
251 unichr as string.");
253 static PyObject *
254 unicodedata_category(PyObject *self, PyObject *args)
256 PyUnicodeObject *v;
257 int index;
258 Py_UCS4 c;
260 if (!PyArg_ParseTuple(args, "O!:category",
261 &PyUnicode_Type, &v))
262 return NULL;
263 c = getuchar(v);
264 if (c == (Py_UCS4)-1)
265 return NULL;
266 index = (int) _getrecord_ex(c)->category;
267 if (self && UCD_Check(self)) {
268 const change_record *old = get_old_record(self, c);
269 if (old->category_changed != 0xFF)
270 index = old->category_changed;
272 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
275 PyDoc_STRVAR(unicodedata_bidirectional__doc__,
276 "bidirectional(unichr)\n\
278 Returns the bidirectional category assigned to the Unicode character\n\
279 unichr as string. If no such value is defined, an empty string is\n\
280 returned.");
282 static PyObject *
283 unicodedata_bidirectional(PyObject *self, PyObject *args)
285 PyUnicodeObject *v;
286 int index;
287 Py_UCS4 c;
289 if (!PyArg_ParseTuple(args, "O!:bidirectional",
290 &PyUnicode_Type, &v))
291 return NULL;
292 c = getuchar(v);
293 if (c == (Py_UCS4)-1)
294 return NULL;
295 index = (int) _getrecord_ex(c)->bidirectional;
296 if (self && UCD_Check(self)) {
297 const change_record *old = get_old_record(self, c);
298 if (old->category_changed == 0)
299 index = 0; /* unassigned */
300 else if (old->bidir_changed != 0xFF)
301 index = old->bidir_changed;
303 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
306 PyDoc_STRVAR(unicodedata_combining__doc__,
307 "combining(unichr)\n\
309 Returns the canonical combining class assigned to the Unicode\n\
310 character unichr as integer. Returns 0 if no combining class is\n\
311 defined.");
313 static PyObject *
314 unicodedata_combining(PyObject *self, PyObject *args)
316 PyUnicodeObject *v;
317 int index;
318 Py_UCS4 c;
320 if (!PyArg_ParseTuple(args, "O!:combining",
321 &PyUnicode_Type, &v))
322 return NULL;
323 c = getuchar(v);
324 if (c == (Py_UCS4)-1)
325 return NULL;
326 index = (int) _getrecord_ex(c)->combining;
327 if (self && UCD_Check(self)) {
328 const change_record *old = get_old_record(self, c);
329 if (old->category_changed == 0)
330 index = 0; /* unassigned */
332 return PyLong_FromLong(index);
335 PyDoc_STRVAR(unicodedata_mirrored__doc__,
336 "mirrored(unichr)\n\
338 Returns the mirrored property assigned to the Unicode character\n\
339 unichr as integer. Returns 1 if the character has been identified as\n\
340 a \"mirrored\" character in bidirectional text, 0 otherwise.");
342 static PyObject *
343 unicodedata_mirrored(PyObject *self, PyObject *args)
345 PyUnicodeObject *v;
346 int index;
347 Py_UCS4 c;
349 if (!PyArg_ParseTuple(args, "O!:mirrored",
350 &PyUnicode_Type, &v))
351 return NULL;
352 c = getuchar(v);
353 if (c == (Py_UCS4)-1)
354 return NULL;
355 index = (int) _getrecord_ex(c)->mirrored;
356 if (self && UCD_Check(self)) {
357 const change_record *old = get_old_record(self, c);
358 if (old->category_changed == 0)
359 index = 0; /* unassigned */
360 else if (old->mirrored_changed != 0xFF)
361 index = old->mirrored_changed;
363 return PyLong_FromLong(index);
366 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
367 "east_asian_width(unichr)\n\
369 Returns the east asian width assigned to the Unicode character\n\
370 unichr as string.");
372 static PyObject *
373 unicodedata_east_asian_width(PyObject *self, PyObject *args)
375 PyUnicodeObject *v;
376 int index;
377 Py_UCS4 c;
379 if (!PyArg_ParseTuple(args, "O!:east_asian_width",
380 &PyUnicode_Type, &v))
381 return NULL;
382 c = getuchar(v);
383 if (c == (Py_UCS4)-1)
384 return NULL;
385 index = (int) _getrecord_ex(c)->east_asian_width;
386 if (self && UCD_Check(self)) {
387 const change_record *old = get_old_record(self, c);
388 if (old->category_changed == 0)
389 index = 0; /* unassigned */
391 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
394 PyDoc_STRVAR(unicodedata_decomposition__doc__,
395 "decomposition(unichr)\n\
397 Returns the character decomposition mapping assigned to the Unicode\n\
398 character unichr as string. An empty string is returned in case no\n\
399 such mapping is defined.");
401 static PyObject *
402 unicodedata_decomposition(PyObject *self, PyObject *args)
404 PyUnicodeObject *v;
405 char decomp[256];
406 int code, index, count, i;
407 unsigned int prefix_index;
408 Py_UCS4 c;
410 if (!PyArg_ParseTuple(args, "O!:decomposition",
411 &PyUnicode_Type, &v))
412 return NULL;
413 c = getuchar(v);
414 if (c == (Py_UCS4)-1)
415 return NULL;
417 code = (int)c;
419 if (self && UCD_Check(self)) {
420 const change_record *old = get_old_record(self, c);
421 if (old->category_changed == 0)
422 return PyUnicode_FromString(""); /* unassigned */
425 if (code < 0 || code >= 0x110000)
426 index = 0;
427 else {
428 index = decomp_index1[(code>>DECOMP_SHIFT)];
429 index = decomp_index2[(index<<DECOMP_SHIFT)+
430 (code&((1<<DECOMP_SHIFT)-1))];
433 /* high byte is number of hex bytes (usually one or two), low byte
434 is prefix code (from*/
435 count = decomp_data[index] >> 8;
437 /* XXX: could allocate the PyString up front instead
438 (strlen(prefix) + 5 * count + 1 bytes) */
440 /* Based on how index is calculated above and decomp_data is generated
441 from Tools/unicode/makeunicodedata.py, it should not be possible
442 to overflow decomp_prefix. */
443 prefix_index = decomp_data[index] & 255;
444 assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
446 /* copy prefix */
447 i = strlen(decomp_prefix[prefix_index]);
448 memcpy(decomp, decomp_prefix[prefix_index], i);
450 while (count-- > 0) {
451 if (i)
452 decomp[i++] = ' ';
453 assert((size_t)i < sizeof(decomp));
454 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
455 decomp_data[++index]);
456 i += strlen(decomp + i);
459 decomp[i] = '\0';
461 return PyUnicode_FromString(decomp);
464 static void
465 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
467 if (code >= 0x110000) {
468 *index = 0;
469 } else if (self && UCD_Check(self) &&
470 get_old_record(self, code)->category_changed==0) {
471 /* unassigned in old version */
472 *index = 0;
474 else {
475 *index = decomp_index1[(code>>DECOMP_SHIFT)];
476 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
477 (code&((1<<DECOMP_SHIFT)-1))];
480 /* high byte is number of hex bytes (usually one or two), low byte
481 is prefix code (from*/
482 *count = decomp_data[*index] >> 8;
483 *prefix = decomp_data[*index] & 255;
485 (*index)++;
/* Hangul syllable (de)composition constants, from the Unicode Standard,
   chapter 3 (Conjoining Jamo Behavior). */
#define SBase 0xAC00    /* first precomposed syllable */
#define LBase 0x1100    /* first leading consonant (choseong) */
#define VBase 0x1161    /* first vowel (jungseong) */
#define TBase 0x11A7    /* one before the first trailing consonant (jongseong) */
#define LCount 19
#define VCount 21
#define TCount 28       /* includes the "no trailing consonant" slot */
#define NCount (VCount*TCount)
#define SCount (LCount*NCount)
498 static PyObject*
499 nfd_nfkd(PyObject *self, PyObject *input, int k)
501 PyObject *result;
502 Py_UNICODE *i, *end, *o;
503 /* Longest decomposition in Unicode 3.2: U+FDFA */
504 Py_UNICODE stack[20];
505 Py_ssize_t space, isize;
506 int index, prefix, count, stackptr;
507 unsigned char prev, cur;
509 stackptr = 0;
510 isize = PyUnicode_GET_SIZE(input);
511 /* Overallocate atmost 10 characters. */
512 space = (isize > 10 ? 10 : isize) + isize;
513 result = PyUnicode_FromUnicode(NULL, space);
514 if (!result)
515 return NULL;
516 i = PyUnicode_AS_UNICODE(input);
517 end = i + isize;
518 o = PyUnicode_AS_UNICODE(result);
520 while (i < end) {
521 stack[stackptr++] = *i++;
522 while(stackptr) {
523 Py_UNICODE code = stack[--stackptr];
524 /* Hangul Decomposition adds three characters in
525 a single step, so we need atleast that much room. */
526 if (space < 3) {
527 Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
528 space += 10;
529 if (PyUnicode_Resize(&result, newsize) == -1)
530 return NULL;
531 o = PyUnicode_AS_UNICODE(result) + newsize - space;
533 /* Hangul Decomposition. */
534 if (SBase <= code && code < (SBase+SCount)) {
535 int SIndex = code - SBase;
536 int L = LBase + SIndex / NCount;
537 int V = VBase + (SIndex % NCount) / TCount;
538 int T = TBase + SIndex % TCount;
539 *o++ = L;
540 *o++ = V;
541 space -= 2;
542 if (T != TBase) {
543 *o++ = T;
544 space --;
546 continue;
548 /* normalization changes */
549 if (self && UCD_Check(self)) {
550 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
551 if (value != 0) {
552 stack[stackptr++] = value;
553 continue;
557 /* Other decompositions. */
558 get_decomp_record(self, code, &index, &prefix, &count);
560 /* Copy character if it is not decomposable, or has a
561 compatibility decomposition, but we do NFD. */
562 if (!count || (prefix && !k)) {
563 *o++ = code;
564 space--;
565 continue;
567 /* Copy decomposition onto the stack, in reverse
568 order. */
569 while(count) {
570 code = decomp_data[index + (--count)];
571 stack[stackptr++] = code;
576 /* Drop overallocation. Cannot fail. */
577 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
579 /* Sort canonically. */
580 i = PyUnicode_AS_UNICODE(result);
581 prev = _getrecord_ex(*i)->combining;
582 end = i + PyUnicode_GET_SIZE(result);
583 for (i++; i < end; i++) {
584 cur = _getrecord_ex(*i)->combining;
585 if (prev == 0 || cur == 0 || prev <= cur) {
586 prev = cur;
587 continue;
589 /* Non-canonical order. Need to switch *i with previous. */
590 o = i - 1;
591 while (1) {
592 Py_UNICODE tmp = o[1];
593 o[1] = o[0];
594 o[0] = tmp;
595 o--;
596 if (o < PyUnicode_AS_UNICODE(result))
597 break;
598 prev = _getrecord_ex(*o)->combining;
599 if (prev == 0 || prev <= cur)
600 break;
602 prev = _getrecord_ex(*i)->combining;
604 return result;
607 static int
608 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
610 int index;
611 for (index = 0; nfc[index].start; index++) {
612 int start = nfc[index].start;
613 if (code < start)
614 return -1;
615 if (code <= start + nfc[index].count) {
616 int delta = code - start;
617 return nfc[index].index + delta;
620 return -1;
623 static PyObject*
624 nfc_nfkc(PyObject *self, PyObject *input, int k)
626 PyObject *result;
627 Py_UNICODE *i, *i1, *o, *end;
628 int f,l,index,index1,comb;
629 Py_UNICODE code;
630 Py_UNICODE *skipped[20];
631 int cskipped = 0;
633 result = nfd_nfkd(self, input, k);
634 if (!result)
635 return NULL;
637 /* We are going to modify result in-place.
638 If nfd_nfkd is changed to sometimes return the input,
639 this code needs to be reviewed. */
640 assert(result != input);
642 i = PyUnicode_AS_UNICODE(result);
643 end = i + PyUnicode_GET_SIZE(result);
644 o = PyUnicode_AS_UNICODE(result);
646 again:
647 while (i < end) {
648 for (index = 0; index < cskipped; index++) {
649 if (skipped[index] == i) {
650 /* *i character is skipped.
651 Remove from list. */
652 skipped[index] = skipped[cskipped-1];
653 cskipped--;
654 i++;
655 goto again; /* continue while */
658 /* Hangul Composition. We don't need to check for <LV,T>
659 pairs, since we always have decomposed data. */
660 if (LBase <= *i && *i < (LBase+LCount) &&
661 i + 1 < end &&
662 VBase <= i[1] && i[1] <= (VBase+VCount)) {
663 int LIndex, VIndex;
664 LIndex = i[0] - LBase;
665 VIndex = i[1] - VBase;
666 code = SBase + (LIndex*VCount+VIndex)*TCount;
667 i+=2;
668 if (i < end &&
669 TBase <= *i && *i <= (TBase+TCount)) {
670 code += *i-TBase;
671 i++;
673 *o++ = code;
674 continue;
677 f = find_nfc_index(self, nfc_first, *i);
678 if (f == -1) {
679 *o++ = *i++;
680 continue;
682 /* Find next unblocked character. */
683 i1 = i+1;
684 comb = 0;
685 while (i1 < end) {
686 int comb1 = _getrecord_ex(*i1)->combining;
687 if (comb1 && comb == comb1) {
688 /* Character is blocked. */
689 i1++;
690 continue;
692 l = find_nfc_index(self, nfc_last, *i1);
693 /* *i1 cannot be combined with *i. If *i1
694 is a starter, we don't need to look further.
695 Otherwise, record the combining class. */
696 if (l == -1) {
697 not_combinable:
698 if (comb1 == 0)
699 break;
700 comb = comb1;
701 i1++;
702 continue;
704 index = f*TOTAL_LAST + l;
705 index1 = comp_index[index >> COMP_SHIFT];
706 code = comp_data[(index1<<COMP_SHIFT)+
707 (index&((1<<COMP_SHIFT)-1))];
708 if (code == 0)
709 goto not_combinable;
711 /* Replace the original character. */
712 *i = code;
713 /* Mark the second character unused. */
714 skipped[cskipped++] = i1;
715 i1++;
716 f = find_nfc_index(self, nfc_first, *i);
717 if (f == -1)
718 break;
720 *o++ = *i++;
722 if (o != end)
723 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
724 return result;
727 /* Return 1 if the input is certainly normalized, 0 if it might not be. */
728 static int
729 is_normalized(PyObject *self, PyObject *input, int nfc, int k)
731 Py_UNICODE *i, *end;
732 unsigned char prev_combining = 0, quickcheck_mask;
734 /* An older version of the database is requested, quickchecks must be
735 disabled. */
736 if (self && UCD_Check(self))
737 return 0;
739 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
740 as described in http://unicode.org/reports/tr15/#Annex8. */
741 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
743 i = PyUnicode_AS_UNICODE(input);
744 end = i + PyUnicode_GET_SIZE(input);
745 while (i < end) {
746 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
747 unsigned char combining = record->combining;
748 unsigned char quickcheck = record->normalization_quick_check;
750 if (quickcheck & quickcheck_mask)
751 return 0; /* this string might need normalization */
752 if (combining && prev_combining > combining)
753 return 0; /* non-canonical sort order, not normalized */
754 prev_combining = combining;
756 return 1; /* certainly normalized */
759 PyDoc_STRVAR(unicodedata_normalize__doc__,
760 "normalize(form, unistr)\n\
762 Return the normal form 'form' for the Unicode string unistr. Valid\n\
763 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
765 static PyObject*
766 unicodedata_normalize(PyObject *self, PyObject *args)
768 char *form;
769 PyObject *input;
771 if(!PyArg_ParseTuple(args, "sO!:normalize",
772 &form, &PyUnicode_Type, &input))
773 return NULL;
775 if (PyUnicode_GetSize(input) == 0) {
776 /* Special case empty input strings, since resizing
777 them later would cause internal errors. */
778 Py_INCREF(input);
779 return input;
782 if (strcmp(form, "NFC") == 0) {
783 if (is_normalized(self, input, 1, 0)) {
784 Py_INCREF(input);
785 return input;
787 return nfc_nfkc(self, input, 0);
789 if (strcmp(form, "NFKC") == 0) {
790 if (is_normalized(self, input, 1, 1)) {
791 Py_INCREF(input);
792 return input;
794 return nfc_nfkc(self, input, 1);
796 if (strcmp(form, "NFD") == 0) {
797 if (is_normalized(self, input, 0, 0)) {
798 Py_INCREF(input);
799 return input;
801 return nfd_nfkd(self, input, 0);
803 if (strcmp(form, "NFKD") == 0) {
804 if (is_normalized(self, input, 0, 1)) {
805 Py_INCREF(input);
806 return input;
808 return nfd_nfkd(self, input, 1);
810 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
811 return NULL;
814 /* -------------------------------------------------------------------- */
815 /* unicode character name tables */
817 /* data file generated by Tools/unicode/makeunicodedata.py */
818 #include "unicodename_db.h"
820 /* -------------------------------------------------------------------- */
821 /* database code (cut and pasted from the unidb package) */
/* Case-insensitive 24-bit rolling hash over a character-name string;
   must match the hash used by Tools/unicode/makeunicodedata.py to build
   the code_hash table. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            /* fold overflowing bits back into the low 24 bits */
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
/* Romanized jamo names used to build/parse Hangul syllable names.
   Column 0: leading consonants (19), column 1: vowels (21),
   column 2: trailing consonants (28, first entry empty = none). */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
869 static int
870 is_unified_ideograph(Py_UCS4 code)
872 return (
873 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
874 (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
875 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
878 static int
879 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
881 int offset;
882 int i;
883 int word;
884 unsigned char* w;
886 if (code >= 0x110000)
887 return 0;
889 if (self && UCD_Check(self)) {
890 const change_record *old = get_old_record(self, code);
891 if (old->category_changed == 0) {
892 /* unassigned */
893 return 0;
897 if (SBase <= code && code < SBase+SCount) {
898 /* Hangul syllable. */
899 int SIndex = code - SBase;
900 int L = SIndex / NCount;
901 int V = (SIndex % NCount) / TCount;
902 int T = SIndex % TCount;
904 if (buflen < 27)
905 /* Worst case: HANGUL SYLLABLE <10chars>. */
906 return 0;
907 strcpy(buffer, "HANGUL SYLLABLE ");
908 buffer += 16;
909 strcpy(buffer, hangul_syllables[L][0]);
910 buffer += strlen(hangul_syllables[L][0]);
911 strcpy(buffer, hangul_syllables[V][1]);
912 buffer += strlen(hangul_syllables[V][1]);
913 strcpy(buffer, hangul_syllables[T][2]);
914 buffer += strlen(hangul_syllables[T][2]);
915 *buffer = '\0';
916 return 1;
919 if (is_unified_ideograph(code)) {
920 if (buflen < 28)
921 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
922 return 0;
923 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
924 return 1;
927 /* get offset into phrasebook */
928 offset = phrasebook_offset1[(code>>phrasebook_shift)];
929 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
930 (code&((1<<phrasebook_shift)-1))];
931 if (!offset)
932 return 0;
934 i = 0;
936 for (;;) {
937 /* get word index */
938 word = phrasebook[offset] - phrasebook_short;
939 if (word >= 0) {
940 word = (word << 8) + phrasebook[offset+1];
941 offset += 2;
942 } else
943 word = phrasebook[offset++];
944 if (i) {
945 if (i > buflen)
946 return 0; /* buffer overflow */
947 buffer[i++] = ' ';
949 /* copy word string from lexicon. the last character in the
950 word has bit 7 set. the last word in a string ends with
951 0x80 */
952 w = lexicon + lexicon_offset[word];
953 while (*w < 128) {
954 if (i >= buflen)
955 return 0; /* buffer overflow */
956 buffer[i++] = *w++;
958 if (i >= buflen)
959 return 0; /* buffer overflow */
960 buffer[i++] = *w & 127;
961 if (*w == 128)
962 break; /* end of word */
965 return 1;
968 static int
969 _cmpname(PyObject *self, int code, const char* name, int namelen)
971 /* check if code corresponds to the given name */
972 int i;
973 char buffer[NAME_MAXLEN];
974 if (!_getucname(self, code, buffer, sizeof(buffer)))
975 return 0;
976 for (i = 0; i < namelen; i++) {
977 if (toupper(Py_CHARMASK(name[i])) != buffer[i])
978 return 0;
980 return buffer[namelen] == '\0';
983 static void
984 find_syllable(const char *str, int *len, int *pos, int count, int column)
986 int i, len1;
987 *len = -1;
988 for (i = 0; i < count; i++) {
989 char *s = hangul_syllables[i][column];
990 len1 = strlen(s);
991 if (len1 <= *len)
992 continue;
993 if (strncmp(str, s, len1) == 0) {
994 *len = len1;
995 *pos = i;
998 if (*len == -1) {
999 *len = 0;
1003 static int
1004 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
1006 unsigned int h, v;
1007 unsigned int mask = code_size-1;
1008 unsigned int i, incr;
1010 /* Check for hangul syllables. */
1011 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1012 int len, L = -1, V = -1, T = -1;
1013 const char *pos = name + 16;
1014 find_syllable(pos, &len, &L, LCount, 0);
1015 pos += len;
1016 find_syllable(pos, &len, &V, VCount, 1);
1017 pos += len;
1018 find_syllable(pos, &len, &T, TCount, 2);
1019 pos += len;
1020 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1021 *code = SBase + (L*VCount+V)*TCount + T;
1022 return 1;
1024 /* Otherwise, it's an illegal syllable name. */
1025 return 0;
1028 /* Check for unified ideographs. */
1029 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1030 /* Four or five hexdigits must follow. */
1031 v = 0;
1032 name += 22;
1033 namelen -= 22;
1034 if (namelen != 4 && namelen != 5)
1035 return 0;
1036 while (namelen--) {
1037 v *= 16;
1038 if (*name >= '0' && *name <= '9')
1039 v += *name - '0';
1040 else if (*name >= 'A' && *name <= 'F')
1041 v += *name - 'A' + 10;
1042 else
1043 return 0;
1044 name++;
1046 if (!is_unified_ideograph(v))
1047 return 0;
1048 *code = v;
1049 return 1;
1052 /* the following is the same as python's dictionary lookup, with
1053 only minor changes. see the makeunicodedata script for more
1054 details */
1056 h = (unsigned int) _gethash(name, namelen, code_magic);
1057 i = (~h) & mask;
1058 v = code_hash[i];
1059 if (!v)
1060 return 0;
1061 if (_cmpname(self, v, name, namelen)) {
1062 *code = v;
1063 return 1;
1065 incr = (h ^ (h >> 3)) & mask;
1066 if (!incr)
1067 incr = mask;
1068 for (;;) {
1069 i = (i + incr) & mask;
1070 v = code_hash[i];
1071 if (!v)
1072 return 0;
1073 if (_cmpname(self, v, name, namelen)) {
1074 *code = v;
1075 return 1;
1077 incr = incr << 1;
1078 if (incr > mask)
1079 incr = incr ^ code_poly;
1083 static const _PyUnicode_Name_CAPI hashAPI =
1085 sizeof(_PyUnicode_Name_CAPI),
1086 _getucname,
1087 _getcode
1090 /* -------------------------------------------------------------------- */
1091 /* Python bindings */
1093 PyDoc_STRVAR(unicodedata_name__doc__,
1094 "name(unichr[, default])\n\
1095 Returns the name assigned to the Unicode character unichr as a\n\
1096 string. If no name is defined, default is returned, or, if not\n\
1097 given, ValueError is raised.");
1099 static PyObject *
1100 unicodedata_name(PyObject* self, PyObject* args)
1102 char name[NAME_MAXLEN];
1103 Py_UCS4 c;
1105 PyUnicodeObject* v;
1106 PyObject* defobj = NULL;
1107 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1108 return NULL;
1110 c = getuchar(v);
1111 if (c == (Py_UCS4)-1)
1112 return NULL;
1114 if (!_getucname(self, c, name, sizeof(name))) {
1115 if (defobj == NULL) {
1116 PyErr_SetString(PyExc_ValueError, "no such name");
1117 return NULL;
1119 else {
1120 Py_INCREF(defobj);
1121 return defobj;
1125 return PyUnicode_FromString(name);
1128 PyDoc_STRVAR(unicodedata_lookup__doc__,
1129 "lookup(name)\n\
1131 Look up character by name. If a character with the\n\
1132 given name is found, return the corresponding Unicode\n\
1133 character. If not found, KeyError is raised.");
1135 static PyObject *
1136 unicodedata_lookup(PyObject* self, PyObject* args)
1138 Py_UCS4 code;
1139 Py_UNICODE str[2];
1141 char* name;
1142 int namelen;
1143 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1144 return NULL;
1146 if (!_getcode(self, name, namelen, &code)) {
1147 PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1148 name);
1149 return NULL;
1152 #ifndef Py_UNICODE_WIDE
1153 if (code >= 0x10000) {
1154 str[0] = 0xd800 + ((code - 0x10000) >> 10);
1155 str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1156 return PyUnicode_FromUnicode(str, 2);
1158 #endif
1159 str[0] = (Py_UNICODE) code;
1160 return PyUnicode_FromUnicode(str, 1);
1163 /* XXX Add doc strings. */
1165 static PyMethodDef unicodedata_functions[] = {
1166 {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1167 {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1168 {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1169 {"category", unicodedata_category, METH_VARARGS,
1170 unicodedata_category__doc__},
1171 {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1172 unicodedata_bidirectional__doc__},
1173 {"combining", unicodedata_combining, METH_VARARGS,
1174 unicodedata_combining__doc__},
1175 {"mirrored", unicodedata_mirrored, METH_VARARGS,
1176 unicodedata_mirrored__doc__},
1177 {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1178 unicodedata_east_asian_width__doc__},
1179 {"decomposition", unicodedata_decomposition, METH_VARARGS,
1180 unicodedata_decomposition__doc__},
1181 {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1182 {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1183 {"normalize", unicodedata_normalize, METH_VARARGS,
1184 unicodedata_normalize__doc__},
1185 {NULL, NULL} /* sentinel */
1188 static PyTypeObject UCD_Type = {
1189 /* The ob_type field must be initialized in the module init function
1190 * to be portable to Windows without using C++. */
1191 PyVarObject_HEAD_INIT(NULL, 0)
1192 "unicodedata.UCD", /*tp_name*/
1193 sizeof(PreviousDBVersion), /*tp_basicsize*/
1194 0, /*tp_itemsize*/
1195 /* methods */
1196 (destructor)PyObject_Del, /*tp_dealloc*/
1197 0, /*tp_print*/
1198 0, /*tp_getattr*/
1199 0, /*tp_setattr*/
1200 0, /*tp_reserved*/
1201 0, /*tp_repr*/
1202 0, /*tp_as_number*/
1203 0, /*tp_as_sequence*/
1204 0, /*tp_as_mapping*/
1205 0, /*tp_hash*/
1206 0, /*tp_call*/
1207 0, /*tp_str*/
1208 PyObject_GenericGetAttr,/*tp_getattro*/
1209 0, /*tp_setattro*/
1210 0, /*tp_as_buffer*/
1211 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1212 0, /*tp_doc*/
1213 0, /*tp_traverse*/
1214 0, /*tp_clear*/
1215 0, /*tp_richcompare*/
1216 0, /*tp_weaklistoffset*/
1217 0, /*tp_iter*/
1218 0, /*tp_iternext*/
1219 unicodedata_functions, /*tp_methods*/
1220 DB_members, /*tp_members*/
1221 0, /*tp_getset*/
1222 0, /*tp_base*/
1223 0, /*tp_dict*/
1224 0, /*tp_descr_get*/
1225 0, /*tp_descr_set*/
1226 0, /*tp_dictoffset*/
1227 0, /*tp_init*/
1228 0, /*tp_alloc*/
1229 0, /*tp_new*/
1230 0, /*tp_free*/
1231 0, /*tp_is_gc*/
1234 PyDoc_STRVAR(unicodedata_docstring,
1235 "This module provides access to the Unicode Character Database which\n\
1236 defines character properties for all Unicode characters. The data in\n\
1237 this database is based on the UnicodeData.txt file version\n\
1238 5.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
1240 The module uses the same names and symbols as defined by the\n\
1241 UnicodeData File Format 5.1.0 (see\n\
1242 http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
1245 static struct PyModuleDef unicodedatamodule = {
1246 PyModuleDef_HEAD_INIT,
1247 "unicodedata",
1248 unicodedata_docstring,
1250 unicodedata_functions,
1251 NULL,
1252 NULL,
1253 NULL,
1254 NULL
1257 PyMODINIT_FUNC
1258 PyInit_unicodedata(void)
1260 PyObject *m, *v;
1262 Py_TYPE(&UCD_Type) = &PyType_Type;
1264 m = PyModule_Create(&unicodedatamodule);
1265 if (!m)
1266 return NULL;
1268 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1269 Py_INCREF(&UCD_Type);
1270 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1272 /* Previous versions */
1273 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1274 if (v != NULL)
1275 PyModule_AddObject(m, "ucd_3_2_0", v);
1277 /* Export C API */
1278 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
1279 if (v != NULL)
1280 PyModule_AddObject(m, "ucnhash_CAPI", v);
1281 return m;
1285 Local variables:
1286 c-basic-offset: 4
1287 indent-tabs-mode: nil
1288 End: