/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 5.1 data base.

   Data was extracted from the Unicode 5.1 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"
/* character properties */

/* One record per distinct property combination; characters map to records
   via the index1/index2 tables in unicodedata_db.h. */
typedef struct {
    const unsigned char category;         /* index into
                                             _PyUnicode_CategoryNames */
    const unsigned char combining;        /* combining class value 0 - 255 */
    const unsigned char bidirectional;    /* index into
                                             _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;         /* true if mirrored in bidir mode */
    const unsigned char east_asian_width; /* index into
                                             _PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord;
/* Describes how a character's properties differ in an older Unicode
   version; 0xFF in a *_changed field means "unchanged". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const int numeric_changed;
} change_record;
41 /* data file generated by Tools/unicode/makeunicodedata.py */
42 #include "unicodedata_db.h"
44 static const _PyUnicode_DatabaseRecord
*
45 _getrecord_ex(Py_UCS4 code
)
51 index
= index1
[(code
>>SHIFT
)];
52 index
= index2
[(index
<<SHIFT
)+(code
&((1<<SHIFT
)-1))];
55 return &_PyUnicode_Database_Records
[index
];
58 /* ------------- Previous-version API ------------------------------------- */
59 typedef struct previous_version
{
62 const change_record
* (*getrecord
)(Py_UCS4
);
63 Py_UCS4 (*normalization
)(Py_UCS4
);
66 #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
68 static PyMemberDef DB_members
[] = {
69 {"unidata_version", T_STRING
, offsetof(PreviousDBVersion
, name
), READONLY
},
73 /* forward declaration */
74 static PyTypeObject UCD_Type
;
77 new_previous_version(const char*name
, const change_record
* (*getrecord
)(Py_UCS4
),
78 Py_UCS4 (*normalization
)(Py_UCS4
))
80 PreviousDBVersion
*self
;
81 self
= PyObject_New(PreviousDBVersion
, &UCD_Type
);
85 self
->getrecord
= getrecord
;
86 self
->normalization
= normalization
;
87 return (PyObject
*)self
;
91 static Py_UCS4
getuchar(PyUnicodeObject
*obj
)
93 Py_UNICODE
*v
= PyUnicode_AS_UNICODE(obj
);
95 if (PyUnicode_GET_SIZE(obj
) == 1)
97 #ifndef Py_UNICODE_WIDE
98 else if ((PyUnicode_GET_SIZE(obj
) == 2) &&
99 (0xD800 <= v
[0] && v
[0] <= 0xDBFF) &&
100 (0xDC00 <= v
[1] && v
[1] <= 0xDFFF))
101 return (((v
[0] & 0x3FF)<<10) | (v
[1] & 0x3FF)) + 0x10000;
103 PyErr_SetString(PyExc_TypeError
,
104 "need a single Unicode character as parameter");
108 /* --- Module API --------------------------------------------------------- */
110 PyDoc_STRVAR(unicodedata_decimal__doc__
,
111 "decimal(unichr[, default])\n\
113 Returns the decimal value assigned to the Unicode character unichr\n\
114 as integer. If no such value is defined, default is returned, or, if\n\
115 not given, ValueError is raised.");
118 unicodedata_decimal(PyObject
*self
, PyObject
*args
)
121 PyObject
*defobj
= NULL
;
126 if (!PyArg_ParseTuple(args
, "O!|O:decimal", &PyUnicode_Type
, &v
, &defobj
))
129 if (c
== (Py_UCS4
)-1)
133 const change_record
*old
= get_old_record(self
, c
);
134 if (old
->category_changed
== 0) {
139 else if (old
->decimal_changed
!= 0xFF) {
141 rc
= old
->decimal_changed
;
146 rc
= Py_UNICODE_TODECIMAL(c
);
148 if (defobj
== NULL
) {
149 PyErr_SetString(PyExc_ValueError
,
158 return PyInt_FromLong(rc
);
161 PyDoc_STRVAR(unicodedata_digit__doc__
,
162 "digit(unichr[, default])\n\
164 Returns the digit value assigned to the Unicode character unichr as\n\
165 integer. If no such value is defined, default is returned, or, if\n\
166 not given, ValueError is raised.");
169 unicodedata_digit(PyObject
*self
, PyObject
*args
)
172 PyObject
*defobj
= NULL
;
176 if (!PyArg_ParseTuple(args
, "O!|O:digit", &PyUnicode_Type
, &v
, &defobj
))
179 if (c
== (Py_UCS4
)-1)
181 rc
= Py_UNICODE_TODIGIT(c
);
183 if (defobj
== NULL
) {
184 PyErr_SetString(PyExc_ValueError
, "not a digit");
192 return PyInt_FromLong(rc
);
195 PyDoc_STRVAR(unicodedata_numeric__doc__
,
196 "numeric(unichr[, default])\n\
198 Returns the numeric value assigned to the Unicode character unichr\n\
199 as float. If no such value is defined, default is returned, or, if\n\
200 not given, ValueError is raised.");
203 unicodedata_numeric(PyObject
*self
, PyObject
*args
)
206 PyObject
*defobj
= NULL
;
211 if (!PyArg_ParseTuple(args
, "O!|O:numeric", &PyUnicode_Type
, &v
, &defobj
))
214 if (c
== (Py_UCS4
)-1)
218 const change_record
*old
= get_old_record(self
, c
);
219 if (old
->category_changed
== 0) {
224 else if (old
->decimal_changed
!= 0xFF) {
226 rc
= old
->decimal_changed
;
231 rc
= Py_UNICODE_TONUMERIC(c
);
233 if (defobj
== NULL
) {
234 PyErr_SetString(PyExc_ValueError
, "not a numeric character");
242 return PyFloat_FromDouble(rc
);
245 PyDoc_STRVAR(unicodedata_category__doc__
,
248 Returns the general category assigned to the Unicode character\n\
252 unicodedata_category(PyObject
*self
, PyObject
*args
)
258 if (!PyArg_ParseTuple(args
, "O!:category",
259 &PyUnicode_Type
, &v
))
262 if (c
== (Py_UCS4
)-1)
264 index
= (int) _getrecord_ex(c
)->category
;
266 const change_record
*old
= get_old_record(self
, c
);
267 if (old
->category_changed
!= 0xFF)
268 index
= old
->category_changed
;
270 return PyString_FromString(_PyUnicode_CategoryNames
[index
]);
273 PyDoc_STRVAR(unicodedata_bidirectional__doc__
,
274 "bidirectional(unichr)\n\
276 Returns the bidirectional category assigned to the Unicode character\n\
277 unichr as string. If no such value is defined, an empty string is\n\
281 unicodedata_bidirectional(PyObject
*self
, PyObject
*args
)
287 if (!PyArg_ParseTuple(args
, "O!:bidirectional",
288 &PyUnicode_Type
, &v
))
291 if (c
== (Py_UCS4
)-1)
293 index
= (int) _getrecord_ex(c
)->bidirectional
;
295 const change_record
*old
= get_old_record(self
, c
);
296 if (old
->category_changed
== 0)
297 index
= 0; /* unassigned */
298 else if (old
->bidir_changed
!= 0xFF)
299 index
= old
->bidir_changed
;
301 return PyString_FromString(_PyUnicode_BidirectionalNames
[index
]);
304 PyDoc_STRVAR(unicodedata_combining__doc__
,
305 "combining(unichr)\n\
307 Returns the canonical combining class assigned to the Unicode\n\
308 character unichr as integer. Returns 0 if no combining class is\n\
312 unicodedata_combining(PyObject
*self
, PyObject
*args
)
318 if (!PyArg_ParseTuple(args
, "O!:combining",
319 &PyUnicode_Type
, &v
))
322 if (c
== (Py_UCS4
)-1)
324 index
= (int) _getrecord_ex(c
)->combining
;
326 const change_record
*old
= get_old_record(self
, c
);
327 if (old
->category_changed
== 0)
328 index
= 0; /* unassigned */
330 return PyInt_FromLong(index
);
333 PyDoc_STRVAR(unicodedata_mirrored__doc__
,
336 Returns the mirrored property assigned to the Unicode character\n\
337 unichr as integer. Returns 1 if the character has been identified as\n\
338 a \"mirrored\" character in bidirectional text, 0 otherwise.");
341 unicodedata_mirrored(PyObject
*self
, PyObject
*args
)
347 if (!PyArg_ParseTuple(args
, "O!:mirrored",
348 &PyUnicode_Type
, &v
))
351 if (c
== (Py_UCS4
)-1)
353 index
= (int) _getrecord_ex(c
)->mirrored
;
355 const change_record
*old
= get_old_record(self
, c
);
356 if (old
->category_changed
== 0)
357 index
= 0; /* unassigned */
358 else if (old
->mirrored_changed
!= 0xFF)
359 index
= old
->mirrored_changed
;
361 return PyInt_FromLong(index
);
364 PyDoc_STRVAR(unicodedata_east_asian_width__doc__
,
365 "east_asian_width(unichr)\n\
367 Returns the east asian width assigned to the Unicode character\n\
371 unicodedata_east_asian_width(PyObject
*self
, PyObject
*args
)
377 if (!PyArg_ParseTuple(args
, "O!:east_asian_width",
378 &PyUnicode_Type
, &v
))
381 if (c
== (Py_UCS4
)-1)
383 index
= (int) _getrecord_ex(c
)->east_asian_width
;
385 const change_record
*old
= get_old_record(self
, c
);
386 if (old
->category_changed
== 0)
387 index
= 0; /* unassigned */
389 return PyString_FromString(_PyUnicode_EastAsianWidthNames
[index
]);
392 PyDoc_STRVAR(unicodedata_decomposition__doc__
,
393 "decomposition(unichr)\n\
395 Returns the character decomposition mapping assigned to the Unicode\n\
396 character unichr as string. An empty string is returned in case no\n\
397 such mapping is defined.");
400 unicodedata_decomposition(PyObject
*self
, PyObject
*args
)
404 int code
, index
, count
, i
;
405 unsigned int prefix_index
;
408 if (!PyArg_ParseTuple(args
, "O!:decomposition",
409 &PyUnicode_Type
, &v
))
412 if (c
== (Py_UCS4
)-1)
418 const change_record
*old
= get_old_record(self
, c
);
419 if (old
->category_changed
== 0)
420 return PyString_FromString(""); /* unassigned */
423 if (code
< 0 || code
>= 0x110000)
426 index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
427 index
= decomp_index2
[(index
<<DECOMP_SHIFT
)+
428 (code
&((1<<DECOMP_SHIFT
)-1))];
431 /* high byte is number of hex bytes (usually one or two), low byte
432 is prefix code (from*/
433 count
= decomp_data
[index
] >> 8;
435 /* XXX: could allocate the PyString up front instead
436 (strlen(prefix) + 5 * count + 1 bytes) */
438 /* Based on how index is calculated above and decomp_data is generated
439 from Tools/unicode/makeunicodedata.py, it should not be possible
440 to overflow decomp_prefix. */
441 prefix_index
= decomp_data
[index
] & 255;
442 assert(prefix_index
< (sizeof(decomp_prefix
)/sizeof(*decomp_prefix
)));
445 i
= strlen(decomp_prefix
[prefix_index
]);
446 memcpy(decomp
, decomp_prefix
[prefix_index
], i
);
448 while (count
-- > 0) {
451 assert((size_t)i
< sizeof(decomp
));
452 PyOS_snprintf(decomp
+ i
, sizeof(decomp
) - i
, "%04X",
453 decomp_data
[++index
]);
454 i
+= strlen(decomp
+ i
);
459 return PyString_FromString(decomp
);
463 get_decomp_record(PyObject
*self
, Py_UCS4 code
, int *index
, int *prefix
, int *count
)
465 if (code
>= 0x110000) {
467 } else if (self
&& get_old_record(self
, code
)->category_changed
==0) {
468 /* unassigned in old version */
472 *index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
473 *index
= decomp_index2
[(*index
<<DECOMP_SHIFT
)+
474 (code
&((1<<DECOMP_SHIFT
)-1))];
477 /* high byte is number of hex bytes (usually one or two), low byte
478 is prefix code (from*/
479 *count
= decomp_data
[*index
] >> 8;
480 *prefix
= decomp_data
[*index
] & 255;
/* Hangul syllable composition constants (Unicode ch. 3.12).
   NOTE(review): SBase/LBase/VBase/TBase/LCount/VCount/TCount are used
   below; their definitions were presumably adjacent — these are the
   standard Unicode values. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount (VCount*TCount)
#define SCount (LCount*NCount)
496 nfd_nfkd(PyObject
*self
, PyObject
*input
, int k
)
499 Py_UNICODE
*i
, *end
, *o
;
500 /* Longest decomposition in Unicode 3.2: U+FDFA */
501 Py_UNICODE stack
[20];
502 Py_ssize_t space
, isize
;
503 int index
, prefix
, count
, stackptr
;
504 unsigned char prev
, cur
;
507 isize
= PyUnicode_GET_SIZE(input
);
508 /* Overallocate atmost 10 characters. */
509 space
= (isize
> 10 ? 10 : isize
) + isize
;
510 result
= PyUnicode_FromUnicode(NULL
, space
);
513 i
= PyUnicode_AS_UNICODE(input
);
515 o
= PyUnicode_AS_UNICODE(result
);
518 stack
[stackptr
++] = *i
++;
520 Py_UNICODE code
= stack
[--stackptr
];
521 /* Hangul Decomposition adds three characters in
522 a single step, so we need atleast that much room. */
524 Py_ssize_t newsize
= PyString_GET_SIZE(result
) + 10;
526 if (PyUnicode_Resize(&result
, newsize
) == -1)
528 o
= PyUnicode_AS_UNICODE(result
) + newsize
- space
;
530 /* Hangul Decomposition. */
531 if (SBase
<= code
&& code
< (SBase
+SCount
)) {
532 int SIndex
= code
- SBase
;
533 int L
= LBase
+ SIndex
/ NCount
;
534 int V
= VBase
+ (SIndex
% NCount
) / TCount
;
535 int T
= TBase
+ SIndex
% TCount
;
545 /* normalization changes */
547 Py_UCS4 value
= ((PreviousDBVersion
*)self
)->normalization(code
);
549 stack
[stackptr
++] = value
;
554 /* Other decompositions. */
555 get_decomp_record(self
, code
, &index
, &prefix
, &count
);
557 /* Copy character if it is not decomposable, or has a
558 compatibility decomposition, but we do NFD. */
559 if (!count
|| (prefix
&& !k
)) {
564 /* Copy decomposition onto the stack, in reverse
567 code
= decomp_data
[index
+ (--count
)];
568 stack
[stackptr
++] = code
;
573 /* Drop overallocation. Cannot fail. */
574 PyUnicode_Resize(&result
, PyUnicode_GET_SIZE(result
) - space
);
576 /* Sort canonically. */
577 i
= PyUnicode_AS_UNICODE(result
);
578 prev
= _getrecord_ex(*i
)->combining
;
579 end
= i
+ PyUnicode_GET_SIZE(result
);
580 for (i
++; i
< end
; i
++) {
581 cur
= _getrecord_ex(*i
)->combining
;
582 if (prev
== 0 || cur
== 0 || prev
<= cur
) {
586 /* Non-canonical order. Need to switch *i with previous. */
589 Py_UNICODE tmp
= o
[1];
593 if (o
< PyUnicode_AS_UNICODE(result
))
595 prev
= _getrecord_ex(*o
)->combining
;
596 if (prev
== 0 || prev
<= cur
)
599 prev
= _getrecord_ex(*i
)->combining
;
605 find_nfc_index(PyObject
*self
, struct reindex
* nfc
, Py_UNICODE code
)
608 for (index
= 0; nfc
[index
].start
; index
++) {
609 int start
= nfc
[index
].start
;
612 if (code
<= start
+ nfc
[index
].count
) {
613 int delta
= code
- start
;
614 return nfc
[index
].index
+ delta
;
621 nfc_nfkc(PyObject
*self
, PyObject
*input
, int k
)
624 Py_UNICODE
*i
, *i1
, *o
, *end
;
625 int f
,l
,index
,index1
,comb
;
627 Py_UNICODE
*skipped
[20];
630 result
= nfd_nfkd(self
, input
, k
);
634 /* We are going to modify result in-place.
635 If nfd_nfkd is changed to sometimes return the input,
636 this code needs to be reviewed. */
637 assert(result
!= input
);
639 i
= PyUnicode_AS_UNICODE(result
);
640 end
= i
+ PyUnicode_GET_SIZE(result
);
641 o
= PyUnicode_AS_UNICODE(result
);
645 for (index
= 0; index
< cskipped
; index
++) {
646 if (skipped
[index
] == i
) {
647 /* *i character is skipped.
649 skipped
[index
] = skipped
[cskipped
-1];
652 goto again
; /* continue while */
655 /* Hangul Composition. We don't need to check for <LV,T>
656 pairs, since we always have decomposed data. */
657 if (LBase
<= *i
&& *i
< (LBase
+LCount
) &&
659 VBase
<= i
[1] && i
[1] <= (VBase
+VCount
)) {
661 LIndex
= i
[0] - LBase
;
662 VIndex
= i
[1] - VBase
;
663 code
= SBase
+ (LIndex
*VCount
+VIndex
)*TCount
;
666 TBase
<= *i
&& *i
<= (TBase
+TCount
)) {
674 f
= find_nfc_index(self
, nfc_first
, *i
);
679 /* Find next unblocked character. */
683 int comb1
= _getrecord_ex(*i1
)->combining
;
684 if (comb1
&& comb
== comb1
) {
685 /* Character is blocked. */
689 l
= find_nfc_index(self
, nfc_last
, *i1
);
690 /* *i1 cannot be combined with *i. If *i1
691 is a starter, we don't need to look further.
692 Otherwise, record the combining class. */
701 index
= f
*TOTAL_LAST
+ l
;
702 index1
= comp_index
[index
>> COMP_SHIFT
];
703 code
= comp_data
[(index1
<<COMP_SHIFT
)+
704 (index
&((1<<COMP_SHIFT
)-1))];
708 /* Replace the original character. */
710 /* Mark the second character unused. */
711 skipped
[cskipped
++] = i1
;
713 f
= find_nfc_index(self
, nfc_first
, *i
);
720 PyUnicode_Resize(&result
, o
- PyUnicode_AS_UNICODE(result
));
724 PyDoc_STRVAR(unicodedata_normalize__doc__
,
725 "normalize(form, unistr)\n\
727 Return the normal form 'form' for the Unicode string unistr. Valid\n\
728 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
731 unicodedata_normalize(PyObject
*self
, PyObject
*args
)
736 if(!PyArg_ParseTuple(args
, "sO!:normalize",
737 &form
, &PyUnicode_Type
, &input
))
740 if (PyUnicode_GetSize(input
) == 0) {
741 /* Special case empty input strings, since resizing
742 them later would cause internal errors. */
747 if (strcmp(form
, "NFC") == 0)
748 return nfc_nfkc(self
, input
, 0);
749 if (strcmp(form
, "NFKC") == 0)
750 return nfc_nfkc(self
, input
, 1);
751 if (strcmp(form
, "NFD") == 0)
752 return nfd_nfkd(self
, input
, 0);
753 if (strcmp(form
, "NFKD") == 0)
754 return nfd_nfkd(self
, input
, 1);
755 PyErr_SetString(PyExc_ValueError
, "invalid normalization form");
759 /* -------------------------------------------------------------------- */
760 /* unicode character name tables */
762 /* data file generated by Tools/unicode/makeunicodedata.py */
763 #include "unicodename_db.h"
765 /* -------------------------------------------------------------------- */
766 /* database code (cut and pasted from the unidb package) */
/* Case-insensitive rolling hash over a character-name string; must match
   the hash used by Tools/unicode/makeunicodedata.py to build code_hash. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            /* fold the overflowing top byte back into the low 24 bits */
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
/* Jamo short names, one row per index: column 0 = leading consonant (L),
   column 1 = vowel (V), column 2 = trailing consonant (T). */
static char *hangul_syllables[][3] = {
    {"G",  "A",   ""},
    {"GG", "AE",  "G"},
    {"N",  "YA",  "GG"},
    {"D",  "YAE", "GS"},
    {"DD", "EO",  "N"},
    {"R",  "E",   "NJ"},
    {"M",  "YEO", "NH"},
    {"B",  "YE",  "D"},
    {"BB", "O",   "L"},
    {"S",  "WA",  "LG"},
    {"SS", "WAE", "LM"},
    {"",   "OE",  "LB"},
    {"J",  "YO",  "LS"},
    {"JJ", "U",   "LT"},
    {"C",  "WEO", "LP"},
    {"K",  "WE",  "LH"},
    {"T",  "WI",  "M"},
    {"P",  "YU",  "B"},
    {"H",  "EU",  "BS"},
    {0,    "YI",  "S"},
    {0,    "I",   "SS"},
    {0,    0,     "NG"},
    {0,    0,     "J"},
    {0,    0,     "C"},
    {0,    0,     "K"},
    {0,    0,     "T"},
    {0,    0,     "P"},
    {0,    0,     "H"}
};
815 is_unified_ideograph(Py_UCS4 code
)
818 (0x3400 <= code
&& code
<= 0x4DB5) || /* CJK Ideograph Extension A */
819 (0x4E00 <= code
&& code
<= 0x9FBB) || /* CJK Ideograph */
820 (0x20000 <= code
&& code
<= 0x2A6D6));/* CJK Ideograph Extension B */
824 _getucname(PyObject
*self
, Py_UCS4 code
, char* buffer
, int buflen
)
831 if (code
>= 0x110000)
835 const change_record
*old
= get_old_record(self
, code
);
836 if (old
->category_changed
== 0) {
842 if (SBase
<= code
&& code
< SBase
+SCount
) {
843 /* Hangul syllable. */
844 int SIndex
= code
- SBase
;
845 int L
= SIndex
/ NCount
;
846 int V
= (SIndex
% NCount
) / TCount
;
847 int T
= SIndex
% TCount
;
850 /* Worst case: HANGUL SYLLABLE <10chars>. */
852 strcpy(buffer
, "HANGUL SYLLABLE ");
854 strcpy(buffer
, hangul_syllables
[L
][0]);
855 buffer
+= strlen(hangul_syllables
[L
][0]);
856 strcpy(buffer
, hangul_syllables
[V
][1]);
857 buffer
+= strlen(hangul_syllables
[V
][1]);
858 strcpy(buffer
, hangul_syllables
[T
][2]);
859 buffer
+= strlen(hangul_syllables
[T
][2]);
864 if (is_unified_ideograph(code
)) {
866 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
868 sprintf(buffer
, "CJK UNIFIED IDEOGRAPH-%X", code
);
872 /* get offset into phrasebook */
873 offset
= phrasebook_offset1
[(code
>>phrasebook_shift
)];
874 offset
= phrasebook_offset2
[(offset
<<phrasebook_shift
) +
875 (code
&((1<<phrasebook_shift
)-1))];
883 word
= phrasebook
[offset
] - phrasebook_short
;
885 word
= (word
<< 8) + phrasebook
[offset
+1];
888 word
= phrasebook
[offset
++];
891 return 0; /* buffer overflow */
894 /* copy word string from lexicon. the last character in the
895 word has bit 7 set. the last word in a string ends with
897 w
= lexicon
+ lexicon_offset
[word
];
900 return 0; /* buffer overflow */
904 return 0; /* buffer overflow */
905 buffer
[i
++] = *w
& 127;
907 break; /* end of word */
914 _cmpname(PyObject
*self
, int code
, const char* name
, int namelen
)
916 /* check if code corresponds to the given name */
918 char buffer
[NAME_MAXLEN
];
919 if (!_getucname(self
, code
, buffer
, sizeof(buffer
)))
921 for (i
= 0; i
< namelen
; i
++) {
922 if (toupper(Py_CHARMASK(name
[i
])) != buffer
[i
])
925 return buffer
[namelen
] == '\0';
929 find_syllable(const char *str
, int *len
, int *pos
, int count
, int column
)
933 for (i
= 0; i
< count
; i
++) {
934 char *s
= hangul_syllables
[i
][column
];
938 if (strncmp(str
, s
, len1
) == 0) {
949 _getcode(PyObject
* self
, const char* name
, int namelen
, Py_UCS4
* code
)
952 unsigned int mask
= code_size
-1;
953 unsigned int i
, incr
;
955 /* Check for hangul syllables. */
956 if (strncmp(name
, "HANGUL SYLLABLE ", 16) == 0) {
957 int len
, L
= -1, V
= -1, T
= -1;
958 const char *pos
= name
+ 16;
959 find_syllable(pos
, &len
, &L
, LCount
, 0);
961 find_syllable(pos
, &len
, &V
, VCount
, 1);
963 find_syllable(pos
, &len
, &T
, TCount
, 2);
965 if (L
!= -1 && V
!= -1 && T
!= -1 && pos
-name
== namelen
) {
966 *code
= SBase
+ (L
*VCount
+V
)*TCount
+ T
;
969 /* Otherwise, it's an illegal syllable name. */
973 /* Check for unified ideographs. */
974 if (strncmp(name
, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
975 /* Four or five hexdigits must follow. */
979 if (namelen
!= 4 && namelen
!= 5)
983 if (*name
>= '0' && *name
<= '9')
985 else if (*name
>= 'A' && *name
<= 'F')
986 v
+= *name
- 'A' + 10;
991 if (!is_unified_ideograph(v
))
997 /* the following is the same as python's dictionary lookup, with
998 only minor changes. see the makeunicodedata script for more
1001 h
= (unsigned int) _gethash(name
, namelen
, code_magic
);
1006 if (_cmpname(self
, v
, name
, namelen
)) {
1010 incr
= (h
^ (h
>> 3)) & mask
;
1014 i
= (i
+ incr
) & mask
;
1018 if (_cmpname(self
, v
, name
, namelen
)) {
1024 incr
= incr
^ code_poly
;
1028 static const _PyUnicode_Name_CAPI hashAPI
=
1030 sizeof(_PyUnicode_Name_CAPI
),
1035 /* -------------------------------------------------------------------- */
1036 /* Python bindings */
1038 PyDoc_STRVAR(unicodedata_name__doc__
,
1039 "name(unichr[, default])\n\
1040 Returns the name assigned to the Unicode character unichr as a\n\
1041 string. If no name is defined, default is returned, or, if not\n\
1042 given, ValueError is raised.");
1045 unicodedata_name(PyObject
* self
, PyObject
* args
)
1047 char name
[NAME_MAXLEN
];
1051 PyObject
* defobj
= NULL
;
1052 if (!PyArg_ParseTuple(args
, "O!|O:name", &PyUnicode_Type
, &v
, &defobj
))
1056 if (c
== (Py_UCS4
)-1)
1059 if (!_getucname(self
, c
, name
, sizeof(name
))) {
1060 if (defobj
== NULL
) {
1061 PyErr_SetString(PyExc_ValueError
, "no such name");
1070 return Py_BuildValue("s", name
);
1073 PyDoc_STRVAR(unicodedata_lookup__doc__
,
1076 Look up character by name. If a character with the\n\
1077 given name is found, return the corresponding Unicode\n\
1078 character. If not found, KeyError is raised.");
1081 unicodedata_lookup(PyObject
* self
, PyObject
* args
)
1088 if (!PyArg_ParseTuple(args
, "s#:lookup", &name
, &namelen
))
1091 if (!_getcode(self
, name
, namelen
, &code
)) {
1092 PyErr_Format(PyExc_KeyError
, "undefined character name '%s'",
1097 #ifndef Py_UNICODE_WIDE
1098 if (code
>= 0x10000) {
1099 str
[0] = 0xd800 + ((code
- 0x10000) >> 10);
1100 str
[1] = 0xdc00 + ((code
- 0x10000) & 0x3ff);
1101 return PyUnicode_FromUnicode(str
, 2);
1104 str
[0] = (Py_UNICODE
) code
;
1105 return PyUnicode_FromUnicode(str
, 1);
1108 /* XXX Add doc strings. */
1110 static PyMethodDef unicodedata_functions
[] = {
1111 {"decimal", unicodedata_decimal
, METH_VARARGS
, unicodedata_decimal__doc__
},
1112 {"digit", unicodedata_digit
, METH_VARARGS
, unicodedata_digit__doc__
},
1113 {"numeric", unicodedata_numeric
, METH_VARARGS
, unicodedata_numeric__doc__
},
1114 {"category", unicodedata_category
, METH_VARARGS
,
1115 unicodedata_category__doc__
},
1116 {"bidirectional", unicodedata_bidirectional
, METH_VARARGS
,
1117 unicodedata_bidirectional__doc__
},
1118 {"combining", unicodedata_combining
, METH_VARARGS
,
1119 unicodedata_combining__doc__
},
1120 {"mirrored", unicodedata_mirrored
, METH_VARARGS
,
1121 unicodedata_mirrored__doc__
},
1122 {"east_asian_width", unicodedata_east_asian_width
, METH_VARARGS
,
1123 unicodedata_east_asian_width__doc__
},
1124 {"decomposition", unicodedata_decomposition
, METH_VARARGS
,
1125 unicodedata_decomposition__doc__
},
1126 {"name", unicodedata_name
, METH_VARARGS
, unicodedata_name__doc__
},
1127 {"lookup", unicodedata_lookup
, METH_VARARGS
, unicodedata_lookup__doc__
},
1128 {"normalize", unicodedata_normalize
, METH_VARARGS
,
1129 unicodedata_normalize__doc__
},
1130 {NULL
, NULL
} /* sentinel */
1133 static PyTypeObject UCD_Type
= {
1134 /* The ob_type field must be initialized in the module init function
1135 * to be portable to Windows without using C++. */
1136 PyVarObject_HEAD_INIT(NULL
, 0)
1137 "unicodedata.UCD", /*tp_name*/
1138 sizeof(PreviousDBVersion
), /*tp_basicsize*/
1141 (destructor
)PyObject_Del
, /*tp_dealloc*/
1148 0, /*tp_as_sequence*/
1149 0, /*tp_as_mapping*/
1153 PyObject_GenericGetAttr
,/*tp_getattro*/
1156 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
1160 0, /*tp_richcompare*/
1161 0, /*tp_weaklistoffset*/
1164 unicodedata_functions
, /*tp_methods*/
1165 DB_members
, /*tp_members*/
1171 0, /*tp_dictoffset*/
1179 PyDoc_STRVAR(unicodedata_docstring
,
1180 "This module provides access to the Unicode Character Database which\n\
1181 defines character properties for all Unicode characters. The data in\n\
1182 this database is based on the UnicodeData.txt file version\n\
1183 5.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
1185 The module uses the same names and symbols as defined by the\n\
1186 UnicodeData File Format 5.1.0 (see\n\
1187 http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
1190 initunicodedata(void)
1194 Py_TYPE(&UCD_Type
) = &PyType_Type
;
1197 "unicodedata", unicodedata_functions
, unicodedata_docstring
);
1201 PyModule_AddStringConstant(m
, "unidata_version", UNIDATA_VERSION
);
1202 Py_INCREF(&UCD_Type
);
1203 PyModule_AddObject(m
, "UCD", (PyObject
*)&UCD_Type
);
1205 /* Previous versions */
1206 v
= new_previous_version("3.2.0", get_change_3_2_0
, normalization_3_2_0
);
1208 PyModule_AddObject(m
, "ucd_3_2_0", v
);
1211 v
= PyCObject_FromVoidPtr((void *) &hashAPI
, NULL
);
1213 PyModule_AddObject(m
, "ucnhash_CAPI", v
);
1219 indent-tabs-mode: nil