/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 4.1 data base.

   Data was extracted from the Unicode 4.1 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"
/* character properties */

typedef struct {
    const unsigned char category;         /* index into
                                             _PyUnicode_CategoryNames */
    const unsigned char combining;        /* combining class value 0 - 255 */
    const unsigned char bidirectional;    /* index into
                                             _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;         /* true if mirrored in bidir mode */
    const unsigned char east_asian_width; /* index into
                                             _PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord;
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const int numeric_changed;
} change_record;
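/* Note (added for clarity, not in the original source): a change_record
   describes how a character's properties differed in an older Unicode
   version.  The sentinel 0xFF in the *_changed fields means "unchanged",
   while category_changed == 0 marks a code point that was unassigned in
   that older version; the lookup functions below consult these records
   when they are called through a previous-version object such as
   unicodedata.ucd_3_2_0. */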
/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"
static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}
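/* Illustrative note (added, not in the original source): index1/index2 form a
   two-level trie.  The high bits of the code point select a block through
   index1, and the low SHIFT bits select an entry inside that block through
   index2, so identical blocks of property records are stored only once.  For
   example, if SHIFT were 8, code point U+00E9 would use index1[0x00E9 >> 8]
   to find its block and the low byte 0xE9 to pick the record within it; the
   actual SHIFT value is emitted by makeunicodedata.py into unicodedata_db.h. */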
/* ------------- Previous-version API ------------------------------------- */
typedef struct previous_version {
    PyObject_HEAD
    const char *name;
    const change_record* (*getrecord)(Py_UCS4);
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;

#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
static PyMemberDef DB_members[] = {
    {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
    {NULL}
};
/* forward declaration */
static PyTypeObject UCD_Type;
static PyObject*
new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *self;
    self = PyObject_New(PreviousDBVersion, &UCD_Type);
    if (self == NULL)
        return NULL;
    self->name = name;
    self->getrecord = getrecord;
    self->normalization = normalization;
    return (PyObject*)self;
}
static Py_UCS4 getuchar(PyUnicodeObject *obj)
{
    Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);

    if (PyUnicode_GET_SIZE(obj) == 1)
        return *v;
#ifndef Py_UNICODE_WIDE
    else if ((PyUnicode_GET_SIZE(obj) == 2) &&
             (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
             (0xDC00 <= v[1] && v[1] <= 0xDFFF))
        return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
#endif
    PyErr_SetString(PyExc_TypeError,
                    "need a single Unicode character as parameter");
    return (Py_UCS4)-1;
}
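/* Illustrative note (added, not in the original source): on narrow builds a
   character outside the BMP arrives as a surrogate pair.  For example,
   U+1D11E (MUSICAL SYMBOL G CLEF) is passed as 0xD834 0xDD1E, and the
   expression above recombines it as
   ((0xD834 & 0x3FF) << 10 | (0xDD1E & 0x3FF)) + 0x10000 == 0x1D11E. */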
/* --- Module API --------------------------------------------------------- */
PyDoc_STRVAR(unicodedata_decimal__doc__,
"decimal(unichr[, default])\n\
\n\
Returns the decimal value assigned to the Unicode character unichr\n\
as integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");
static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError,
                            "not a decimal");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}
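/* Usage sketch (illustrative, not part of the module source):
 *     >>> import unicodedata
 *     >>> unicodedata.decimal(u'9')
 *     9
 *     >>> unicodedata.decimal(u'a', -1)
 *     -1
 */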
PyDoc_STRVAR(unicodedata_digit__doc__,
"digit(unichr[, default])\n\
\n\
Returns the digit value assigned to the Unicode character unichr as\n\
integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");
static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    long rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    rc = Py_UNICODE_TODIGIT(c);
    if (rc < 0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a digit");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyInt_FromLong(rc);
}
PyDoc_STRVAR(unicodedata_numeric__doc__,
"numeric(unichr[, default])\n\
\n\
Returns the numeric value assigned to the Unicode character unichr\n\
as float. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");
static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    double rc;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1.0;
        }
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }

    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(c);
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}
PyDoc_STRVAR(unicodedata_category__doc__,
"category(unichr)\n\
\n\
Returns the general category assigned to the Unicode character\n\
unichr as string.");
static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:category",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->category;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed != 0xFF)
            index = old->category_changed;
    }
    return PyString_FromString(_PyUnicode_CategoryNames[index]);
}
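/* Usage sketch (illustrative, not part of the module source):
 *     >>> import unicodedata
 *     >>> unicodedata.category(u'A')
 *     'Lu'
 *     >>> unicodedata.category(u'9')
 *     'Nd'
 */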
PyDoc_STRVAR(unicodedata_bidirectional__doc__,
"bidirectional(unichr)\n\
\n\
Returns the bidirectional category assigned to the Unicode character\n\
unichr as string. If no such value is defined, an empty string is\n\
returned.");
static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->bidirectional;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            index = old->bidir_changed;
    }
    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
}
PyDoc_STRVAR(unicodedata_combining__doc__,
"combining(unichr)\n\
\n\
Returns the canonical combining class assigned to the Unicode\n\
character unichr as integer. Returns 0 if no combining class is\n\
defined.");
static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->combining;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}
PyDoc_STRVAR(unicodedata_mirrored__doc__,
"mirrored(unichr)\n\
\n\
Returns the mirrored property assigned to the Unicode character\n\
unichr as integer. Returns 1 if the character has been identified as\n\
a \"mirrored\" character in bidirectional text, 0 otherwise.");
static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->mirrored;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
}
PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
"east_asian_width(unichr)\n\
\n\
Returns the east asian width assigned to the Unicode character\n\
unichr as string.");
static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;
    index = (int) _getrecord_ex(c)->east_asian_width;
    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
}
PyDoc_STRVAR(unicodedata_decomposition__doc__,
"decomposition(unichr)\n\
\n\
Returns the character decomposition mapping assigned to the Unicode\n\
character unichr as string. An empty string is returned in case no\n\
such mapping is defined.");
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count, i;
    unsigned int prefix_index;
    Py_UCS4 c;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    code = (int)c;

    if (self) {
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            return PyString_FromString(""); /* unassigned */
    }

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                              (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (an index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));

    /* copy prefix */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert((size_t)i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);
}
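/* Usage sketch (illustrative, not part of the module source):
 *     >>> import unicodedata
 *     >>> unicodedata.decomposition(u'\u00e9')    # LATIN SMALL LETTER E WITH ACUTE
 *     '0065 0301'
 *     >>> unicodedata.decomposition(u'\u00bd')    # VULGAR FRACTION ONE HALF
 *     '<fraction> 0031 2044 0032'
 */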
static void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    } else if (self && get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
                               (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is number of hex bytes (usually one or two), low byte
       is prefix code (an index into decomp_prefix) */
    *count = decomp_data[*index] >> 8;
    *prefix = decomp_data[*index] & 255;

    *index += 1;
}
/* constants for Hangul syllable (de)composition, from the Unicode standard */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
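/* Worked example (added for clarity, not in the original source): the Hangul
   syllable U+D55C has SIndex = 0xD55C - SBase = 10588, so
   L = SIndex / NCount = 18, V = (SIndex % NCount) / TCount = 0 and
   T = SIndex % TCount = 4; it therefore decomposes into LBase+18 = U+1112,
   VBase+0 = U+1161 and TBase+4 = U+11AB, which is exactly the arithmetic
   used by nfd_nfkd() and _getucname() below. */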
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *end, *o;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UNICODE stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_SIZE(input);
    /* Overallocate at most 10 characters. */
    space = (isize > 10 ? 10 : isize) + isize;
    result = PyUnicode_FromUnicode(NULL, space);
    if (!result)
        return NULL;
    i = PyUnicode_AS_UNICODE(input);
    end = i + isize;
    o = PyUnicode_AS_UNICODE(result);

    while (i < end) {
        stack[stackptr++] = *i++;
        while (stackptr) {
            Py_UNICODE code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
                space += 10;
                if (PyUnicode_Resize(&result, newsize) == -1)
                    return NULL;
                o = PyUnicode_AS_UNICODE(result) + newsize - space;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                *o++ = L;
                *o++ = V;
                space -= 2;
                if (T != TBase) {
                    *o++ = T;
                    space--;
                }
                continue;
            }
            /* normalization changes */
            if (self) {
                Py_UCS4 value = ((PreviousDBVersion *)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                *o++ = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while (count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    /* Drop overallocation. Cannot fail. */
    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);

    /* Sort canonically. */
    i = PyUnicode_AS_UNICODE(result);
    prev = _getrecord_ex(*i)->combining;
    end = i + PyUnicode_GET_SIZE(result);
    for (i++; i < end; i++) {
        cur = _getrecord_ex(*i)->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UNICODE tmp = o[1];
            o[1] = o[0];
            o[0] = tmp;
            o--;
            if (o < PyUnicode_AS_UNICODE(result))
                break;
            prev = _getrecord_ex(*o)->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(*i)->combining;
    }
    return result;
}
static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
{
    int index;
    for (index = 0; nfc[index].start; index++) {
        int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
    int f,l,index,index1,comb;
    Py_UNICODE code;
    Py_UNICODE *skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;

    /* We are going to modify result in-place.
       If nfd_nfkd is changed to sometimes return the input,
       this code needs to be reviewed. */
    assert(result != input);

    i = PyUnicode_AS_UNICODE(result);
    end = i + PyUnicode_GET_SIZE(result);
    o = PyUnicode_AS_UNICODE(result);

  again:
    while (i < end) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        if (LBase <= *i && *i < (LBase+LCount) &&
            i + 1 < end &&
            VBase <= i[1] && i[1] <= (VBase+VCount)) {
            int LIndex, VIndex;
            LIndex = i[0] - LBase;
            VIndex = i[1] - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i += 2;
            if (i < end &&
                TBase <= *i && *i <= (TBase+TCount)) {
                code += *i - TBase;
                i++;
            }
            *o++ = code;
            continue;
        }

        f = find_nfc_index(self, nfc_first, *i);
        if (f == -1) {
            *o++ = *i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i + 1;
        comb = 0;
        while (i1 < end) {
            int comb1 = _getrecord_ex(*i1)->combining;
            if (comb1 && comb == comb1) {
                /* Character is blocked. */
                i1++;
                continue;
            }
            l = find_nfc_index(self, nfc_last, *i1);
            /* *i1 cannot be combined with *i. If *i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            *i = code;
            /* Mark the second character unused. */
            skipped[cskipped++] = i1;
            i1++;
            f = find_nfc_index(self, nfc_first, *i);
            if (f == -1)
                break;
        }
        *o++ = *i++;
    }
    PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
    return result;
}
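/* Usage sketch (illustrative, not part of the module source):
 *     >>> import unicodedata
 *     >>> unicodedata.normalize('NFC', u'e\u0301') == u'\u00e9'
 *     True
 *     >>> unicodedata.normalize('NFC', u'\u1112\u1161\u11ab') == u'\ud55c'
 *     True
 */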
PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
Return the normal form 'form' for the Unicode string unistr. Valid\n\
values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
static PyObject*
unicodedata_normalize(PyObject *self, PyObject *args)
{
    char *form;
    PyObject *input;

    if (!PyArg_ParseTuple(args, "sO!:normalize",
                          &form, &PyUnicode_Type, &input))
        return NULL;

    if (PyUnicode_GetSize(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    if (strcmp(form, "NFC") == 0)
        return nfc_nfkc(self, input, 0);
    if (strcmp(form, "NFKC") == 0)
        return nfc_nfkc(self, input, 1);
    if (strcmp(form, "NFD") == 0)
        return nfd_nfkd(self, input, 0);
    if (strcmp(form, "NFKD") == 0)
        return nfd_nfkd(self, input, 1);
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
}
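/* Usage sketch (illustrative, not part of the module source):
 *     >>> import unicodedata
 *     >>> unicodedata.normalize('NFKC', u'\u2460')    # CIRCLED DIGIT ONE
 *     u'1'
 *     >>> unicodedata.normalize('XYZ', u'abc')
 *     Traceback (most recent call last):
 *       ...
 *     ValueError: invalid normalization form
 */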
/* -------------------------------------------------------------------- */
/* unicode character name tables */

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"

/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
static int
is_unified_ideograph(Py_UCS4 code)
{
    return (
        (0x3400 <= code && code <= 0x4DB5) ||  /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FBB) ||  /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6)); /* CJK Ideograph Extension B */
}
static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    if (self) {
        const change_record *old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon.  the last character in the
           word has bit 7 set.  the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}
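/* Illustrative note (added, not in the original source): _getucname() produces
   algorithmic names such as "HANGUL SYLLABLE HAN" for U+D55C (built from the
   jamo short names "H" + "A" + "N" via the table above) and
   "CJK UNIFIED IDEOGRAPH-4E2D" for U+4E2D; every other name is decoded from
   the phrasebook/lexicon tables in unicodename_db.h. */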
static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
    if (!_getucname(self, code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (toupper(Py_CHARMASK(name[i])) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}
static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        char *s = hangul_syllables[i][column];
        len1 = strlen(s);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1)
        *len = 0;
}
static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int len, L = -1, V = -1, T = -1;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = ~h & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(self, v, name, namelen)) {
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr * 2;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}
=
1027 sizeof(_PyUnicode_Name_CAPI
),
1032 /* -------------------------------------------------------------------- */
1033 /* Python bindings */
1035 PyDoc_STRVAR(unicodedata_name__doc__
,
1036 "name(unichr[, default])\n\
1037 Returns the name assigned to the Unicode character unichr as a\n\
1038 string. If no name is defined, default is returned, or, if not\n\
1039 given, ValueError is raised.");
static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char name[NAME_MAXLEN];
    Py_UCS4 c;

    PyUnicodeObject* v;
    PyObject* defobj = NULL;
    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
        return NULL;

    c = getuchar(v);
    if (c == (Py_UCS4)-1)
        return NULL;

    if (!_getucname(self, c, name, sizeof(name))) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }

    return Py_BuildValue("s", name);
}
PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name. If a character with the\n\
given name is found, return the corresponding Unicode\n\
character. If not found, KeyError is raised.");
static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[2];

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(self, name, namelen, &code)) {
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
                     name);
        return NULL;
    }

#ifndef Py_UNICODE_WIDE
    if (code >= 0x10000) {
        str[0] = 0xd800 + ((code - 0x10000) >> 10);
        str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
        return PyUnicode_FromUnicode(str, 2);
    }
#endif
    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}
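/* Usage sketch (illustrative, not part of the module source):
 *     >>> import unicodedata
 *     >>> unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
 *     u'\xe9'
 *     >>> unicodedata.lookup('NO SUCH NAME')
 *     Traceback (most recent call last):
 *       ...
 *     KeyError: "undefined character name 'NO SUCH NAME'"
 */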
/* XXX Add doc strings. */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
     unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
     unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
     unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
     unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
     unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
     unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
     unicodedata_normalize__doc__},
    {NULL, NULL}                /* sentinel */
};
static PyTypeObject UCD_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyVarObject_HEAD_INIT(NULL, 0)
    "unicodedata.UCD",          /*tp_name*/
    sizeof(PreviousDBVersion),  /*tp_basicsize*/
    0,                          /*tp_itemsize*/
    /* methods */
    (destructor)PyObject_Del,   /*tp_dealloc*/
    0,                          /*tp_print*/
    0,                          /*tp_getattr*/
    0,                          /*tp_setattr*/
    0,                          /*tp_compare*/
    0,                          /*tp_repr*/
    0,                          /*tp_as_number*/
    0,                          /*tp_as_sequence*/
    0,                          /*tp_as_mapping*/
    0,                          /*tp_hash*/
    0,                          /*tp_call*/
    0,                          /*tp_str*/
    PyObject_GenericGetAttr,    /*tp_getattro*/
    0,                          /*tp_setattro*/
    0,                          /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,         /*tp_flags*/
    0,                          /*tp_doc*/
    0,                          /*tp_traverse*/
    0,                          /*tp_clear*/
    0,                          /*tp_richcompare*/
    0,                          /*tp_weaklistoffset*/
    0,                          /*tp_iter*/
    0,                          /*tp_iternext*/
    unicodedata_functions,      /*tp_methods*/
    DB_members,                 /*tp_members*/
    0,                          /*tp_getset*/
    0,                          /*tp_base*/
    0,                          /*tp_dict*/
    0,                          /*tp_descr_get*/
    0,                          /*tp_descr_set*/
    0,                          /*tp_dictoffset*/
    0,                          /*tp_init*/
    0,                          /*tp_alloc*/
    0,                          /*tp_new*/
    0,                          /*tp_free*/
    0,                          /*tp_is_gc*/
};
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
4.1.0 which is publicly available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 4.1.0 (see\n\
http://www.unicode.org/Public/4.1.0/ucd/UCD.html).");
PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    Py_TYPE(&UCD_Type) = &PyType_Type;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    Py_INCREF(&UCD_Type);
    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "ucd_3_2_0", v);

    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
}
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/