1 /* ------------------------------------------------------------------------
3 unicodedata -- Provides access to the Unicode 4.1 data base.
5 Data was extracted from the Unicode 4.1 UnicodeData.txt file.
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
11 Copyright (c) Corporation for National Research Initiatives.
13 ------------------------------------------------------------------------ */
17 #include "structmember.h"
/* character properties */

/* One record of the compiled Unicode property database.  Most fields are
   small indices into the shared name tables emitted by
   Tools/unicode/makeunicodedata.py. */
typedef struct {
    const unsigned char category;           /* index into
                                               _PyUnicode_CategoryNames */
    const unsigned char combining;          /* combining class value 0 - 255 */
    const unsigned char bidirectional;      /* index into
                                               _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;           /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;   /* index into
                                               _PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord;
/* Describes how one character's properties differ in an older Unicode
   version; used by the "previous version" (ucd_X_Y_Z) objects.  A field
   value of 0xFF means "unchanged"; category_changed == 0 means the
   character was unassigned in the old version. */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;      /* old bidir index, or 0xFF */
    const unsigned char category_changed;   /* old category index, or 0xFF */
    const unsigned char decimal_changed;    /* old decimal value, or 0xFF */
    const int numeric_changed;              /* numeric-value change marker;
                                               see merge_old_version */
} change_record;
40 /* data file generated by Tools/unicode/makeunicodedata.py */
41 #include "unicodedata_db.h"
43 static const _PyUnicode_DatabaseRecord
*
44 _getrecord_ex(Py_UCS4 code
)
50 index
= index1
[(code
>>SHIFT
)];
51 index
= index2
[(index
<<SHIFT
)+(code
&((1<<SHIFT
)-1))];
54 return &_PyUnicode_Database_Records
[index
];
57 static const _PyUnicode_DatabaseRecord
*
58 _getrecord(PyUnicodeObject
* v
)
60 return _getrecord_ex(*PyUnicode_AS_UNICODE(v
));
63 /* ------------- Previous-version API ------------------------------------- */
64 typedef struct previous_version
{
67 const change_record
* (*getrecord
)(Py_UCS4
);
68 Py_UCS4 (*normalization
)(Py_UCS4
);
71 #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
73 static PyMemberDef DB_members
[] = {
74 {"unidata_version", T_STRING
, offsetof(PreviousDBVersion
, name
), READONLY
},
78 /* forward declaration */
79 static PyTypeObject UCD_Type
;
82 new_previous_version(const char*name
, const change_record
* (*getrecord
)(Py_UCS4
),
83 Py_UCS4 (*normalization
)(Py_UCS4
))
85 PreviousDBVersion
*self
;
86 self
= PyObject_New(PreviousDBVersion
, &UCD_Type
);
90 self
->getrecord
= getrecord
;
91 self
->normalization
= normalization
;
92 return (PyObject
*)self
;
95 /* --- Module API --------------------------------------------------------- */
97 PyDoc_STRVAR(unicodedata_decimal__doc__
,
98 "decimal(unichr[, default])\n\
100 Returns the decimal value assigned to the Unicode character unichr\n\
101 as integer. If no such value is defined, default is returned, or, if\n\
102 not given, ValueError is raised.");
105 unicodedata_decimal(PyObject
*self
, PyObject
*args
)
108 PyObject
*defobj
= NULL
;
112 if (!PyArg_ParseTuple(args
, "O!|O:decimal", &PyUnicode_Type
, &v
, &defobj
))
114 if (PyUnicode_GET_SIZE(v
) != 1) {
115 PyErr_SetString(PyExc_TypeError
,
116 "need a single Unicode character as parameter");
121 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
122 if (old
->category_changed
== 0) {
127 else if (old
->decimal_changed
!= 0xFF) {
129 rc
= old
->decimal_changed
;
134 rc
= Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v
));
136 if (defobj
== NULL
) {
137 PyErr_SetString(PyExc_ValueError
,
146 return PyInt_FromLong(rc
);
149 PyDoc_STRVAR(unicodedata_digit__doc__
,
150 "digit(unichr[, default])\n\
152 Returns the digit value assigned to the Unicode character unichr as\n\
153 integer. If no such value is defined, default is returned, or, if\n\
154 not given, ValueError is raised.");
157 unicodedata_digit(PyObject
*self
, PyObject
*args
)
160 PyObject
*defobj
= NULL
;
163 if (!PyArg_ParseTuple(args
, "O!|O:digit", &PyUnicode_Type
, &v
, &defobj
))
165 if (PyUnicode_GET_SIZE(v
) != 1) {
166 PyErr_SetString(PyExc_TypeError
,
167 "need a single Unicode character as parameter");
170 rc
= Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v
));
172 if (defobj
== NULL
) {
173 PyErr_SetString(PyExc_ValueError
, "not a digit");
181 return PyInt_FromLong(rc
);
184 PyDoc_STRVAR(unicodedata_numeric__doc__
,
185 "numeric(unichr[, default])\n\
187 Returns the numeric value assigned to the Unicode character unichr\n\
188 as float. If no such value is defined, default is returned, or, if\n\
189 not given, ValueError is raised.");
192 unicodedata_numeric(PyObject
*self
, PyObject
*args
)
195 PyObject
*defobj
= NULL
;
199 if (!PyArg_ParseTuple(args
, "O!|O:numeric", &PyUnicode_Type
, &v
, &defobj
))
201 if (PyUnicode_GET_SIZE(v
) != 1) {
202 PyErr_SetString(PyExc_TypeError
,
203 "need a single Unicode character as parameter");
208 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
209 if (old
->category_changed
== 0) {
214 else if (old
->decimal_changed
!= 0xFF) {
216 rc
= old
->decimal_changed
;
221 rc
= Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v
));
223 if (defobj
== NULL
) {
224 PyErr_SetString(PyExc_ValueError
, "not a numeric character");
232 return PyFloat_FromDouble(rc
);
235 PyDoc_STRVAR(unicodedata_category__doc__
,
238 Returns the general category assigned to the Unicode character\n\
242 unicodedata_category(PyObject
*self
, PyObject
*args
)
247 if (!PyArg_ParseTuple(args
, "O!:category",
248 &PyUnicode_Type
, &v
))
250 if (PyUnicode_GET_SIZE(v
) != 1) {
251 PyErr_SetString(PyExc_TypeError
,
252 "need a single Unicode character as parameter");
255 index
= (int) _getrecord(v
)->category
;
257 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
258 if (old
->category_changed
!= 0xFF)
259 index
= old
->category_changed
;
261 return PyString_FromString(_PyUnicode_CategoryNames
[index
]);
264 PyDoc_STRVAR(unicodedata_bidirectional__doc__
,
265 "bidirectional(unichr)\n\
267 Returns the bidirectional category assigned to the Unicode character\n\
268 unichr as string. If no such value is defined, an empty string is\n\
272 unicodedata_bidirectional(PyObject
*self
, PyObject
*args
)
277 if (!PyArg_ParseTuple(args
, "O!:bidirectional",
278 &PyUnicode_Type
, &v
))
280 if (PyUnicode_GET_SIZE(v
) != 1) {
281 PyErr_SetString(PyExc_TypeError
,
282 "need a single Unicode character as parameter");
285 index
= (int) _getrecord(v
)->bidirectional
;
287 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
288 if (old
->category_changed
== 0)
289 index
= 0; /* unassigned */
290 else if (old
->bidir_changed
!= 0xFF)
291 index
= old
->bidir_changed
;
293 return PyString_FromString(_PyUnicode_BidirectionalNames
[index
]);
296 PyDoc_STRVAR(unicodedata_combining__doc__
,
297 "combining(unichr)\n\
299 Returns the canonical combining class assigned to the Unicode\n\
300 character unichr as integer. Returns 0 if no combining class is\n\
304 unicodedata_combining(PyObject
*self
, PyObject
*args
)
309 if (!PyArg_ParseTuple(args
, "O!:combining",
310 &PyUnicode_Type
, &v
))
312 if (PyUnicode_GET_SIZE(v
) != 1) {
313 PyErr_SetString(PyExc_TypeError
,
314 "need a single Unicode character as parameter");
317 index
= (int) _getrecord(v
)->combining
;
319 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
320 if (old
->category_changed
== 0)
321 index
= 0; /* unassigned */
323 return PyInt_FromLong(index
);
326 PyDoc_STRVAR(unicodedata_mirrored__doc__
,
329 Returns the mirrored property assigned to the Unicode character\n\
330 unichr as integer. Returns 1 if the character has been identified as\n\
331 a \"mirrored\" character in bidirectional text, 0 otherwise.");
334 unicodedata_mirrored(PyObject
*self
, PyObject
*args
)
339 if (!PyArg_ParseTuple(args
, "O!:mirrored",
340 &PyUnicode_Type
, &v
))
342 if (PyUnicode_GET_SIZE(v
) != 1) {
343 PyErr_SetString(PyExc_TypeError
,
344 "need a single Unicode character as parameter");
347 index
= (int) _getrecord(v
)->mirrored
;
349 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
350 if (old
->category_changed
== 0)
351 index
= 0; /* unassigned */
353 return PyInt_FromLong(index
);
356 PyDoc_STRVAR(unicodedata_east_asian_width__doc__
,
357 "east_asian_width(unichr)\n\
359 Returns the east asian width assigned to the Unicode character\n\
363 unicodedata_east_asian_width(PyObject
*self
, PyObject
*args
)
368 if (!PyArg_ParseTuple(args
, "O!:east_asian_width",
369 &PyUnicode_Type
, &v
))
371 if (PyUnicode_GET_SIZE(v
) != 1) {
372 PyErr_SetString(PyExc_TypeError
,
373 "need a single Unicode character as parameter");
376 index
= (int) _getrecord(v
)->east_asian_width
;
378 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
379 if (old
->category_changed
== 0)
380 index
= 0; /* unassigned */
382 return PyString_FromString(_PyUnicode_EastAsianWidthNames
[index
]);
385 PyDoc_STRVAR(unicodedata_decomposition__doc__
,
386 "decomposition(unichr)\n\
388 Returns the character decomposition mapping assigned to the Unicode\n\
389 character unichr as string. An empty string is returned in case no\n\
390 such mapping is defined.");
393 unicodedata_decomposition(PyObject
*self
, PyObject
*args
)
397 int code
, index
, count
, i
;
398 unsigned int prefix_index
;
400 if (!PyArg_ParseTuple(args
, "O!:decomposition",
401 &PyUnicode_Type
, &v
))
403 if (PyUnicode_GET_SIZE(v
) != 1) {
404 PyErr_SetString(PyExc_TypeError
,
405 "need a single Unicode character as parameter");
409 code
= (int) *PyUnicode_AS_UNICODE(v
);
412 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
413 if (old
->category_changed
== 0)
414 return PyString_FromString(""); /* unassigned */
417 if (code
< 0 || code
>= 0x110000)
420 index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
421 index
= decomp_index2
[(index
<<DECOMP_SHIFT
)+
422 (code
&((1<<DECOMP_SHIFT
)-1))];
425 /* high byte is number of hex bytes (usually one or two), low byte
426 is prefix code (from*/
427 count
= decomp_data
[index
] >> 8;
429 /* XXX: could allocate the PyString up front instead
430 (strlen(prefix) + 5 * count + 1 bytes) */
432 /* Based on how index is calculated above and decomp_data is generated
433 from Tools/unicode/makeunicodedata.py, it should not be possible
434 to overflow decomp_prefix. */
435 prefix_index
= decomp_data
[index
] & 255;
436 assert(prefix_index
< (sizeof(decomp_prefix
)/sizeof(*decomp_prefix
)));
439 i
= strlen(decomp_prefix
[prefix_index
]);
440 memcpy(decomp
, decomp_prefix
[prefix_index
], i
);
442 while (count
-- > 0) {
445 assert((size_t)i
< sizeof(decomp
));
446 PyOS_snprintf(decomp
+ i
, sizeof(decomp
) - i
, "%04X",
447 decomp_data
[++index
]);
448 i
+= strlen(decomp
+ i
);
453 return PyString_FromString(decomp
);
457 get_decomp_record(PyObject
*self
, Py_UCS4 code
, int *index
, int *prefix
, int *count
)
459 if (code
>= 0x110000) {
461 } else if (self
&& get_old_record(self
, code
)->category_changed
==0) {
462 /* unassigned in old version */
466 *index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
467 *index
= decomp_index2
[(*index
<<DECOMP_SHIFT
)+
468 (code
&((1<<DECOMP_SHIFT
)-1))];
471 /* high byte is number of hex bytes (usually one or two), low byte
472 is prefix code (from*/
473 *count
= decomp_data
[*index
] >> 8;
474 *prefix
= decomp_data
[*index
] & 255;
/* Hangul syllable composition constants (Unicode chapter 3,
   "Conjoining Jamo Behavior"). */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
490 nfd_nfkd(PyObject
*self
, PyObject
*input
, int k
)
493 Py_UNICODE
*i
, *end
, *o
;
494 /* Longest decomposition in Unicode 3.2: U+FDFA */
495 Py_UNICODE stack
[20];
496 Py_ssize_t space
, isize
;
497 int index
, prefix
, count
, stackptr
;
498 unsigned char prev
, cur
;
501 isize
= PyUnicode_GET_SIZE(input
);
502 /* Overallocate atmost 10 characters. */
503 space
= (isize
> 10 ? 10 : isize
) + isize
;
504 result
= PyUnicode_FromUnicode(NULL
, space
);
507 i
= PyUnicode_AS_UNICODE(input
);
509 o
= PyUnicode_AS_UNICODE(result
);
512 stack
[stackptr
++] = *i
++;
514 Py_UNICODE code
= stack
[--stackptr
];
515 /* Hangul Decomposition adds three characters in
516 a single step, so we need atleast that much room. */
518 Py_ssize_t newsize
= PyString_GET_SIZE(result
) + 10;
520 if (PyUnicode_Resize(&result
, newsize
) == -1)
522 o
= PyUnicode_AS_UNICODE(result
) + newsize
- space
;
524 /* Hangul Decomposition. */
525 if (SBase
<= code
&& code
< (SBase
+SCount
)) {
526 int SIndex
= code
- SBase
;
527 int L
= LBase
+ SIndex
/ NCount
;
528 int V
= VBase
+ (SIndex
% NCount
) / TCount
;
529 int T
= TBase
+ SIndex
% TCount
;
539 /* normalization changes */
541 Py_UCS4 value
= ((PreviousDBVersion
*)self
)->normalization(code
);
543 stack
[stackptr
++] = value
;
548 /* Other decompositions. */
549 get_decomp_record(self
, code
, &index
, &prefix
, &count
);
551 /* Copy character if it is not decomposable, or has a
552 compatibility decomposition, but we do NFD. */
553 if (!count
|| (prefix
&& !k
)) {
558 /* Copy decomposition onto the stack, in reverse
561 code
= decomp_data
[index
+ (--count
)];
562 stack
[stackptr
++] = code
;
567 /* Drop overallocation. Cannot fail. */
568 PyUnicode_Resize(&result
, PyUnicode_GET_SIZE(result
) - space
);
570 /* Sort canonically. */
571 i
= PyUnicode_AS_UNICODE(result
);
572 prev
= _getrecord_ex(*i
)->combining
;
573 end
= i
+ PyUnicode_GET_SIZE(result
);
574 for (i
++; i
< end
; i
++) {
575 cur
= _getrecord_ex(*i
)->combining
;
576 if (prev
== 0 || cur
== 0 || prev
<= cur
) {
580 /* Non-canonical order. Need to switch *i with previous. */
583 Py_UNICODE tmp
= o
[1];
587 if (o
< PyUnicode_AS_UNICODE(result
))
589 prev
= _getrecord_ex(*o
)->combining
;
590 if (prev
== 0 || prev
<= cur
)
593 prev
= _getrecord_ex(*i
)->combining
;
599 find_nfc_index(PyObject
*self
, struct reindex
* nfc
, Py_UNICODE code
)
602 for (index
= 0; nfc
[index
].start
; index
++) {
603 int start
= nfc
[index
].start
;
606 if (code
<= start
+ nfc
[index
].count
) {
607 int delta
= code
- start
;
608 return nfc
[index
].index
+ delta
;
615 nfc_nfkc(PyObject
*self
, PyObject
*input
, int k
)
618 Py_UNICODE
*i
, *i1
, *o
, *end
;
619 int f
,l
,index
,index1
,comb
;
621 Py_UNICODE
*skipped
[20];
624 result
= nfd_nfkd(self
, input
, k
);
628 /* We are going to modify result in-place.
629 If nfd_nfkd is changed to sometimes return the input,
630 this code needs to be reviewed. */
631 assert(result
!= input
);
633 i
= PyUnicode_AS_UNICODE(result
);
634 end
= i
+ PyUnicode_GET_SIZE(result
);
635 o
= PyUnicode_AS_UNICODE(result
);
639 for (index
= 0; index
< cskipped
; index
++) {
640 if (skipped
[index
] == i
) {
641 /* *i character is skipped.
643 skipped
[index
] = skipped
[cskipped
-1];
646 goto again
; /* continue while */
649 /* Hangul Composition. We don't need to check for <LV,T>
650 pairs, since we always have decomposed data. */
651 if (LBase
<= *i
&& *i
< (LBase
+LCount
) &&
653 VBase
<= i
[1] && i
[1] <= (VBase
+VCount
)) {
655 LIndex
= i
[0] - LBase
;
656 VIndex
= i
[1] - VBase
;
657 code
= SBase
+ (LIndex
*VCount
+VIndex
)*TCount
;
660 TBase
<= *i
&& *i
<= (TBase
+TCount
)) {
668 f
= find_nfc_index(self
, nfc_first
, *i
);
673 /* Find next unblocked character. */
677 int comb1
= _getrecord_ex(*i1
)->combining
;
678 if (comb1
&& comb
== comb1
) {
679 /* Character is blocked. */
683 l
= find_nfc_index(self
, nfc_last
, *i1
);
684 /* *i1 cannot be combined with *i. If *i1
685 is a starter, we don't need to look further.
686 Otherwise, record the combining class. */
695 index
= f
*TOTAL_LAST
+ l
;
696 index1
= comp_index
[index
>> COMP_SHIFT
];
697 code
= comp_data
[(index1
<<COMP_SHIFT
)+
698 (index
&((1<<COMP_SHIFT
)-1))];
702 /* Replace the original character. */
704 /* Mark the second character unused. */
705 skipped
[cskipped
++] = i1
;
707 f
= find_nfc_index(self
, nfc_first
, *i
);
714 PyUnicode_Resize(&result
, o
- PyUnicode_AS_UNICODE(result
));
718 PyDoc_STRVAR(unicodedata_normalize__doc__
,
719 "normalize(form, unistr)\n\
721 Return the normal form 'form' for the Unicode string unistr. Valid\n\
722 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
725 unicodedata_normalize(PyObject
*self
, PyObject
*args
)
730 if(!PyArg_ParseTuple(args
, "sO!:normalize",
731 &form
, &PyUnicode_Type
, &input
))
734 if (PyUnicode_GetSize(input
) == 0) {
735 /* Special case empty input strings, since resizing
736 them later would cause internal errors. */
741 if (strcmp(form
, "NFC") == 0)
742 return nfc_nfkc(self
, input
, 0);
743 if (strcmp(form
, "NFKC") == 0)
744 return nfc_nfkc(self
, input
, 1);
745 if (strcmp(form
, "NFD") == 0)
746 return nfd_nfkd(self
, input
, 0);
747 if (strcmp(form
, "NFKD") == 0)
748 return nfd_nfkd(self
, input
, 1);
749 PyErr_SetString(PyExc_ValueError
, "invalid normalization form");
753 /* -------------------------------------------------------------------- */
754 /* unicode character name tables */
756 /* data file generated by Tools/unicode/makeunicodedata.py */
757 #include "unicodename_db.h"
759 /* -------------------------------------------------------------------- */
760 /* database code (cut and pasted from the unidb package) */
/* Case-insensitive multiplicative hash of s[0..len-1], kept within
   24 bits by folding overflow back in; must match the hash used by
   makeunicodedata.py to build the name hash table. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
/* Jamo short names, indexed [jamo][column] with column 0 = leading
   consonant (L), 1 = vowel (V), 2 = trailing consonant (T); used to
   build and parse "HANGUL SYLLABLE ..." names. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N", },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
809 is_unified_ideograph(Py_UCS4 code
)
812 (0x3400 <= code
&& code
<= 0x4DB5) || /* CJK Ideograph Extension A */
813 (0x4E00 <= code
&& code
<= 0x9FBB) || /* CJK Ideograph */
814 (0x20000 <= code
&& code
<= 0x2A6D6));/* CJK Ideograph Extension B */
818 _getucname(PyObject
*self
, Py_UCS4 code
, char* buffer
, int buflen
)
825 if (code
>= 0x110000)
829 const change_record
*old
= get_old_record(self
, code
);
830 if (old
->category_changed
== 0) {
836 if (SBase
<= code
&& code
< SBase
+SCount
) {
837 /* Hangul syllable. */
838 int SIndex
= code
- SBase
;
839 int L
= SIndex
/ NCount
;
840 int V
= (SIndex
% NCount
) / TCount
;
841 int T
= SIndex
% TCount
;
844 /* Worst case: HANGUL SYLLABLE <10chars>. */
846 strcpy(buffer
, "HANGUL SYLLABLE ");
848 strcpy(buffer
, hangul_syllables
[L
][0]);
849 buffer
+= strlen(hangul_syllables
[L
][0]);
850 strcpy(buffer
, hangul_syllables
[V
][1]);
851 buffer
+= strlen(hangul_syllables
[V
][1]);
852 strcpy(buffer
, hangul_syllables
[T
][2]);
853 buffer
+= strlen(hangul_syllables
[T
][2]);
858 if (is_unified_ideograph(code
)) {
860 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
862 sprintf(buffer
, "CJK UNIFIED IDEOGRAPH-%X", code
);
866 /* get offset into phrasebook */
867 offset
= phrasebook_offset1
[(code
>>phrasebook_shift
)];
868 offset
= phrasebook_offset2
[(offset
<<phrasebook_shift
) +
869 (code
&((1<<phrasebook_shift
)-1))];
877 word
= phrasebook
[offset
] - phrasebook_short
;
879 word
= (word
<< 8) + phrasebook
[offset
+1];
882 word
= phrasebook
[offset
++];
885 return 0; /* buffer overflow */
888 /* copy word string from lexicon. the last character in the
889 word has bit 7 set. the last word in a string ends with
891 w
= lexicon
+ lexicon_offset
[word
];
894 return 0; /* buffer overflow */
898 return 0; /* buffer overflow */
899 buffer
[i
++] = *w
& 127;
901 break; /* end of word */
908 _cmpname(PyObject
*self
, int code
, const char* name
, int namelen
)
910 /* check if code corresponds to the given name */
912 char buffer
[NAME_MAXLEN
];
913 if (!_getucname(self
, code
, buffer
, sizeof(buffer
)))
915 for (i
= 0; i
< namelen
; i
++) {
916 if (toupper(Py_CHARMASK(name
[i
])) != buffer
[i
])
919 return buffer
[namelen
] == '\0';
923 find_syllable(const char *str
, int *len
, int *pos
, int count
, int column
)
927 for (i
= 0; i
< count
; i
++) {
928 char *s
= hangul_syllables
[i
][column
];
932 if (strncmp(str
, s
, len1
) == 0) {
943 _getcode(PyObject
* self
, const char* name
, int namelen
, Py_UCS4
* code
)
946 unsigned int mask
= code_size
-1;
947 unsigned int i
, incr
;
949 /* Check for hangul syllables. */
950 if (strncmp(name
, "HANGUL SYLLABLE ", 16) == 0) {
951 int len
, L
= -1, V
= -1, T
= -1;
952 const char *pos
= name
+ 16;
953 find_syllable(pos
, &len
, &L
, LCount
, 0);
955 find_syllable(pos
, &len
, &V
, VCount
, 1);
957 find_syllable(pos
, &len
, &T
, TCount
, 2);
959 if (L
!= -1 && V
!= -1 && T
!= -1 && pos
-name
== namelen
) {
960 *code
= SBase
+ (L
*VCount
+V
)*TCount
+ T
;
963 /* Otherwise, it's an illegal syllable name. */
967 /* Check for unified ideographs. */
968 if (strncmp(name
, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
969 /* Four or five hexdigits must follow. */
973 if (namelen
!= 4 && namelen
!= 5)
977 if (*name
>= '0' && *name
<= '9')
979 else if (*name
>= 'A' && *name
<= 'F')
980 v
+= *name
- 'A' + 10;
985 if (!is_unified_ideograph(v
))
991 /* the following is the same as python's dictionary lookup, with
992 only minor changes. see the makeunicodedata script for more
995 h
= (unsigned int) _gethash(name
, namelen
, code_magic
);
1000 if (_cmpname(self
, v
, name
, namelen
)) {
1004 incr
= (h
^ (h
>> 3)) & mask
;
1008 i
= (i
+ incr
) & mask
;
1012 if (_cmpname(self
, v
, name
, namelen
)) {
1018 incr
= incr
^ code_poly
;
1022 static const _PyUnicode_Name_CAPI hashAPI
=
1024 sizeof(_PyUnicode_Name_CAPI
),
1029 /* -------------------------------------------------------------------- */
1030 /* Python bindings */
1032 PyDoc_STRVAR(unicodedata_name__doc__
,
1033 "name(unichr[, default])\n\
1034 Returns the name assigned to the Unicode character unichr as a\n\
1035 string. If no name is defined, default is returned, or, if not\n\
1036 given, ValueError is raised.");
1039 unicodedata_name(PyObject
* self
, PyObject
* args
)
1041 char name
[NAME_MAXLEN
];
1044 PyObject
* defobj
= NULL
;
1045 if (!PyArg_ParseTuple(args
, "O!|O:name", &PyUnicode_Type
, &v
, &defobj
))
1048 if (PyUnicode_GET_SIZE(v
) != 1) {
1049 PyErr_SetString(PyExc_TypeError
,
1050 "need a single Unicode character as parameter");
1054 if (!_getucname(self
, (Py_UCS4
) *PyUnicode_AS_UNICODE(v
),
1055 name
, sizeof(name
))) {
1056 if (defobj
== NULL
) {
1057 PyErr_SetString(PyExc_ValueError
, "no such name");
1066 return Py_BuildValue("s", name
);
1069 PyDoc_STRVAR(unicodedata_lookup__doc__
,
1072 Look up character by name. If a character with the\n\
1073 given name is found, return the corresponding Unicode\n\
1074 character. If not found, KeyError is raised.");
1077 unicodedata_lookup(PyObject
* self
, PyObject
* args
)
1084 if (!PyArg_ParseTuple(args
, "s#:lookup", &name
, &namelen
))
1087 if (!_getcode(self
, name
, namelen
, &code
)) {
1088 PyErr_Format(PyExc_KeyError
, "undefined character name '%s'",
1093 #ifndef Py_UNICODE_WIDE
1094 if (code
>= 0x10000) {
1095 str
[0] = 0xd800 + ((code
- 0x10000) >> 10);
1096 str
[1] = 0xdc00 + ((code
- 0x10000) & 0x3ff);
1097 return PyUnicode_FromUnicode(str
, 2);
1100 str
[0] = (Py_UNICODE
) code
;
1101 return PyUnicode_FromUnicode(str
, 1);
1104 /* XXX Add doc strings. */
1106 static PyMethodDef unicodedata_functions
[] = {
1107 {"decimal", unicodedata_decimal
, METH_VARARGS
, unicodedata_decimal__doc__
},
1108 {"digit", unicodedata_digit
, METH_VARARGS
, unicodedata_digit__doc__
},
1109 {"numeric", unicodedata_numeric
, METH_VARARGS
, unicodedata_numeric__doc__
},
1110 {"category", unicodedata_category
, METH_VARARGS
,
1111 unicodedata_category__doc__
},
1112 {"bidirectional", unicodedata_bidirectional
, METH_VARARGS
,
1113 unicodedata_bidirectional__doc__
},
1114 {"combining", unicodedata_combining
, METH_VARARGS
,
1115 unicodedata_combining__doc__
},
1116 {"mirrored", unicodedata_mirrored
, METH_VARARGS
,
1117 unicodedata_mirrored__doc__
},
1118 {"east_asian_width", unicodedata_east_asian_width
, METH_VARARGS
,
1119 unicodedata_east_asian_width__doc__
},
1120 {"decomposition", unicodedata_decomposition
, METH_VARARGS
,
1121 unicodedata_decomposition__doc__
},
1122 {"name", unicodedata_name
, METH_VARARGS
, unicodedata_name__doc__
},
1123 {"lookup", unicodedata_lookup
, METH_VARARGS
, unicodedata_lookup__doc__
},
1124 {"normalize", unicodedata_normalize
, METH_VARARGS
,
1125 unicodedata_normalize__doc__
},
1126 {NULL
, NULL
} /* sentinel */
1129 static PyTypeObject UCD_Type
= {
1130 /* The ob_type field must be initialized in the module init function
1131 * to be portable to Windows without using C++. */
1132 PyVarObject_HEAD_INIT(NULL
, 0)
1133 "unicodedata.UCD", /*tp_name*/
1134 sizeof(PreviousDBVersion
), /*tp_basicsize*/
1137 (destructor
)PyObject_Del
, /*tp_dealloc*/
1144 0, /*tp_as_sequence*/
1145 0, /*tp_as_mapping*/
1149 PyObject_GenericGetAttr
,/*tp_getattro*/
1152 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
1156 0, /*tp_richcompare*/
1157 0, /*tp_weaklistoffset*/
1160 unicodedata_functions
, /*tp_methods*/
1161 DB_members
, /*tp_members*/
1167 0, /*tp_dictoffset*/
1175 PyDoc_STRVAR(unicodedata_docstring
,
1176 "This module provides access to the Unicode Character Database which\n\
1177 defines character properties for all Unicode characters. The data in\n\
1178 this database is based on the UnicodeData.txt file version\n\
1179 4.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
1181 The module uses the same names and symbols as defined by the\n\
1182 UnicodeData File Format 4.1.0 (see\n\
1183 http://www.unicode.org/Public/4.1.0/ucd/UCD.html).");
1186 initunicodedata(void)
1190 Py_Type(&UCD_Type
) = &PyType_Type
;
1193 "unicodedata", unicodedata_functions
, unicodedata_docstring
);
1197 PyModule_AddStringConstant(m
, "unidata_version", UNIDATA_VERSION
);
1198 Py_INCREF(&UCD_Type
);
1199 PyModule_AddObject(m
, "UCD", (PyObject
*)&UCD_Type
);
1201 /* Previous versions */
1202 v
= new_previous_version("3.2.0", get_change_3_2_0
, normalization_3_2_0
);
1204 PyModule_AddObject(m
, "ucd_3_2_0", v
);
1207 v
= PyCObject_FromVoidPtr((void *) &hashAPI
, NULL
);
1209 PyModule_AddObject(m
, "ucnhash_CAPI", v
);
1215 indent-tabs-mode: nil