1 /* ------------------------------------------------------------------------
3 unicodedata -- Provides access to the Unicode 3.2 data base.
5 Data was extracted from the Unicode 3.2 UnicodeData.txt file.
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
11 Copyright (c) Corporation for National Research Initiatives.
13 ------------------------------------------------------------------------ */
17 #include "structmember.h"
/* character properties */
/* One record per distinct property combination; generated tables index
   into an array of these. Fields are small indices/flags, not the
   values themselves. */
typedef struct {
    const unsigned char category;         /* index into
                                             _PyUnicode_CategoryNames */
    const unsigned char combining;        /* combining class value 0 - 255 */
    const unsigned char bidirectional;    /* index into
                                             _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;         /* true if mirrored in bidir mode */
    const unsigned char east_asian_width; /* index into
                                             _PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord;
/* Differences between an older Unicode version and the current one;
   a field value of 0xFF means "unchanged" (see the != 0xFF tests in
   the accessor functions below). */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const int numeric_changed;
} change_record;
40 /* data file generated by Tools/unicode/makeunicodedata.py */
41 #include "unicodedata_db.h"
43 static const _PyUnicode_DatabaseRecord
*
44 _getrecord_ex(Py_UCS4 code
)
50 index
= index1
[(code
>>SHIFT
)];
51 index
= index2
[(index
<<SHIFT
)+(code
&((1<<SHIFT
)-1))];
54 return &_PyUnicode_Database_Records
[index
];
57 static const _PyUnicode_DatabaseRecord
*
58 _getrecord(PyUnicodeObject
* v
)
60 return _getrecord_ex(*PyUnicode_AS_UNICODE(v
));
63 /* ------------- Previous-version API ------------------------------------- */
64 typedef struct previous_version
{
67 const change_record
* (*getrecord
)(Py_UCS4
);
68 Py_UCS4 (*normalization
)(Py_UCS4
);
71 #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
73 static PyMemberDef DB_members
[] = {
74 {"unidata_version", T_STRING
, offsetof(PreviousDBVersion
, name
), READONLY
},
78 // forward declaration
79 static PyTypeObject UCD_Type
;
82 new_previous_version(const char*name
, const change_record
* (*getrecord
)(Py_UCS4
),
83 Py_UCS4 (*normalization
)(Py_UCS4
))
85 PreviousDBVersion
*self
;
86 self
= PyObject_New(PreviousDBVersion
, &UCD_Type
);
90 self
->getrecord
= getrecord
;
91 self
->normalization
= normalization
;
92 return (PyObject
*)self
;
95 /* --- Module API --------------------------------------------------------- */
97 PyDoc_STRVAR(unicodedata_decimal__doc__
,
98 "decimal(unichr[, default])\n\
100 Returns the decimal value assigned to the Unicode character unichr\n\
101 as integer. If no such value is defined, default is returned, or, if\n\
102 not given, ValueError is raised.");
105 unicodedata_decimal(PyObject
*self
, PyObject
*args
)
108 PyObject
*defobj
= NULL
;
112 if (!PyArg_ParseTuple(args
, "O!|O:decimal", &PyUnicode_Type
, &v
, &defobj
))
114 if (PyUnicode_GET_SIZE(v
) != 1) {
115 PyErr_SetString(PyExc_TypeError
,
116 "need a single Unicode character as parameter");
121 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
122 if (old
->category_changed
== 0) {
127 else if (old
->decimal_changed
!= 0xFF) {
129 rc
= old
->decimal_changed
;
134 rc
= Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v
));
136 if (defobj
== NULL
) {
137 PyErr_SetString(PyExc_ValueError
,
146 return PyInt_FromLong(rc
);
149 PyDoc_STRVAR(unicodedata_digit__doc__
,
150 "digit(unichr[, default])\n\
152 Returns the digit value assigned to the Unicode character unichr as\n\
153 integer. If no such value is defined, default is returned, or, if\n\
154 not given, ValueError is raised.");
157 unicodedata_digit(PyObject
*self
, PyObject
*args
)
160 PyObject
*defobj
= NULL
;
163 if (!PyArg_ParseTuple(args
, "O!|O:digit", &PyUnicode_Type
, &v
, &defobj
))
165 if (PyUnicode_GET_SIZE(v
) != 1) {
166 PyErr_SetString(PyExc_TypeError
,
167 "need a single Unicode character as parameter");
170 rc
= Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v
));
172 if (defobj
== NULL
) {
173 PyErr_SetString(PyExc_ValueError
, "not a digit");
181 return PyInt_FromLong(rc
);
184 PyDoc_STRVAR(unicodedata_numeric__doc__
,
185 "numeric(unichr[, default])\n\
187 Returns the numeric value assigned to the Unicode character unichr\n\
188 as float. If no such value is defined, default is returned, or, if\n\
189 not given, ValueError is raised.");
192 unicodedata_numeric(PyObject
*self
, PyObject
*args
)
195 PyObject
*defobj
= NULL
;
199 if (!PyArg_ParseTuple(args
, "O!|O:numeric", &PyUnicode_Type
, &v
, &defobj
))
201 if (PyUnicode_GET_SIZE(v
) != 1) {
202 PyErr_SetString(PyExc_TypeError
,
203 "need a single Unicode character as parameter");
208 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
209 if (old
->category_changed
== 0) {
214 else if (old
->decimal_changed
!= 0xFF) {
216 rc
= old
->decimal_changed
;
221 rc
= Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v
));
223 if (defobj
== NULL
) {
224 PyErr_SetString(PyExc_ValueError
, "not a numeric character");
232 return PyFloat_FromDouble(rc
);
235 PyDoc_STRVAR(unicodedata_category__doc__
,
238 Returns the general category assigned to the Unicode character\n\
242 unicodedata_category(PyObject
*self
, PyObject
*args
)
247 if (!PyArg_ParseTuple(args
, "O!:category",
248 &PyUnicode_Type
, &v
))
250 if (PyUnicode_GET_SIZE(v
) != 1) {
251 PyErr_SetString(PyExc_TypeError
,
252 "need a single Unicode character as parameter");
255 index
= (int) _getrecord(v
)->category
;
257 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
258 if (old
->category_changed
!= 0xFF)
259 index
= old
->category_changed
;
261 return PyString_FromString(_PyUnicode_CategoryNames
[index
]);
264 PyDoc_STRVAR(unicodedata_bidirectional__doc__
,
265 "bidirectional(unichr)\n\
267 Returns the bidirectional category assigned to the Unicode character\n\
268 unichr as string. If no such value is defined, an empty string is\n\
272 unicodedata_bidirectional(PyObject
*self
, PyObject
*args
)
277 if (!PyArg_ParseTuple(args
, "O!:bidirectional",
278 &PyUnicode_Type
, &v
))
280 if (PyUnicode_GET_SIZE(v
) != 1) {
281 PyErr_SetString(PyExc_TypeError
,
282 "need a single Unicode character as parameter");
285 index
= (int) _getrecord(v
)->bidirectional
;
287 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
288 if (old
->category_changed
== 0)
289 index
= 0; /* unassigned */
290 else if (old
->bidir_changed
!= 0xFF)
291 index
= old
->bidir_changed
;
293 return PyString_FromString(_PyUnicode_BidirectionalNames
[index
]);
296 PyDoc_STRVAR(unicodedata_combining__doc__
,
297 "combining(unichr)\n\
299 Returns the canonical combining class assigned to the Unicode\n\
300 character unichr as integer. Returns 0 if no combining class is\n\
304 unicodedata_combining(PyObject
*self
, PyObject
*args
)
309 if (!PyArg_ParseTuple(args
, "O!:combining",
310 &PyUnicode_Type
, &v
))
312 if (PyUnicode_GET_SIZE(v
) != 1) {
313 PyErr_SetString(PyExc_TypeError
,
314 "need a single Unicode character as parameter");
317 index
= (int) _getrecord(v
)->combining
;
319 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
320 if (old
->category_changed
== 0)
321 index
= 0; /* unassigned */
323 return PyInt_FromLong(index
);
326 PyDoc_STRVAR(unicodedata_mirrored__doc__
,
329 Returns the mirrored property assigned to the Unicode character\n\
330 unichr as integer. Returns 1 if the character has been identified as\n\
331 a \"mirrored\" character in bidirectional text, 0 otherwise.");
334 unicodedata_mirrored(PyObject
*self
, PyObject
*args
)
339 if (!PyArg_ParseTuple(args
, "O!:mirrored",
340 &PyUnicode_Type
, &v
))
342 if (PyUnicode_GET_SIZE(v
) != 1) {
343 PyErr_SetString(PyExc_TypeError
,
344 "need a single Unicode character as parameter");
347 index
= (int) _getrecord(v
)->mirrored
;
349 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
350 if (old
->category_changed
== 0)
351 index
= 0; /* unassigned */
353 return PyInt_FromLong(index
);
356 PyDoc_STRVAR(unicodedata_east_asian_width__doc__
,
357 "east_asian_width(unichr)\n\
359 Returns the east asian width assigned to the Unicode character\n\
363 unicodedata_east_asian_width(PyObject
*self
, PyObject
*args
)
368 if (!PyArg_ParseTuple(args
, "O!:east_asian_width",
369 &PyUnicode_Type
, &v
))
371 if (PyUnicode_GET_SIZE(v
) != 1) {
372 PyErr_SetString(PyExc_TypeError
,
373 "need a single Unicode character as parameter");
376 index
= (int) _getrecord(v
)->east_asian_width
;
378 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
379 if (old
->category_changed
== 0)
380 index
= 0; /* unassigned */
382 return PyString_FromString(_PyUnicode_EastAsianWidthNames
[index
]);
385 PyDoc_STRVAR(unicodedata_decomposition__doc__
,
386 "decomposition(unichr)\n\
388 Returns the character decomposition mapping assigned to the Unicode\n\
389 character unichr as string. An empty string is returned in case no\n\
390 such mapping is defined.");
393 unicodedata_decomposition(PyObject
*self
, PyObject
*args
)
397 int code
, index
, count
, i
;
399 if (!PyArg_ParseTuple(args
, "O!:decomposition",
400 &PyUnicode_Type
, &v
))
402 if (PyUnicode_GET_SIZE(v
) != 1) {
403 PyErr_SetString(PyExc_TypeError
,
404 "need a single Unicode character as parameter");
408 code
= (int) *PyUnicode_AS_UNICODE(v
);
411 const change_record
*old
= get_old_record(self
, *PyUnicode_AS_UNICODE(v
));
412 if (old
->category_changed
== 0)
413 return PyString_FromString(""); /* unassigned */
416 if (code
< 0 || code
>= 0x110000)
419 index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
420 index
= decomp_index2
[(index
<<DECOMP_SHIFT
)+
421 (code
&((1<<DECOMP_SHIFT
)-1))];
424 /* high byte is number of hex bytes (usually one or two), low byte
425 is prefix code (from*/
426 count
= decomp_data
[index
] >> 8;
428 /* XXX: could allocate the PyString up front instead
429 (strlen(prefix) + 5 * count + 1 bytes) */
432 i
= strlen(decomp_prefix
[decomp_data
[index
] & 255]);
433 memcpy(decomp
, decomp_prefix
[decomp_data
[index
] & 255], i
);
435 while (count
-- > 0) {
438 assert((size_t)i
< sizeof(decomp
));
439 PyOS_snprintf(decomp
+ i
, sizeof(decomp
) - i
, "%04X",
440 decomp_data
[++index
]);
441 i
+= strlen(decomp
+ i
);
446 return PyString_FromString(decomp
);
450 get_decomp_record(PyObject
*self
, Py_UCS4 code
, int *index
, int *prefix
, int *count
)
452 if (code
>= 0x110000) {
454 } else if (self
&& get_old_record(self
, code
)->category_changed
==0) {
455 /* unassigned in old version */
459 *index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
460 *index
= decomp_index2
[(*index
<<DECOMP_SHIFT
)+
461 (code
&((1<<DECOMP_SHIFT
)-1))];
464 /* high byte is number of hex bytes (usually one or two), low byte
465 is prefix code (from*/
466 *count
= decomp_data
[*index
] >> 8;
467 *prefix
= decomp_data
[*index
] & 255;
479 #define NCount (VCount*TCount)
480 #define SCount (LCount*NCount)
483 nfd_nfkd(PyObject
*self
, PyObject
*input
, int k
)
486 Py_UNICODE
*i
, *end
, *o
;
487 /* Longest decomposition in Unicode 3.2: U+FDFA */
488 Py_UNICODE stack
[20];
489 Py_ssize_t space
, isize
;
490 int index
, prefix
, count
, stackptr
;
491 unsigned char prev
, cur
;
494 isize
= PyUnicode_GET_SIZE(input
);
495 /* Overallocate atmost 10 characters. */
496 space
= (isize
> 10 ? 10 : isize
) + isize
;
497 result
= PyUnicode_FromUnicode(NULL
, space
);
500 i
= PyUnicode_AS_UNICODE(input
);
502 o
= PyUnicode_AS_UNICODE(result
);
505 stack
[stackptr
++] = *i
++;
507 Py_UNICODE code
= stack
[--stackptr
];
508 /* Hangul Decomposition adds three characters in
509 a single step, so we need atleast that much room. */
511 Py_ssize_t newsize
= PyString_GET_SIZE(result
) + 10;
513 if (PyUnicode_Resize(&result
, newsize
) == -1)
515 o
= PyUnicode_AS_UNICODE(result
) + newsize
- space
;
517 /* Hangul Decomposition. */
518 if (SBase
<= code
&& code
< (SBase
+SCount
)) {
519 int SIndex
= code
- SBase
;
520 int L
= LBase
+ SIndex
/ NCount
;
521 int V
= VBase
+ (SIndex
% NCount
) / TCount
;
522 int T
= TBase
+ SIndex
% TCount
;
532 /* normalization changes */
534 Py_UCS4 value
= ((PreviousDBVersion
*)self
)->normalization(code
);
536 stack
[stackptr
++] = value
;
541 /* Other decompositions. */
542 get_decomp_record(self
, code
, &index
, &prefix
, &count
);
544 /* Copy character if it is not decomposable, or has a
545 compatibility decomposition, but we do NFD. */
546 if (!count
|| (prefix
&& !k
)) {
551 /* Copy decomposition onto the stack, in reverse
554 code
= decomp_data
[index
+ (--count
)];
555 stack
[stackptr
++] = code
;
560 /* Drop overallocation. Cannot fail. */
561 PyUnicode_Resize(&result
, PyUnicode_GET_SIZE(result
) - space
);
563 /* Sort canonically. */
564 i
= PyUnicode_AS_UNICODE(result
);
565 prev
= _getrecord_ex(*i
)->combining
;
566 end
= i
+ PyUnicode_GET_SIZE(result
);
567 for (i
++; i
< end
; i
++) {
568 cur
= _getrecord_ex(*i
)->combining
;
569 if (prev
== 0 || cur
== 0 || prev
<= cur
) {
573 /* Non-canonical order. Need to switch *i with previous. */
576 Py_UNICODE tmp
= o
[1];
580 if (o
< PyUnicode_AS_UNICODE(result
))
582 prev
= _getrecord_ex(*o
)->combining
;
583 if (prev
== 0 || prev
<= cur
)
586 prev
= _getrecord_ex(*i
)->combining
;
592 find_nfc_index(PyObject
*self
, struct reindex
* nfc
, Py_UNICODE code
)
595 for (index
= 0; nfc
[index
].start
; index
++) {
596 int start
= nfc
[index
].start
;
599 if (code
<= start
+ nfc
[index
].count
) {
600 int delta
= code
- start
;
601 return nfc
[index
].index
+ delta
;
608 nfc_nfkc(PyObject
*self
, PyObject
*input
, int k
)
611 Py_UNICODE
*i
, *i1
, *o
, *end
;
612 int f
,l
,index
,index1
,comb
;
614 Py_UNICODE
*skipped
[20];
617 result
= nfd_nfkd(self
, input
, k
);
621 /* We are going to modify result in-place.
622 If nfd_nfkd is changed to sometimes return the input,
623 this code needs to be reviewed. */
624 assert(result
!= input
);
626 i
= PyUnicode_AS_UNICODE(result
);
627 end
= i
+ PyUnicode_GET_SIZE(result
);
628 o
= PyUnicode_AS_UNICODE(result
);
632 for (index
= 0; index
< cskipped
; index
++) {
633 if (skipped
[index
] == i
) {
634 /* *i character is skipped.
636 skipped
[index
] = skipped
[cskipped
-1];
639 goto again
; /* continue while */
642 /* Hangul Composition. We don't need to check for <LV,T>
643 pairs, since we always have decomposed data. */
644 if (LBase
<= *i
&& *i
< (LBase
+LCount
) &&
646 VBase
<= i
[1] && i
[1] <= (VBase
+VCount
)) {
648 LIndex
= i
[0] - LBase
;
649 VIndex
= i
[1] - VBase
;
650 code
= SBase
+ (LIndex
*VCount
+VIndex
)*TCount
;
653 TBase
<= *i
&& *i
<= (TBase
+TCount
)) {
661 f
= find_nfc_index(self
, nfc_first
, *i
);
666 /* Find next unblocked character. */
670 int comb1
= _getrecord_ex(*i1
)->combining
;
671 if (comb1
&& comb
== comb1
) {
672 /* Character is blocked. */
676 l
= find_nfc_index(self
, nfc_last
, *i1
);
677 /* *i1 cannot be combined with *i. If *i1
678 is a starter, we don't need to look further.
679 Otherwise, record the combining class. */
688 index
= f
*TOTAL_LAST
+ l
;
689 index1
= comp_index
[index
>> COMP_SHIFT
];
690 code
= comp_data
[(index1
<<COMP_SHIFT
)+
691 (index
&((1<<COMP_SHIFT
)-1))];
695 /* Replace the original character. */
697 /* Mark the second character unused. */
698 skipped
[cskipped
++] = i1
;
700 f
= find_nfc_index(self
, nfc_first
, *i
);
707 PyUnicode_Resize(&result
, o
- PyUnicode_AS_UNICODE(result
));
711 PyDoc_STRVAR(unicodedata_normalize__doc__
,
712 "normalize(form, unistr)\n\
714 Return the normal form 'form' for the Unicode string unistr. Valid\n\
715 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
718 unicodedata_normalize(PyObject
*self
, PyObject
*args
)
723 if(!PyArg_ParseTuple(args
, "sO!:normalize",
724 &form
, &PyUnicode_Type
, &input
))
727 if (PyUnicode_GetSize(input
) == 0) {
728 /* Special case empty input strings, since resizing
729 them later would cause internal errors. */
734 if (strcmp(form
, "NFC") == 0)
735 return nfc_nfkc(self
, input
, 0);
736 if (strcmp(form
, "NFKC") == 0)
737 return nfc_nfkc(self
, input
, 1);
738 if (strcmp(form
, "NFD") == 0)
739 return nfd_nfkd(self
, input
, 0);
740 if (strcmp(form
, "NFKD") == 0)
741 return nfd_nfkd(self
, input
, 1);
742 PyErr_SetString(PyExc_ValueError
, "invalid normalization form");
746 /* -------------------------------------------------------------------- */
747 /* unicode character name tables */
749 /* data file generated by Tools/unicode/makeunicodedata.py */
750 #include "unicodename_db.h"
752 /* -------------------------------------------------------------------- */
753 /* database code (cut and pasted from the unidb package) */
/* Case-insensitive multiplicative hash over s[0..len), folded to 24
   bits each step so it never overflows an unsigned long. `scale` is
   the generated code_magic constant. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
770 static char *hangul_syllables
[][3] = {
774 { "D", "YAE", "GS" },
775 { "DD", "EO", "N", },
777 { "M", "YEO", "NH" },
781 { "SS", "WAE", "LM" },
785 { "C", "WEO", "LP" },
802 is_unified_ideograph(Py_UCS4 code
)
805 (0x3400 <= code
&& code
<= 0x4DB5) || /* CJK Ideograph Extension A */
806 (0x4E00 <= code
&& code
<= 0x9FBB) || /* CJK Ideograph */
807 (0x20000 <= code
&& code
<= 0x2A6D6));/* CJK Ideograph Extension B */
811 _getucname(PyObject
*self
, Py_UCS4 code
, char* buffer
, int buflen
)
818 if (code
>= 0x110000)
822 const change_record
*old
= get_old_record(self
, code
);
823 if (old
->category_changed
== 0) {
829 if (SBase
<= code
&& code
< SBase
+SCount
) {
830 /* Hangul syllable. */
831 int SIndex
= code
- SBase
;
832 int L
= SIndex
/ NCount
;
833 int V
= (SIndex
% NCount
) / TCount
;
834 int T
= SIndex
% TCount
;
837 /* Worst case: HANGUL SYLLABLE <10chars>. */
839 strcpy(buffer
, "HANGUL SYLLABLE ");
841 strcpy(buffer
, hangul_syllables
[L
][0]);
842 buffer
+= strlen(hangul_syllables
[L
][0]);
843 strcpy(buffer
, hangul_syllables
[V
][1]);
844 buffer
+= strlen(hangul_syllables
[V
][1]);
845 strcpy(buffer
, hangul_syllables
[T
][2]);
846 buffer
+= strlen(hangul_syllables
[T
][2]);
851 if (is_unified_ideograph(code
)) {
853 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
855 sprintf(buffer
, "CJK UNIFIED IDEOGRAPH-%X", code
);
859 /* get offset into phrasebook */
860 offset
= phrasebook_offset1
[(code
>>phrasebook_shift
)];
861 offset
= phrasebook_offset2
[(offset
<<phrasebook_shift
) +
862 (code
&((1<<phrasebook_shift
)-1))];
870 word
= phrasebook
[offset
] - phrasebook_short
;
872 word
= (word
<< 8) + phrasebook
[offset
+1];
875 word
= phrasebook
[offset
++];
878 return 0; /* buffer overflow */
881 /* copy word string from lexicon. the last character in the
882 word has bit 7 set. the last word in a string ends with
884 w
= lexicon
+ lexicon_offset
[word
];
887 return 0; /* buffer overflow */
891 return 0; /* buffer overflow */
892 buffer
[i
++] = *w
& 127;
894 break; /* end of word */
901 _cmpname(PyObject
*self
, int code
, const char* name
, int namelen
)
903 /* check if code corresponds to the given name */
905 char buffer
[NAME_MAXLEN
];
906 if (!_getucname(self
, code
, buffer
, sizeof(buffer
)))
908 for (i
= 0; i
< namelen
; i
++) {
909 if (toupper(Py_CHARMASK(name
[i
])) != buffer
[i
])
912 return buffer
[namelen
] == '\0';
916 find_syllable(const char *str
, int *len
, int *pos
, int count
, int column
)
920 for (i
= 0; i
< count
; i
++) {
921 char *s
= hangul_syllables
[i
][column
];
925 if (strncmp(str
, s
, len1
) == 0) {
936 _getcode(PyObject
* self
, const char* name
, int namelen
, Py_UCS4
* code
)
939 unsigned int mask
= code_size
-1;
940 unsigned int i
, incr
;
942 /* Check for hangul syllables. */
943 if (strncmp(name
, "HANGUL SYLLABLE ", 16) == 0) {
944 int len
, L
= -1, V
= -1, T
= -1;
945 const char *pos
= name
+ 16;
946 find_syllable(pos
, &len
, &L
, LCount
, 0);
948 find_syllable(pos
, &len
, &V
, VCount
, 1);
950 find_syllable(pos
, &len
, &T
, TCount
, 2);
952 if (L
!= -1 && V
!= -1 && T
!= -1 && pos
-name
== namelen
) {
953 *code
= SBase
+ (L
*VCount
+V
)*TCount
+ T
;
956 /* Otherwise, it's an illegal syllable name. */
960 /* Check for unified ideographs. */
961 if (strncmp(name
, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
962 /* Four or five hexdigits must follow. */
966 if (namelen
!= 4 && namelen
!= 5)
970 if (*name
>= '0' && *name
<= '9')
972 else if (*name
>= 'A' && *name
<= 'F')
973 v
+= *name
- 'A' + 10;
978 if (!is_unified_ideograph(v
))
984 /* the following is the same as python's dictionary lookup, with
985 only minor changes. see the makeunicodedata script for more
988 h
= (unsigned int) _gethash(name
, namelen
, code_magic
);
993 if (_cmpname(self
, v
, name
, namelen
)) {
997 incr
= (h
^ (h
>> 3)) & mask
;
1001 i
= (i
+ incr
) & mask
;
1005 if (_cmpname(self
, v
, name
, namelen
)) {
1011 incr
= incr
^ code_poly
;
1015 static const _PyUnicode_Name_CAPI hashAPI
=
1017 sizeof(_PyUnicode_Name_CAPI
),
1022 /* -------------------------------------------------------------------- */
1023 /* Python bindings */
1025 PyDoc_STRVAR(unicodedata_name__doc__
,
1026 "name(unichr[, default])\n\
1027 Returns the name assigned to the Unicode character unichr as a\n\
1028 string. If no name is defined, default is returned, or, if not\n\
1029 given, ValueError is raised.");
1032 unicodedata_name(PyObject
* self
, PyObject
* args
)
1034 char name
[NAME_MAXLEN
];
1037 PyObject
* defobj
= NULL
;
1038 if (!PyArg_ParseTuple(args
, "O!|O:name", &PyUnicode_Type
, &v
, &defobj
))
1041 if (PyUnicode_GET_SIZE(v
) != 1) {
1042 PyErr_SetString(PyExc_TypeError
,
1043 "need a single Unicode character as parameter");
1047 if (!_getucname(self
, (Py_UCS4
) *PyUnicode_AS_UNICODE(v
),
1048 name
, sizeof(name
))) {
1049 if (defobj
== NULL
) {
1050 PyErr_SetString(PyExc_ValueError
, "no such name");
1059 return Py_BuildValue("s", name
);
1062 PyDoc_STRVAR(unicodedata_lookup__doc__
,
1065 Look up character by name. If a character with the\n\
1066 given name is found, return the corresponding Unicode\n\
1067 character. If not found, KeyError is raised.");
1070 unicodedata_lookup(PyObject
* self
, PyObject
* args
)
1077 if (!PyArg_ParseTuple(args
, "s#:lookup", &name
, &namelen
))
1080 if (!_getcode(self
, name
, namelen
, &code
)) {
1081 char fmt
[] = "undefined character name '%s'";
1082 char *buf
= PyMem_MALLOC(sizeof(fmt
) + namelen
);
1083 sprintf(buf
, fmt
, name
);
1084 PyErr_SetString(PyExc_KeyError
, buf
);
1089 str
[0] = (Py_UNICODE
) code
;
1090 return PyUnicode_FromUnicode(str
, 1);
1093 /* XXX Add doc strings. */
1095 static PyMethodDef unicodedata_functions
[] = {
1096 {"decimal", unicodedata_decimal
, METH_VARARGS
, unicodedata_decimal__doc__
},
1097 {"digit", unicodedata_digit
, METH_VARARGS
, unicodedata_digit__doc__
},
1098 {"numeric", unicodedata_numeric
, METH_VARARGS
, unicodedata_numeric__doc__
},
1099 {"category", unicodedata_category
, METH_VARARGS
,
1100 unicodedata_category__doc__
},
1101 {"bidirectional", unicodedata_bidirectional
, METH_VARARGS
,
1102 unicodedata_bidirectional__doc__
},
1103 {"combining", unicodedata_combining
, METH_VARARGS
,
1104 unicodedata_combining__doc__
},
1105 {"mirrored", unicodedata_mirrored
, METH_VARARGS
,
1106 unicodedata_mirrored__doc__
},
1107 {"east_asian_width", unicodedata_east_asian_width
, METH_VARARGS
,
1108 unicodedata_east_asian_width__doc__
},
1109 {"decomposition", unicodedata_decomposition
, METH_VARARGS
,
1110 unicodedata_decomposition__doc__
},
1111 {"name", unicodedata_name
, METH_VARARGS
, unicodedata_name__doc__
},
1112 {"lookup", unicodedata_lookup
, METH_VARARGS
, unicodedata_lookup__doc__
},
1113 {"normalize", unicodedata_normalize
, METH_VARARGS
,
1114 unicodedata_normalize__doc__
},
1115 {NULL
, NULL
} /* sentinel */
1118 static PyTypeObject UCD_Type
= {
1119 /* The ob_type field must be initialized in the module init function
1120 * to be portable to Windows without using C++. */
1121 PyObject_HEAD_INIT(NULL
)
1123 "unicodedata.UCD", /*tp_name*/
1124 sizeof(PreviousDBVersion
), /*tp_basicsize*/
1127 (destructor
)PyObject_Del
, /*tp_dealloc*/
1134 0, /*tp_as_sequence*/
1135 0, /*tp_as_mapping*/
1139 PyObject_GenericGetAttr
,/*tp_getattro*/
1142 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
1146 0, /*tp_richcompare*/
1147 0, /*tp_weaklistoffset*/
1150 unicodedata_functions
, /*tp_methods*/
1151 DB_members
, /*tp_members*/
1157 0, /*tp_dictoffset*/
1165 PyDoc_STRVAR(unicodedata_docstring
,
1166 "This module provides access to the Unicode Character Database which\n\
1167 defines character properties for all Unicode characters. The data in\n\
1168 this database is based on the UnicodeData.txt file version\n\
1169 3.2.0 which is publically available from ftp://ftp.unicode.org/.\n\
1171 The module uses the same names and symbols as defined by the\n\
1172 UnicodeData File Format 3.2.0 (see\n\
1173 http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html).");
1176 initunicodedata(void)
1180 UCD_Type
.ob_type
= &PyType_Type
;
1183 "unicodedata", unicodedata_functions
, unicodedata_docstring
);
1187 PyModule_AddStringConstant(m
, "unidata_version", UNIDATA_VERSION
);
1188 Py_INCREF(&UCD_Type
);
1189 PyModule_AddObject(m
, "UCD", (PyObject
*)&UCD_Type
);
1191 /* Previous versions */
1192 v
= new_previous_version("3.2.0", get_change_3_2_0
, normalization_3_2_0
);
1194 PyModule_AddObject(m
, "ucd_3_2_0", v
);
1197 v
= PyCObject_FromVoidPtr((void *) &hashAPI
, NULL
);
1199 PyModule_AddObject(m
, "ucnhash_CAPI", v
);
1205 indent-tabs-mode: nil