1 /* ------------------------------------------------------------------------
3 unicodedata -- Provides access to the Unicode 5.1 data base.
5 Data was extracted from the Unicode 5.1 UnicodeData.txt file.
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
11 Copyright (c) Corporation for National Research Initiatives.
13 ------------------------------------------------------------------------ */
17 #include "structmember.h"
/* Per-code-point property record, indexed via _getrecord_ex() below.
   NOTE(review): the opening "typedef struct {" line is missing from this
   extraction — confirm against the original file before compiling. */
19 /* character properties */
22 const unsigned char category
; /* index into
23 _PyUnicode_CategoryNames */
24 const unsigned char combining
; /* combining class value 0 - 255 */
25 const unsigned char bidirectional
; /* index into
26 _PyUnicode_BidirectionalNames */
27 const unsigned char mirrored
; /* true if mirrored in bidir mode */
28 const unsigned char east_asian_width
; /* index into
29 _PyUnicode_EastAsianWidth */
30 const unsigned char normalization_quick_check
; /* see is_normalized() */
31 } _PyUnicode_DatabaseRecord
;
/* Delta record describing how a code point's properties differ in an older
   Unicode version (see new_previous_version / get_old_record). A 0xFF field
   means "unchanged"; category_changed == 0 means "unassigned in old version".
   NOTE(review): the closing "} change_record;" line appears to be missing
   from this extraction — confirm against the original file. */
33 typedef struct change_record
{
34 /* sequence of fields should be the same as in merge_old_version */
35 const unsigned char bidir_changed
;
36 const unsigned char category_changed
;
37 const unsigned char decimal_changed
;
38 const unsigned char mirrored_changed
;
39 const int numeric_changed
;
42 /* data file generated by Tools/unicode/makeunicodedata.py */
43 #include "unicodedata_db.h"
/* Look up the property record for code point `code` via the two-level
   index tables (index1/index2) generated by makeunicodedata.py.
   NOTE(review): interior lines (braces, the `index` declaration, and the
   out-of-range guard) are missing from this extraction. */
45 static const _PyUnicode_DatabaseRecord
*
46 _getrecord_ex(Py_UCS4 code
)
52 index
= index1
[(code
>>SHIFT
)];
53 index
= index2
[(index
<<SHIFT
)+(code
&((1<<SHIFT
)-1))];
56 return &_PyUnicode_Database_Records
[index
];
/* Object state for a UCD snapshot of an older Unicode version; holds
   function pointers used by get_old_record() and nfd_nfkd().
   NOTE(review): the PyObject_HEAD/name fields and the closing typedef line
   (presumably "} PreviousDBVersion;") are missing from this extraction. */
59 /* ------------- Previous-version API ------------------------------------- */
60 typedef struct previous_version
{
63 const change_record
* (*getrecord
)(Py_UCS4
);
64 Py_UCS4 (*normalization
)(Py_UCS4
);
/* Helpers for the previous-version API: get_old_record() dispatches through
   the PreviousDBVersion function pointer; DB_members exposes the read-only
   `unidata_version` attribute; UCD_Check() tests for the UCD type. */
67 #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
69 static PyMemberDef DB_members
[] = {
70 {"unidata_version", T_STRING
, offsetof(PreviousDBVersion
, name
), READONLY
},
/* NOTE(review): the {NULL} sentinel entry of DB_members is missing from
   this extraction — confirm against the original file. */
74 /* forward declaration */
75 static PyTypeObject UCD_Type
;
76 #define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
/* Allocate a PreviousDBVersion object wrapping the given old-version
   accessors. Returns a new reference, or presumably NULL on allocation
   failure (the NULL check after PyObject_New is missing from this
   extraction, as is the assignment of `name`). */
79 new_previous_version(const char*name
, const change_record
* (*getrecord
)(Py_UCS4
),
80 Py_UCS4 (*normalization
)(Py_UCS4
))
82 PreviousDBVersion
*self
;
83 self
= PyObject_New(PreviousDBVersion
, &UCD_Type
);
87 self
->getrecord
= getrecord
;
88 self
->normalization
= normalization
;
89 return (PyObject
*)self
;
/* Extract a single code point from a 1-character unicode object. On narrow
   (UTF-16) builds, also accepts a 2-element surrogate pair and combines it.
   Sets TypeError and presumably returns (Py_UCS4)-1 otherwise — the error
   return line is missing from this extraction. */
93 static Py_UCS4
getuchar(PyUnicodeObject
*obj
)
95 Py_UNICODE
*v
= PyUnicode_AS_UNICODE(obj
);
97 if (PyUnicode_GET_SIZE(obj
) == 1)
99 #ifndef Py_UNICODE_WIDE
/* Narrow build: decode a UTF-16 surrogate pair (high D800-DBFF, low
   DC00-DFFF) into the corresponding supplementary-plane code point. */
100 else if ((PyUnicode_GET_SIZE(obj
) == 2) &&
101 (0xD800 <= v
[0] && v
[0] <= 0xDBFF) &&
102 (0xDC00 <= v
[1] && v
[1] <= 0xDFFF))
103 return (((v
[0] & 0x3FF)<<10) | (v
[1] & 0x3FF)) + 0x10000;
105 PyErr_SetString(PyExc_TypeError
,
106 "need a single Unicode character as parameter");
110 /* --- Module API --------------------------------------------------------- */
112 PyDoc_STRVAR(unicodedata_decimal__doc__
,
113 "decimal(unichr[, default])\n\
115 Returns the decimal value assigned to the Unicode character unichr\n\
116 as integer. If no such value is defined, default is returned, or, if\n\
117 not given, ValueError is raised.");
/* decimal(unichr[, default]): look up the decimal digit value of a single
   character. When called on an old-version UCD object, consult the change
   record first (unassigned -> error/default; decimal_changed != 0xFF ->
   overridden value). Many interior lines (declarations of v/c/rc/have_old,
   error returns, the default-return path) are missing from this extraction. */
120 unicodedata_decimal(PyObject
*self
, PyObject
*args
)
123 PyObject
*defobj
= NULL
;
128 if (!PyArg_ParseTuple(args
, "O!|O:decimal", &PyUnicode_Type
, &v
, &defobj
))
131 if (c
== (Py_UCS4
)-1)
134 if (self
&& UCD_Check(self
)) {
135 const change_record
*old
= get_old_record(self
, c
);
136 if (old
->category_changed
== 0) {
141 else if (old
->decimal_changed
!= 0xFF) {
143 rc
= old
->decimal_changed
;
148 rc
= Py_UNICODE_TODECIMAL(c
);
150 if (defobj
== NULL
) {
151 PyErr_SetString(PyExc_ValueError
,
160 return PyLong_FromLong(rc
);
163 PyDoc_STRVAR(unicodedata_digit__doc__
,
164 "digit(unichr[, default])\n\
166 Returns the digit value assigned to the Unicode character unichr as\n\
167 integer. If no such value is defined, default is returned, or, if\n\
168 not given, ValueError is raised.");
/* digit(unichr[, default]): like decimal() but uses Py_UNICODE_TODIGIT and
   has no old-version override path. Interior lines (v/c/rc declarations,
   error returns, default-return path) are missing from this extraction. */
171 unicodedata_digit(PyObject
*self
, PyObject
*args
)
174 PyObject
*defobj
= NULL
;
178 if (!PyArg_ParseTuple(args
, "O!|O:digit", &PyUnicode_Type
, &v
, &defobj
))
181 if (c
== (Py_UCS4
)-1)
183 rc
= Py_UNICODE_TODIGIT(c
);
185 if (defobj
== NULL
) {
186 PyErr_SetString(PyExc_ValueError
, "not a digit");
194 return PyLong_FromLong(rc
);
197 PyDoc_STRVAR(unicodedata_numeric__doc__
,
198 "numeric(unichr[, default])\n\
200 Returns the numeric value assigned to the Unicode character unichr\n\
201 as float. If no such value is defined, default is returned, or, if\n\
202 not given, ValueError is raised.");
/* numeric(unichr[, default]): returns the numeric value as a double.
   The old-version branch reads `decimal_changed` rather than a
   numeric-specific field — that matches the visible change_record layout
   here, but NOTE(review): confirm against merge_old_version; missing lines
   may also consult `numeric_changed`. */
205 unicodedata_numeric(PyObject
*self
, PyObject
*args
)
208 PyObject
*defobj
= NULL
;
213 if (!PyArg_ParseTuple(args
, "O!|O:numeric", &PyUnicode_Type
, &v
, &defobj
))
216 if (c
== (Py_UCS4
)-1)
219 if (self
&& UCD_Check(self
)) {
220 const change_record
*old
= get_old_record(self
, c
);
221 if (old
->category_changed
== 0) {
226 else if (old
->decimal_changed
!= 0xFF) {
228 rc
= old
->decimal_changed
;
233 rc
= Py_UNICODE_TONUMERIC(c
);
235 if (defobj
== NULL
) {
236 PyErr_SetString(PyExc_ValueError
, "not a numeric character");
244 return PyFloat_FromDouble(rc
);
/* category(unichr): return the general-category name (e.g. "Lu") for a
   single character, honoring old-version overrides when called on a
   previous-version UCD object. Interior lines (docstring tail, v/c/index
   declarations, error returns) are missing from this extraction. */
247 PyDoc_STRVAR(unicodedata_category__doc__
,
250 Returns the general category assigned to the Unicode character\n\
254 unicodedata_category(PyObject
*self
, PyObject
*args
)
260 if (!PyArg_ParseTuple(args
, "O!:category",
261 &PyUnicode_Type
, &v
))
264 if (c
== (Py_UCS4
)-1)
266 index
= (int) _getrecord_ex(c
)->category
;
267 if (self
&& UCD_Check(self
)) {
268 const change_record
*old
= get_old_record(self
, c
);
269 if (old
->category_changed
!= 0xFF)
270 index
= old
->category_changed
;
272 return PyUnicode_FromString(_PyUnicode_CategoryNames
[index
]);
275 PyDoc_STRVAR(unicodedata_bidirectional__doc__
,
276 "bidirectional(unichr)\n\
278 Returns the bidirectional category assigned to the Unicode character\n\
279 unichr as string. If no such value is defined, an empty string is\n\
/* bidirectional(unichr): index 0 maps to the empty string for
   unassigned/undefined code points. Interior lines (docstring tail,
   declarations, error returns) are missing from this extraction. */
283 unicodedata_bidirectional(PyObject
*self
, PyObject
*args
)
289 if (!PyArg_ParseTuple(args
, "O!:bidirectional",
290 &PyUnicode_Type
, &v
))
293 if (c
== (Py_UCS4
)-1)
295 index
= (int) _getrecord_ex(c
)->bidirectional
;
296 if (self
&& UCD_Check(self
)) {
297 const change_record
*old
= get_old_record(self
, c
);
298 if (old
->category_changed
== 0)
299 index
= 0; /* unassigned */
300 else if (old
->bidir_changed
!= 0xFF)
301 index
= old
->bidir_changed
;
303 return PyUnicode_FromString(_PyUnicode_BidirectionalNames
[index
]);
306 PyDoc_STRVAR(unicodedata_combining__doc__
,
307 "combining(unichr)\n\
309 Returns the canonical combining class assigned to the Unicode\n\
310 character unichr as integer. Returns 0 if no combining class is\n\
/* combining(unichr): return the canonical combining class (0-255) as an
   int; 0 for code points unassigned in an old version. Interior lines
   are missing from this extraction. */
314 unicodedata_combining(PyObject
*self
, PyObject
*args
)
320 if (!PyArg_ParseTuple(args
, "O!:combining",
321 &PyUnicode_Type
, &v
))
324 if (c
== (Py_UCS4
)-1)
326 index
= (int) _getrecord_ex(c
)->combining
;
327 if (self
&& UCD_Check(self
)) {
328 const change_record
*old
= get_old_record(self
, c
);
329 if (old
->category_changed
== 0)
330 index
= 0; /* unassigned */
332 return PyLong_FromLong(index
);
/* mirrored(unichr): 1 if the character is mirrored in bidirectional text,
   else 0, with old-version overrides via mirrored_changed. Interior lines
   (docstring header, declarations, error returns) are missing from this
   extraction. */
335 PyDoc_STRVAR(unicodedata_mirrored__doc__
,
338 Returns the mirrored property assigned to the Unicode character\n\
339 unichr as integer. Returns 1 if the character has been identified as\n\
340 a \"mirrored\" character in bidirectional text, 0 otherwise.");
343 unicodedata_mirrored(PyObject
*self
, PyObject
*args
)
349 if (!PyArg_ParseTuple(args
, "O!:mirrored",
350 &PyUnicode_Type
, &v
))
353 if (c
== (Py_UCS4
)-1)
355 index
= (int) _getrecord_ex(c
)->mirrored
;
356 if (self
&& UCD_Check(self
)) {
357 const change_record
*old
= get_old_record(self
, c
);
358 if (old
->category_changed
== 0)
359 index
= 0; /* unassigned */
360 else if (old
->mirrored_changed
!= 0xFF)
361 index
= old
->mirrored_changed
;
363 return PyLong_FromLong(index
);
366 PyDoc_STRVAR(unicodedata_east_asian_width__doc__
,
367 "east_asian_width(unichr)\n\
369 Returns the east asian width assigned to the Unicode character\n\
/* east_asian_width(unichr): return the East Asian width category name
   (e.g. "W", "Na"); index 0 for code points unassigned in an old version.
   Interior lines are missing from this extraction. */
373 unicodedata_east_asian_width(PyObject
*self
, PyObject
*args
)
379 if (!PyArg_ParseTuple(args
, "O!:east_asian_width",
380 &PyUnicode_Type
, &v
))
383 if (c
== (Py_UCS4
)-1)
385 index
= (int) _getrecord_ex(c
)->east_asian_width
;
386 if (self
&& UCD_Check(self
)) {
387 const change_record
*old
= get_old_record(self
, c
);
388 if (old
->category_changed
== 0)
389 index
= 0; /* unassigned */
391 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames
[index
]);
394 PyDoc_STRVAR(unicodedata_decomposition__doc__
,
395 "decomposition(unichr)\n\
397 Returns the character decomposition mapping assigned to the Unicode\n\
398 character unichr as string. An empty string is returned in case no\n\
399 such mapping is defined.");
/* decomposition(unichr): format the raw decomposition mapping as a string
   like "<compat> 0020 0308". Uses the two-level decomp_index1/decomp_index2
   tables; each decomp_data entry packs hex-digit count (high byte) and
   prefix-tag index (low byte). Interior lines (the `decomp` buffer
   declaration, error returns, the space-separator writes inside the loop)
   are missing from this extraction. */
402 unicodedata_decomposition(PyObject
*self
, PyObject
*args
)
406 int code
, index
, count
, i
;
407 unsigned int prefix_index
;
410 if (!PyArg_ParseTuple(args
, "O!:decomposition",
411 &PyUnicode_Type
, &v
))
414 if (c
== (Py_UCS4
)-1)
419 if (self
&& UCD_Check(self
)) {
420 const change_record
*old
= get_old_record(self
, c
);
421 if (old
->category_changed
== 0)
422 return PyUnicode_FromString(""); /* unassigned */
425 if (code
< 0 || code
>= 0x110000)
428 index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
429 index
= decomp_index2
[(index
<<DECOMP_SHIFT
)+
430 (code
&((1<<DECOMP_SHIFT
)-1))];
433 /* high byte is number of hex bytes (usually one or two), low byte
434 is prefix code (from*/
435 count
= decomp_data
[index
] >> 8;
437 /* XXX: could allocate the PyString up front instead
438 (strlen(prefix) + 5 * count + 1 bytes) */
440 /* Based on how index is calculated above and decomp_data is generated
441 from Tools/unicode/makeunicodedata.py, it should not be possible
442 to overflow decomp_prefix. */
443 prefix_index
= decomp_data
[index
] & 255;
444 assert(prefix_index
< (sizeof(decomp_prefix
)/sizeof(*decomp_prefix
)));
447 i
= strlen(decomp_prefix
[prefix_index
]);
448 memcpy(decomp
, decomp_prefix
[prefix_index
], i
);
450 while (count
-- > 0) {
453 assert((size_t)i
< sizeof(decomp
));
454 PyOS_snprintf(decomp
+ i
, sizeof(decomp
) - i
, "%04X",
455 decomp_data
[++index
]);
456 i
+= strlen(decomp
+ i
);
461 return PyUnicode_FromString(decomp
);
/* Fetch the raw decomposition record for `code` into *index/*prefix/*count.
   Out-of-range code points and code points unassigned in an old UCD version
   presumably yield count = 0 (those assignment lines are missing from this
   extraction). Shared by nfd_nfkd(). */
465 get_decomp_record(PyObject
*self
, Py_UCS4 code
, int *index
, int *prefix
, int *count
)
467 if (code
>= 0x110000) {
469 } else if (self
&& UCD_Check(self
) &&
470 get_old_record(self
, code
)->category_changed
==0) {
471 /* unassigned in old version */
475 *index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
476 *index
= decomp_index2
[(*index
<<DECOMP_SHIFT
)+
477 (code
&((1<<DECOMP_SHIFT
)-1))];
480 /* high byte is number of hex bytes (usually one or two), low byte
481 is prefix code (from*/
482 *count
= decomp_data
[*index
] >> 8;
483 *prefix
= decomp_data
[*index
] & 255;
/* Hangul block arithmetic (Unicode ch. 3.12): NCount/SCount derive from the
   LCount/VCount/TCount constants defined elsewhere in the file. */
495 #define NCount (VCount*TCount)
496 #define SCount (LCount*NCount)
/* Decompose `input` to NFD (k == 0) or NFKD (k != 0), returning a new
   unicode object. Works with an explicit stack of pending code points,
   growing the result buffer in 10-char steps (enough for one algorithmic
   Hangul decomposition), then performs the canonical reordering pass.
   Many interior lines (loop headers, buffer writes, the recursion into the
   stack, error returns) are missing from this extraction. */
499 nfd_nfkd(PyObject
*self
, PyObject
*input
, int k
)
502 Py_UNICODE
*i
, *end
, *o
;
503 /* Longest decomposition in Unicode 3.2: U+FDFA */
504 Py_UNICODE stack
[20];
505 Py_ssize_t space
, isize
;
506 int index
, prefix
, count
, stackptr
;
507 unsigned char prev
, cur
;
510 isize
= PyUnicode_GET_SIZE(input
);
511 /* Overallocate atmost 10 characters. */
512 space
= (isize
> 10 ? 10 : isize
) + isize
;
513 result
= PyUnicode_FromUnicode(NULL
, space
);
516 i
= PyUnicode_AS_UNICODE(input
);
518 o
= PyUnicode_AS_UNICODE(result
);
521 stack
[stackptr
++] = *i
++;
523 Py_UNICODE code
= stack
[--stackptr
];
524 /* Hangul Decomposition adds three characters in
525 a single step, so we need atleast that much room. */
527 Py_ssize_t newsize
= PyUnicode_GET_SIZE(result
) + 10;
529 if (PyUnicode_Resize(&result
, newsize
) == -1)
/* Re-derive the output cursor after a resize may have moved the buffer. */
531 o
= PyUnicode_AS_UNICODE(result
) + newsize
- space
;
533 /* Hangul Decomposition. */
534 if (SBase
<= code
&& code
< (SBase
+SCount
)) {
535 int SIndex
= code
- SBase
;
536 int L
= LBase
+ SIndex
/ NCount
;
537 int V
= VBase
+ (SIndex
% NCount
) / TCount
;
538 int T
= TBase
+ SIndex
% TCount
;
548 /* normalization changes */
549 if (self
&& UCD_Check(self
)) {
550 Py_UCS4 value
= ((PreviousDBVersion
*)self
)->normalization(code
);
552 stack
[stackptr
++] = value
;
557 /* Other decompositions. */
558 get_decomp_record(self
, code
, &index
, &prefix
, &count
);
560 /* Copy character if it is not decomposable, or has a
561 compatibility decomposition, but we do NFD. */
562 if (!count
|| (prefix
&& !k
)) {
567 /* Copy decomposition onto the stack, in reverse
570 code
= decomp_data
[index
+ (--count
)];
571 stack
[stackptr
++] = code
;
576 /* Drop overallocation. Cannot fail. */
577 PyUnicode_Resize(&result
, PyUnicode_GET_SIZE(result
) - space
);
579 /* Sort canonically. */
580 i
= PyUnicode_AS_UNICODE(result
);
581 prev
= _getrecord_ex(*i
)->combining
;
582 end
= i
+ PyUnicode_GET_SIZE(result
);
583 for (i
++; i
< end
; i
++) {
584 cur
= _getrecord_ex(*i
)->combining
;
585 if (prev
== 0 || cur
== 0 || prev
<= cur
) {
589 /* Non-canonical order. Need to switch *i with previous. */
592 Py_UNICODE tmp
= o
[1];
596 if (o
< PyUnicode_AS_UNICODE(result
))
598 prev
= _getrecord_ex(*o
)->combining
;
599 if (prev
== 0 || prev
<= cur
)
602 prev
= _getrecord_ex(*i
)->combining
;
/* Linear search of an nfc_first/nfc_last reindex table for `code`.
   Returns the composed-pair table index for the code point, or presumably
   -1 when not found (the fallthrough return and the start-comparison guard
   are missing from this extraction). */
608 find_nfc_index(PyObject
*self
, struct reindex
* nfc
, Py_UNICODE code
)
611 for (index
= 0; nfc
[index
].start
; index
++) {
612 int start
= nfc
[index
].start
;
615 if (code
<= start
+ nfc
[index
].count
) {
616 int delta
= code
- start
;
617 return nfc
[index
].index
+ delta
;
/* Compose `input` to NFC (k == 0) or NFKC (k != 0): first decompose via
   nfd_nfkd(), then compose in place — algorithmic Hangul LV(T) composition
   first, then pairwise composition through the comp_index/comp_data tables,
   tracking consumed combining marks in the `skipped` array. Many interior
   lines (the `again:` label, loop headers, copy-through writes, LIndex/
   VIndex/code declarations) are missing from this extraction. */
624 nfc_nfkc(PyObject
*self
, PyObject
*input
, int k
)
627 Py_UNICODE
*i
, *i1
, *o
, *end
;
628 int f
,l
,index
,index1
,comb
;
630 Py_UNICODE
*skipped
[20];
633 result
= nfd_nfkd(self
, input
, k
);
637 /* We are going to modify result in-place.
638 If nfd_nfkd is changed to sometimes return the input,
639 this code needs to be reviewed. */
640 assert(result
!= input
);
642 i
= PyUnicode_AS_UNICODE(result
);
643 end
= i
+ PyUnicode_GET_SIZE(result
);
644 o
= PyUnicode_AS_UNICODE(result
);
648 for (index
= 0; index
< cskipped
; index
++) {
649 if (skipped
[index
] == i
) {
650 /* *i character is skipped.
652 skipped
[index
] = skipped
[cskipped
-1];
655 goto again
; /* continue while */
658 /* Hangul Composition. We don't need to check for <LV,T>
659 pairs, since we always have decomposed data. */
660 if (LBase
<= *i
&& *i
< (LBase
+LCount
) &&
662 VBase
<= i
[1] && i
[1] <= (VBase
+VCount
)) {
664 LIndex
= i
[0] - LBase
;
665 VIndex
= i
[1] - VBase
;
666 code
= SBase
+ (LIndex
*VCount
+VIndex
)*TCount
;
669 TBase
<= *i
&& *i
<= (TBase
+TCount
)) {
677 f
= find_nfc_index(self
, nfc_first
, *i
);
682 /* Find next unblocked character. */
686 int comb1
= _getrecord_ex(*i1
)->combining
;
687 if (comb1
&& comb
== comb1
) {
688 /* Character is blocked. */
692 l
= find_nfc_index(self
, nfc_last
, *i1
);
693 /* *i1 cannot be combined with *i. If *i1
694 is a starter, we don't need to look further.
695 Otherwise, record the combining class. */
704 index
= f
*TOTAL_LAST
+ l
;
705 index1
= comp_index
[index
>> COMP_SHIFT
];
706 code
= comp_data
[(index1
<<COMP_SHIFT
)+
707 (index
&((1<<COMP_SHIFT
)-1))];
711 /* Replace the original character. */
713 /* Mark the second character unused. */
714 skipped
[cskipped
++] = i1
;
716 f
= find_nfc_index(self
, nfc_first
, *i
);
/* Trim the result down to the number of characters actually written. */
723 PyUnicode_Resize(&result
, o
- PyUnicode_AS_UNICODE(result
));
727 /* Return 1 if the input is certainly normalized, 0 if it might not be. */
/* Quick-check pass per UAX #15: consult each record's packed quick-check
   bits for the requested form (nfc/k select the 2-bit field) and verify
   canonical ordering of combining classes. For old-version UCD objects the
   quickcheck is disabled — the line that forces the "might not be
   normalized" result is missing from this extraction. */
729 is_normalized(PyObject
*self
, PyObject
*input
, int nfc
, int k
)
732 unsigned char prev_combining
= 0, quickcheck_mask
;
734 /* An older version of the database is requested, quickchecks must be
736 if (self
&& UCD_Check(self
))
739 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
740 as described in http://unicode.org/reports/tr15/#Annex8. */
741 quickcheck_mask
= 3 << ((nfc
? 4 : 0) + (k
? 2 : 0));
743 i
= PyUnicode_AS_UNICODE(input
);
744 end
= i
+ PyUnicode_GET_SIZE(input
);
746 const _PyUnicode_DatabaseRecord
*record
= _getrecord_ex(*i
++);
747 unsigned char combining
= record
->combining
;
748 unsigned char quickcheck
= record
->normalization_quick_check
;
750 if (quickcheck
& quickcheck_mask
)
751 return 0; /* this string might need normalization */
752 if (combining
&& prev_combining
> combining
)
753 return 0; /* non-canonical sort order, not normalized */
754 prev_combining
= combining
;
756 return 1; /* certainly normalized */
759 PyDoc_STRVAR(unicodedata_normalize__doc__
,
760 "normalize(form, unistr)\n\
762 Return the normal form 'form' for the Unicode string unistr. Valid\n\
763 values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
/* normalize(form, unistr): dispatch to nfc_nfkc / nfd_nfkd after a
   quick-check fast path that returns the (incref'd) input unchanged when
   it is already normalized. The empty-string special case avoids later
   zero-size resizes. Interior lines (variable declarations, the incref+
   return bodies of each branch, error returns) are missing from this
   extraction. */
766 unicodedata_normalize(PyObject
*self
, PyObject
*args
)
771 if(!PyArg_ParseTuple(args
, "sO!:normalize",
772 &form
, &PyUnicode_Type
, &input
))
775 if (PyUnicode_GetSize(input
) == 0) {
776 /* Special case empty input strings, since resizing
777 them later would cause internal errors. */
782 if (strcmp(form
, "NFC") == 0) {
783 if (is_normalized(self
, input
, 1, 0)) {
787 return nfc_nfkc(self
, input
, 0);
789 if (strcmp(form
, "NFKC") == 0) {
790 if (is_normalized(self
, input
, 1, 1)) {
794 return nfc_nfkc(self
, input
, 1);
796 if (strcmp(form
, "NFD") == 0) {
797 if (is_normalized(self
, input
, 0, 0)) {
801 return nfd_nfkd(self
, input
, 0);
803 if (strcmp(form
, "NFKD") == 0) {
804 if (is_normalized(self
, input
, 0, 1)) {
808 return nfd_nfkd(self
, input
, 1);
810 PyErr_SetString(PyExc_ValueError
, "invalid normalization form");
814 /* -------------------------------------------------------------------- */
815 /* unicode character name tables */
817 /* data file generated by Tools/unicode/makeunicodedata.py */
818 #include "unicodename_db.h"
820 /* -------------------------------------------------------------------- */
821 /* database code (cut and pasted from the unidb package) */
/* Case-insensitive multiplicative hash over `s[0..len)`, folded to 24 bits;
   used for the character-name perfect-hash lookup in _getcode(). Interior
   lines (h/i/ix declarations, the overflow fold inside the loop, the final
   return) are missing from this extraction. */
824 _gethash(const char *s
, int len
, int scale
)
829 for (i
= 0; i
< len
; i
++) {
830 h
= (h
* scale
) + (unsigned char) toupper(Py_CHARMASK(s
[i
]));
833 h
= (h
^ ((ix
>>24) & 0xff)) & 0x00ffffff;
/* Jamo name fragments: column 0 = leading consonants (L), column 1 =
   vowels (V), column 2 = trailing consonants (T). Used to build and parse
   "HANGUL SYLLABLE ..." names. NOTE(review): most rows of this table are
   missing from this extraction — it must have LCount/VCount/TCount entries
   per column in the original file. */
838 static char *hangul_syllables
[][3] = {
842 { "D", "YAE", "GS" },
843 { "DD", "EO", "N", },
845 { "M", "YEO", "NH" },
849 { "SS", "WAE", "LM" },
853 { "C", "WEO", "LP" },
/* True if `code` is a CJK unified ideograph whose name is algorithmic
   ("CJK UNIFIED IDEOGRAPH-XXXX"). NOTE(review): the enclosing braces and
   the leading `return (` line are missing from this extraction; the ranges
   shown match the Unicode 5.1 blocks named in the trailing comments. */
870 is_unified_ideograph(Py_UCS4 code
)
873 (0x3400 <= code
&& code
<= 0x4DB5) || /* CJK Ideograph Extension A */
874 (0x4E00 <= code
&& code
<= 0x9FBB) || /* CJK Ideograph */
875 (0x20000 <= code
&& code
<= 0x2A6D6));/* CJK Ideograph Extension B */
/* Write the name of code point `code` into buffer[buflen], returning
   nonzero on success, 0 on failure/overflow. Handles three cases:
   algorithmic Hangul syllable names, algorithmic CJK unified ideograph
   names, and phrasebook lookup (two-level offset tables into a word-coded
   lexicon where the final byte of each word has bit 7 set). Many interior
   lines (offset/word/w/i declarations, buflen checks, loop headers) are
   missing from this extraction. */
879 _getucname(PyObject
*self
, Py_UCS4 code
, char* buffer
, int buflen
)
886 if (code
>= 0x110000)
889 if (self
&& UCD_Check(self
)) {
890 const change_record
*old
= get_old_record(self
, code
);
891 if (old
->category_changed
== 0) {
897 if (SBase
<= code
&& code
< SBase
+SCount
) {
898 /* Hangul syllable. */
899 int SIndex
= code
- SBase
;
900 int L
= SIndex
/ NCount
;
901 int V
= (SIndex
% NCount
) / TCount
;
902 int T
= SIndex
% TCount
;
905 /* Worst case: HANGUL SYLLABLE <10chars>. */
907 strcpy(buffer
, "HANGUL SYLLABLE ");
909 strcpy(buffer
, hangul_syllables
[L
][0]);
910 buffer
+= strlen(hangul_syllables
[L
][0]);
911 strcpy(buffer
, hangul_syllables
[V
][1]);
912 buffer
+= strlen(hangul_syllables
[V
][1]);
913 strcpy(buffer
, hangul_syllables
[T
][2]);
914 buffer
+= strlen(hangul_syllables
[T
][2]);
919 if (is_unified_ideograph(code
)) {
921 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
923 sprintf(buffer
, "CJK UNIFIED IDEOGRAPH-%X", code
);
927 /* get offset into phrasebook */
928 offset
= phrasebook_offset1
[(code
>>phrasebook_shift
)];
929 offset
= phrasebook_offset2
[(offset
<<phrasebook_shift
) +
930 (code
&((1<<phrasebook_shift
)-1))];
938 word
= phrasebook
[offset
] - phrasebook_short
;
940 word
= (word
<< 8) + phrasebook
[offset
+1];
943 word
= phrasebook
[offset
++];
946 return 0; /* buffer overflow */
949 /* copy word string from lexicon. the last character in the
950 word has bit 7 set. the last word in a string ends with
952 w
= lexicon
+ lexicon_offset
[word
];
955 return 0; /* buffer overflow */
959 return 0; /* buffer overflow */
/* Strip the bit-7 end-of-word marker when copying each character. */
960 buffer
[i
++] = *w
& 127;
962 break; /* end of word */
/* Case-insensitively compare `name[0..namelen)` against the computed name
   of `code`; returns nonzero on an exact full-length match. Used by the
   hash probe in _getcode(). Interior lines (the `int i;` declaration and
   early returns) are missing from this extraction. */
969 _cmpname(PyObject
*self
, int code
, const char* name
, int namelen
)
971 /* check if code corresponds to the given name */
973 char buffer
[NAME_MAXLEN
];
974 if (!_getucname(self
, code
, buffer
, sizeof(buffer
)))
976 for (i
= 0; i
< namelen
; i
++) {
977 if (toupper(Py_CHARMASK(name
[i
])) != buffer
[i
])
980 return buffer
[namelen
] == '\0';
/* Greedy longest-prefix match of `str` against column `column` of
   hangul_syllables; on a better match records its length in *len and its
   row in *pos. Interior lines (i/len1 declarations, the length comparison
   against the previous best, the *pos/*len updates) are missing from this
   extraction. */
984 find_syllable(const char *str
, int *len
, int *pos
, int count
, int column
)
988 for (i
= 0; i
< count
; i
++) {
989 char *s
= hangul_syllables
[i
][column
];
993 if (strncmp(str
, s
, len1
) == 0) {
/* Inverse of _getucname: resolve a character name to its code point.
   Tries algorithmic Hangul syllable names, then algorithmic CJK unified
   ideograph names (4-5 hex digits), then the open-addressed hash table
   (code_hash / code_magic / code_poly) generated by makeunicodedata.py,
   probing with _cmpname(). Many interior lines (the `v` declaration and
   hex accumulation, *code stores, probe-loop structure, return statements)
   are missing from this extraction. */
1004 _getcode(PyObject
* self
, const char* name
, int namelen
, Py_UCS4
* code
)
1007 unsigned int mask
= code_size
-1;
1008 unsigned int i
, incr
;
1010 /* Check for hangul syllables. */
1011 if (strncmp(name
, "HANGUL SYLLABLE ", 16) == 0) {
1012 int len
, L
= -1, V
= -1, T
= -1;
1013 const char *pos
= name
+ 16;
1014 find_syllable(pos
, &len
, &L
, LCount
, 0);
1016 find_syllable(pos
, &len
, &V
, VCount
, 1);
1018 find_syllable(pos
, &len
, &T
, TCount
, 2);
1020 if (L
!= -1 && V
!= -1 && T
!= -1 && pos
-name
== namelen
) {
1021 *code
= SBase
+ (L
*VCount
+V
)*TCount
+ T
;
1024 /* Otherwise, it's an illegal syllable name. */
1028 /* Check for unified ideographs. */
1029 if (strncmp(name
, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1030 /* Four or five hexdigits must follow. */
1034 if (namelen
!= 4 && namelen
!= 5)
1038 if (*name
>= '0' && *name
<= '9')
1040 else if (*name
>= 'A' && *name
<= 'F')
1041 v
+= *name
- 'A' + 10;
1046 if (!is_unified_ideograph(v
))
1052 /* the following is the same as python's dictionary lookup, with
1053 only minor changes. see the makeunicodedata script for more
1056 h
= (unsigned int) _gethash(name
, namelen
, code_magic
);
1061 if (_cmpname(self
, v
, name
, namelen
)) {
1065 incr
= (h
^ (h
>> 3)) & mask
;
1069 i
= (i
+ incr
) & mask
;
1073 if (_cmpname(self
, v
, name
, namelen
)) {
1079 incr
= incr
^ code_poly
;
/* Capsule payload exported as "unicodedata.ucnhash_CAPI" so the \N{...}
   escape machinery in unicodeobject.c can resolve names. NOTE(review): the
   function-pointer member initializers (_getucname/_getcode) are missing
   from this extraction. */
1083 static const _PyUnicode_Name_CAPI hashAPI
=
1085 sizeof(_PyUnicode_Name_CAPI
),
1090 /* -------------------------------------------------------------------- */
1091 /* Python bindings */
1093 PyDoc_STRVAR(unicodedata_name__doc__
,
1094 "name(unichr[, default])\n\
1095 Returns the name assigned to the Unicode character unichr as a\n\
1096 string. If no name is defined, default is returned, or, if not\n\
1097 given, ValueError is raised.");
/* name(unichr[, default]): resolve via _getucname(); falls back to the
   default object (incref'd) when provided. Interior lines (v/c
   declarations, error returns, the default-return path) are missing from
   this extraction. */
1100 unicodedata_name(PyObject
* self
, PyObject
* args
)
1102 char name
[NAME_MAXLEN
];
1106 PyObject
* defobj
= NULL
;
1107 if (!PyArg_ParseTuple(args
, "O!|O:name", &PyUnicode_Type
, &v
, &defobj
))
1111 if (c
== (Py_UCS4
)-1)
1114 if (!_getucname(self
, c
, name
, sizeof(name
))) {
1115 if (defobj
== NULL
) {
1116 PyErr_SetString(PyExc_ValueError
, "no such name");
1125 return PyUnicode_FromString(name
);
1128 PyDoc_STRVAR(unicodedata_lookup__doc__
,
1131 Look up character by name. If a character with the\n\
1132 given name is found, return the corresponding Unicode\n\
1133 character. If not found, KeyError is raised.");
/* lookup(name): resolve a character name to a 1-character string via
   _getcode(); raises KeyError for unknown names. On narrow builds,
   supplementary-plane results are returned as a 2-element surrogate pair.
   Interior lines (name/namelen/code/str declarations, the #endif, error
   returns) are missing from this extraction. */
1136 unicodedata_lookup(PyObject
* self
, PyObject
* args
)
1143 if (!PyArg_ParseTuple(args
, "s#:lookup", &name
, &namelen
))
1146 if (!_getcode(self
, name
, namelen
, &code
)) {
1147 PyErr_Format(PyExc_KeyError
, "undefined character name '%s'",
1152 #ifndef Py_UNICODE_WIDE
1153 if (code
>= 0x10000) {
1154 str
[0] = 0xd800 + ((code
- 0x10000) >> 10);
1155 str
[1] = 0xdc00 + ((code
- 0x10000) & 0x3ff);
1156 return PyUnicode_FromUnicode(str
, 2);
1159 str
[0] = (Py_UNICODE
) code
;
1160 return PyUnicode_FromUnicode(str
, 1);
1163 /* XXX Add doc strings. */
/* Method table shared by the module and the UCD type (tp_methods), so
   old-version UCD objects expose the same API as the module itself. */
1165 static PyMethodDef unicodedata_functions
[] = {
1166 {"decimal", unicodedata_decimal
, METH_VARARGS
, unicodedata_decimal__doc__
},
1167 {"digit", unicodedata_digit
, METH_VARARGS
, unicodedata_digit__doc__
},
1168 {"numeric", unicodedata_numeric
, METH_VARARGS
, unicodedata_numeric__doc__
},
1169 {"category", unicodedata_category
, METH_VARARGS
,
1170 unicodedata_category__doc__
},
1171 {"bidirectional", unicodedata_bidirectional
, METH_VARARGS
,
1172 unicodedata_bidirectional__doc__
},
1173 {"combining", unicodedata_combining
, METH_VARARGS
,
1174 unicodedata_combining__doc__
},
1175 {"mirrored", unicodedata_mirrored
, METH_VARARGS
,
1176 unicodedata_mirrored__doc__
},
1177 {"east_asian_width", unicodedata_east_asian_width
, METH_VARARGS
,
1178 unicodedata_east_asian_width__doc__
},
1179 {"decomposition", unicodedata_decomposition
, METH_VARARGS
,
1180 unicodedata_decomposition__doc__
},
1181 {"name", unicodedata_name
, METH_VARARGS
, unicodedata_name__doc__
},
1182 {"lookup", unicodedata_lookup
, METH_VARARGS
, unicodedata_lookup__doc__
},
1183 {"normalize", unicodedata_normalize
, METH_VARARGS
,
1184 unicodedata_normalize__doc__
},
1185 {NULL
, NULL
} /* sentinel */
/* Type object for previous-version UCD snapshots. NOTE(review): many of
   the zeroed slot lines are missing from this extraction; the visible
   slots (dealloc, getattro, methods, members) match the fields referenced
   elsewhere in the file. */
1188 static PyTypeObject UCD_Type
= {
1189 /* The ob_type field must be initialized in the module init function
1190 * to be portable to Windows without using C++. */
1191 PyVarObject_HEAD_INIT(NULL
, 0)
1192 "unicodedata.UCD", /*tp_name*/
1193 sizeof(PreviousDBVersion
), /*tp_basicsize*/
1196 (destructor
)PyObject_Del
, /*tp_dealloc*/
1203 0, /*tp_as_sequence*/
1204 0, /*tp_as_mapping*/
1208 PyObject_GenericGetAttr
,/*tp_getattro*/
1211 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
1215 0, /*tp_richcompare*/
1216 0, /*tp_weaklistoffset*/
1219 unicodedata_functions
, /*tp_methods*/
1220 DB_members
, /*tp_members*/
1226 0, /*tp_dictoffset*/
1234 PyDoc_STRVAR(unicodedata_docstring
,
1235 "This module provides access to the Unicode Character Database which\n\
1236 defines character properties for all Unicode characters. The data in\n\
1237 this database is based on the UnicodeData.txt file version\n\
1238 5.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
1240 The module uses the same names and symbols as defined by the\n\
1241 UnicodeData File Format 5.1.0 (see\n\
1242 http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
/* Module definition for PyInit_unicodedata(). NOTE(review): the module
   name line and the trailing NULL slot lines are missing from this
   extraction. */
1245 static struct PyModuleDef unicodedatamodule
= {
1246 PyModuleDef_HEAD_INIT
,
1248 unicodedata_docstring
,
1250 unicodedata_functions
,
/* Module init: finalize UCD_Type (ob_type set at runtime for Windows/C
   portability), create the module, then attach UCD, the 3.2.0 snapshot,
   the version string, and the ucnhash capsule consumed by \N{...} escapes.
   Interior lines (m/v declarations, NULL checks after each call, the final
   return m) are missing from this extraction. */
1258 PyInit_unicodedata(void)
1262 Py_TYPE(&UCD_Type
) = &PyType_Type
;
1264 m
= PyModule_Create(&unicodedatamodule
);
1268 PyModule_AddStringConstant(m
, "unidata_version", UNIDATA_VERSION
);
1269 Py_INCREF(&UCD_Type
);
1270 PyModule_AddObject(m
, "UCD", (PyObject
*)&UCD_Type
);
1272 /* Previous versions */
1273 v
= new_previous_version("3.2.0", get_change_3_2_0
, normalization_3_2_0
);
1275 PyModule_AddObject(m
, "ucd_3_2_0", v
);
1278 v
= PyCapsule_New((void *)&hashAPI
, PyUnicodeData_CAPSULE_NAME
, NULL
);
1280 PyModule_AddObject(m
, "ucnhash_CAPI", v
);
1287 indent-tabs-mode: nil