2 Unicode character type helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
5 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
7 Copyright (c) Corporation for National Research Initiatives.
12 #include "unicodeobject.h"
/* Bit masks tested against the `flags` member of _PyUnicode_TypeRecord.
   Every mask is a distinct single bit, so one record can carry several. */
14 #define ALPHA_MASK 0x01
15 #define DECIMAL_MASK 0x02
16 #define DIGIT_MASK 0x04
17 #define LOWER_MASK 0x08
18 #define LINEBREAK_MASK 0x10
19 #define SPACE_MASK 0x20
20 #define TITLE_MASK 0x40
21 #define UPPER_MASK 0x80
/* Ninth bit: does not fit in 8 bits, which is consistent with `flags`
   being declared unsigned short.  Checked by the To*case helpers. */
22 #define NODELTA_MASK 0x100
/* Members of _PyUnicode_TypeRecord.
   - upper, lower, title: Py_UNICODE values; the upper/lower case helpers
     in this file load `upper` and `lower` into a local named `delta`.
   - decimal, digit: returned by _PyUnicode_ToDecimalDigit() and
     _PyUnicode_ToDigit() when DECIMAL_MASK / DIGIT_MASK is set in flags.
   - flags: tested against the *_MASK constants. */
25 const Py_UNICODE upper
;
26 const Py_UNICODE lower
;
27 const Py_UNICODE title
;
28 const unsigned char decimal
;
29 const unsigned char digit
;
30 const unsigned short flags
;
31 } _PyUnicode_TypeRecord
;
33 #include "unicodetype_db.h"
/* Returns a pointer to the type record for `code`, found through a
   two-stage table lookup.  index1, index2, SHIFT and
   _PyUnicode_TypeRecords are not defined in the visible code; presumably
   they come from unicodetype_db.h -- TODO confirm. */
35 static const _PyUnicode_TypeRecord
*
36 gettyperecord(Py_UNICODE code
)
40 #ifdef Py_UNICODE_WIDE
/* Stage 1: the bits of code above the low SHIFT bits select an index1 entry. */
46 index
= index1
[(code
>>SHIFT
)];
/* Stage 2: that entry, shifted left by SHIFT, plus the low SHIFT bits of
   code ((1<<SHIFT)-1 is the low-bit mask) selects an index2 entry. */
47 index
= index2
[(index
<<SHIFT
)+(code
&((1<<SHIFT
)-1))];
/* The final index picks the record; its address is returned. */
50 return &_PyUnicode_TypeRecords
[index
];
53 /* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or
54 type 'B', 0 otherwise. */
/* Decided from explicit code-point case labels on ch (listed below), not
   from the type-record table; presumably every listed code point yields
   1 -- TODO confirm. */
56 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch
)
59 case 0x000A: /* LINE FEED */
60 case 0x000D: /* CARRIAGE RETURN */
61 case 0x001C: /* FILE SEPARATOR */
62 case 0x001D: /* GROUP SEPARATOR */
63 case 0x001E: /* RECORD SEPARATOR */
64 case 0x0085: /* NEXT LINE */
65 case 0x2028: /* LINE SEPARATOR */
66 case 0x2029: /* PARAGRAPH SEPARATOR */
73 /* Returns the titlecase Unicode characters corresponding to ch or just
74 ch if no titlecase mapping is known. */
/* Fetches the type record for ch with gettyperecord() and branches on
   NODELTA_MASK in its flags before producing the result. */
76 Py_UNICODE
_PyUnicode_ToTitlecase(register Py_UNICODE ch
)
78 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
86 if (ctype
->flags
& NODELTA_MASK
)
95 /* Returns 1 for Unicode characters having the category 'Lt', 0 otherwise. */
/* Tests TITLE_MASK in the flags of ch's type record; the != 0 comparison
   normalizes the masked value to exactly 0 or 1. */
98 int _PyUnicode_IsTitlecase(Py_UNICODE ch
)
100 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
102 return (ctype
->flags
& TITLE_MASK
) != 0;
105 /* Returns the integer decimal (0-9) for Unicode characters having
106 this property, -1 otherwise. */
/* The record's `decimal` field is returned only when DECIMAL_MASK is set
   in its flags; in every other case the result is -1. */
108 int _PyUnicode_ToDecimalDigit(Py_UNICODE ch
)
110 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
112 return (ctype
->flags
& DECIMAL_MASK
) ? ctype
->decimal
: -1;
/* Decided by calling _PyUnicode_ToDecimalDigit() and testing whether the
   result is negative (that function returns -1 when ch has no decimal
   value). */
115 int _PyUnicode_IsDecimalDigit(Py_UNICODE ch
)
117 if (_PyUnicode_ToDecimalDigit(ch
) < 0)
122 /* Returns the integer digit (0-9) for Unicode characters having
123 this property, -1 otherwise. */
/* The record's `digit` field is returned only when DIGIT_MASK is set in
   its flags; in every other case the result is -1. */
125 int _PyUnicode_ToDigit(Py_UNICODE ch
)
127 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
129 return (ctype
->flags
& DIGIT_MASK
) ? ctype
->digit
: -1;
/* Decided by calling _PyUnicode_ToDigit() and testing whether the result
   is negative (that function returns -1 when ch has no digit value). */
132 int _PyUnicode_IsDigit(Py_UNICODE ch
)
134 if (_PyUnicode_ToDigit(ch
) < 0)
139 /* Returns the numeric value as double for Unicode characters having
140 this property, -1.0 otherwise. */
142 /* TODO: replace with unicodetype_db.h table */
/* The visible return statements yield hard-coded constants: fractions
   and multiples of one thousand.  Several groups are compiled only when
   Py_UNICODE_WIDE is defined.  The code points that select each return
   are not visible here. */
144 double _PyUnicode_ToNumeric(Py_UNICODE ch
)
/* A cast binds tighter than '/', so each fraction is computed in double:
   this one is -0.5 (integer division -1 / 2 would have given 0). */
148 return (double) -1 / 2;
151 #ifdef Py_UNICODE_WIDE
164 #ifdef Py_UNICODE_WIDE
177 #ifdef Py_UNICODE_WIDE
182 return (double) 1 / 2;
184 return (double) 1 / 3;
186 #ifdef Py_UNICODE_WIDE
189 return (double) 1 / 4;
191 return (double) 1 / 5;
193 return (double) 1 / 6;
195 return (double) 1 / 8;
210 #ifdef Py_UNICODE_WIDE
229 #ifdef Py_UNICODE_WIDE
242 #ifdef Py_UNICODE_WIDE
249 return (double) 1000;
252 #ifdef Py_UNICODE_WIDE
256 return (double) 10000;
265 return (double) 11 / 2;
279 return (double) 13 / 2;
291 return (double) 15 / 2;
305 return (double) 17 / 2;
326 #ifdef Py_UNICODE_WIDE
336 #ifdef Py_UNICODE_WIDE
339 return (double) 2 / 3;
341 return (double) 2 / 5;
348 #ifdef Py_UNICODE_WIDE
354 #ifdef Py_UNICODE_WIDE
358 return (double) 2000;
360 return (double) 20000;
388 #ifdef Py_UNICODE_WIDE
393 return (double) 3 / 2;
395 #ifdef Py_UNICODE_WIDE
398 return (double) 3 / 4;
400 return (double) 3 / 5;
402 return (double) 3 / 8;
406 #ifdef Py_UNICODE_WIDE
411 #ifdef Py_UNICODE_WIDE
416 return (double) 3000;
418 return (double) 30000;
446 #ifdef Py_UNICODE_WIDE
451 return (double) 4 / 5;
454 #ifdef Py_UNICODE_WIDE
458 #ifdef Py_UNICODE_WIDE
462 return (double) 4000;
464 return (double) 40000;
490 #ifdef Py_UNICODE_WIDE
501 return (double) 5 / 2;
503 return (double) 5 / 6;
505 return (double) 5 / 8;
510 #ifdef Py_UNICODE_WIDE
525 #ifdef Py_UNICODE_WIDE
538 #ifdef Py_UNICODE_WIDE
544 return (double) 5000;
545 #ifdef Py_UNICODE_WIDE
549 return (double) 50000;
557 #ifdef Py_UNICODE_WIDE
562 #ifdef Py_UNICODE_WIDE
566 #ifdef Py_UNICODE_WIDE
570 return (double) 6000;
572 return (double) 60000;
580 #ifdef Py_UNICODE_WIDE
585 return (double) 7 / 2;
587 return (double) 7 / 8;
589 #ifdef Py_UNICODE_WIDE
593 #ifdef Py_UNICODE_WIDE
597 return (double) 7000;
599 return (double) 70000;
607 #ifdef Py_UNICODE_WIDE
612 #ifdef Py_UNICODE_WIDE
616 #ifdef Py_UNICODE_WIDE
620 return (double) 8000;
622 return (double) 80000;
630 #ifdef Py_UNICODE_WIDE
635 return (double) 9 / 2;
637 #ifdef Py_UNICODE_WIDE
641 #ifdef Py_UNICODE_WIDE
646 return (double) 9000;
648 return (double) 90000;
/* Last return: defers to _PyUnicode_ToDigit() and converts its int result
   to double, so its -1 "no digit" value becomes -1.0.  Presumably this is
   the path for characters not handled above -- TODO confirm. */
651 return (double) _PyUnicode_ToDigit(ch
);
/* Non-zero when _PyUnicode_ToNumeric() returns anything other than -1.0,
   the value that function documents for characters without a numeric
   property.  Other negative results (such as -0.5) still count as
   numeric. */
655 int _PyUnicode_IsNumeric(Py_UNICODE ch
)
657 return _PyUnicode_ToNumeric(ch
) != -1.0;
660 #ifndef WANT_WCTYPE_FUNCTIONS
662 /* Returns 1 for Unicode characters having the bidirectional type
663 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
/* Decided from explicit code-point case labels on ch (listed below), not
   from the type-record table; presumably every listed code point yields
   1 -- TODO confirm.  This definition sits under
   #ifndef WANT_WCTYPE_FUNCTIONS. */
665 int _PyUnicode_IsWhitespace(register const Py_UNICODE ch
)
668 case 0x0009: /* HORIZONTAL TABULATION */
669 case 0x000A: /* LINE FEED */
670 case 0x000B: /* VERTICAL TABULATION */
671 case 0x000C: /* FORM FEED */
672 case 0x000D: /* CARRIAGE RETURN */
673 case 0x001C: /* FILE SEPARATOR */
674 case 0x001D: /* GROUP SEPARATOR */
675 case 0x001E: /* RECORD SEPARATOR */
676 case 0x001F: /* UNIT SEPARATOR */
677 case 0x0020: /* SPACE */
678 case 0x0085: /* NEXT LINE */
679 case 0x00A0: /* NO-BREAK SPACE */
680 case 0x1680: /* OGHAM SPACE MARK */
681 case 0x2000: /* EN QUAD */
682 case 0x2001: /* EM QUAD */
683 case 0x2002: /* EN SPACE */
684 case 0x2003: /* EM SPACE */
685 case 0x2004: /* THREE-PER-EM SPACE */
686 case 0x2005: /* FOUR-PER-EM SPACE */
687 case 0x2006: /* SIX-PER-EM SPACE */
688 case 0x2007: /* FIGURE SPACE */
689 case 0x2008: /* PUNCTUATION SPACE */
690 case 0x2009: /* THIN SPACE */
691 case 0x200A: /* HAIR SPACE */
692 case 0x200B: /* ZERO WIDTH SPACE */
693 case 0x2028: /* LINE SEPARATOR */
694 case 0x2029: /* PARAGRAPH SEPARATOR */
695 case 0x202F: /* NARROW NO-BREAK SPACE */
696 case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
697 case 0x3000: /* IDEOGRAPHIC SPACE */
704 /* Returns 1 for Unicode characters having the category 'Ll', 0 otherwise. */
/* Tests LOWER_MASK in the flags of ch's type record; the != 0 comparison
   normalizes the masked value to exactly 0 or 1. */
707 int _PyUnicode_IsLowercase(Py_UNICODE ch
)
709 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
711 return (ctype
->flags
& LOWER_MASK
) != 0;
714 /* Returns 1 for Unicode characters having the category 'Lu', 0 otherwise. */
/* Tests UPPER_MASK in the flags of ch's type record; the != 0 comparison
   normalizes the masked value to exactly 0 or 1. */
717 int _PyUnicode_IsUppercase(Py_UNICODE ch
)
719 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
721 return (ctype
->flags
& UPPER_MASK
) != 0;
724 /* Returns the uppercase Unicode characters corresponding to ch or just
725 ch if no uppercase mapping is known. */
/* Loads the record's `upper` field into the int `delta`, then branches on
   NODELTA_MASK in the record's flags before producing the result. */
727 Py_UNICODE
_PyUnicode_ToUppercase(Py_UNICODE ch
)
729 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
730 int delta
= ctype
->upper
;
731 if (ctype
->flags
& NODELTA_MASK
)
738 /* Returns the lowercase Unicode characters corresponding to ch or just
739 ch if no lowercase mapping is known. */
/* Loads the record's `lower` field into the int `delta`, then branches on
   NODELTA_MASK in the record's flags before producing the result. */
741 Py_UNICODE
_PyUnicode_ToLowercase(Py_UNICODE ch
)
743 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
744 int delta
= ctype
->lower
;
745 if (ctype
->flags
& NODELTA_MASK
)
752 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
753 'Lo' or 'Lm', 0 otherwise. */
/* Tests ALPHA_MASK in the flags of ch's type record; the != 0 comparison
   normalizes the masked value to exactly 0 or 1. */
755 int _PyUnicode_IsAlpha(Py_UNICODE ch
)
757 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
759 return (ctype
->flags
& ALPHA_MASK
) != 0;
764 /* Export the interfaces using the wchar_t type for portability. */
/* These six signatures repeat names that are already defined earlier in
   this file under #ifndef WANT_WCTYPE_FUNCTIONS.  Presumably they are the
   alternative definitions compiled when WANT_WCTYPE_FUNCTIONS is defined
   -- TODO confirm; their bodies are not visible here. */
767 int _PyUnicode_IsWhitespace(Py_UNICODE ch
)
772 int _PyUnicode_IsLowercase(Py_UNICODE ch
)
777 int _PyUnicode_IsUppercase(Py_UNICODE ch
)
782 Py_UNICODE
_PyUnicode_ToLowercase(Py_UNICODE ch
)
787 Py_UNICODE
_PyUnicode_ToUppercase(Py_UNICODE ch
)
792 int _PyUnicode_IsAlpha(Py_UNICODE ch
)