2 Unicode character type helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
5 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
7 Copyright (c) Corporation for National Research Initiatives.
12 #include "unicodeobject.h"
/* Bit masks tested against the `flags` member of a _PyUnicode_TypeRecord.
   Every mask occupies a distinct bit, so one record can carry several
   properties at once. */
#define ALPHA_MASK     (1 << 0)  /* tested by _PyUnicode_IsAlpha */
#define DECIMAL_MASK   (1 << 1)  /* tested by _PyUnicode_ToDecimalDigit */
#define DIGIT_MASK     (1 << 2)  /* tested by _PyUnicode_ToDigit */
#define LOWER_MASK     (1 << 3)  /* tested by _PyUnicode_IsLowercase */
#define LINEBREAK_MASK (1 << 4)
#define SPACE_MASK     (1 << 5)
#define TITLE_MASK     (1 << 6)  /* tested by _PyUnicode_IsTitlecase */
#define UPPER_MASK     (1 << 7)  /* tested by _PyUnicode_IsUppercase */
#define NODELTA_MASK   (1 << 8)  /* tested by the To{Title,Upper,Lower}case helpers */
25 const Py_UNICODE upper
;
26 const Py_UNICODE lower
;
27 const Py_UNICODE title
;
28 const unsigned char decimal
;
29 const unsigned char digit
;
30 const unsigned short flags
;
31 } _PyUnicode_TypeRecord
;
33 #include "unicodetype_db.h"
35 static const _PyUnicode_TypeRecord
*
36 gettyperecord(Py_UNICODE code
)
40 #ifdef Py_UNICODE_WIDE
46 index
= index1
[(code
>>SHIFT
)];
47 index
= index2
[(index
<<SHIFT
)+(code
&((1<<SHIFT
)-1))];
50 return &_PyUnicode_TypeRecords
[index
];
53 /* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or
54 type 'B', 0 otherwise. */
56 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch
)
59 case 0x000A: /* LINE FEED */
60 case 0x000D: /* CARRIAGE RETURN */
61 case 0x001C: /* FILE SEPARATOR */
62 case 0x001D: /* GROUP SEPARATOR */
63 case 0x001E: /* RECORD SEPARATOR */
64 case 0x0085: /* NEXT LINE */
65 case 0x2028: /* LINE SEPARATOR */
66 case 0x2029: /* PARAGRAPH SEPARATOR */
73 /* Returns the titlecase Unicode characters corresponding to ch or just
74 ch if no titlecase mapping is known. */
76 Py_UNICODE
_PyUnicode_ToTitlecase(register Py_UNICODE ch
)
78 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
79 int delta
= ctype
->title
;
81 if (ctype
->flags
& NODELTA_MASK
)
90 /* Returns 1 for Unicode characters having the category 'Lt', 0
93 int _PyUnicode_IsTitlecase(Py_UNICODE ch
)
95 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
97 return (ctype
->flags
& TITLE_MASK
) != 0;
100 /* Returns the integer decimal (0-9) for Unicode characters having
101 this property, -1 otherwise. */
103 int _PyUnicode_ToDecimalDigit(Py_UNICODE ch
)
105 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
107 return (ctype
->flags
& DECIMAL_MASK
) ? ctype
->decimal
: -1;
110 int _PyUnicode_IsDecimalDigit(Py_UNICODE ch
)
112 if (_PyUnicode_ToDecimalDigit(ch
) < 0)
117 /* Returns the integer digit (0-9) for Unicode characters having
118 this property, -1 otherwise. */
120 int _PyUnicode_ToDigit(Py_UNICODE ch
)
122 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
124 return (ctype
->flags
& DIGIT_MASK
) ? ctype
->digit
: -1;
127 int _PyUnicode_IsDigit(Py_UNICODE ch
)
129 if (_PyUnicode_ToDigit(ch
) < 0)
134 /* Returns the numeric value as double for Unicode characters having
135 this property, -1.0 otherwise. */
137 /* TODO: replace with unicodetype_db.h table */
139 double _PyUnicode_ToNumeric(Py_UNICODE ch
)
143 return (double) -1 / 2;
146 #ifdef Py_UNICODE_WIDE
159 #ifdef Py_UNICODE_WIDE
172 #ifdef Py_UNICODE_WIDE
177 return (double) 1 / 2;
179 return (double) 1 / 3;
181 #ifdef Py_UNICODE_WIDE
184 return (double) 1 / 4;
186 return (double) 1 / 5;
188 return (double) 1 / 6;
190 return (double) 1 / 8;
205 #ifdef Py_UNICODE_WIDE
224 #ifdef Py_UNICODE_WIDE
237 #ifdef Py_UNICODE_WIDE
244 return (double) 1000;
247 #ifdef Py_UNICODE_WIDE
251 return (double) 10000;
260 return (double) 11 / 2;
274 return (double) 13 / 2;
286 return (double) 15 / 2;
300 return (double) 17 / 2;
321 #ifdef Py_UNICODE_WIDE
331 #ifdef Py_UNICODE_WIDE
334 return (double) 2 / 3;
336 return (double) 2 / 5;
343 #ifdef Py_UNICODE_WIDE
349 #ifdef Py_UNICODE_WIDE
353 return (double) 2000;
355 return (double) 20000;
383 #ifdef Py_UNICODE_WIDE
388 return (double) 3 / 2;
390 #ifdef Py_UNICODE_WIDE
393 return (double) 3 / 4;
395 return (double) 3 / 5;
397 return (double) 3 / 8;
401 #ifdef Py_UNICODE_WIDE
406 #ifdef Py_UNICODE_WIDE
411 return (double) 3000;
413 return (double) 30000;
441 #ifdef Py_UNICODE_WIDE
446 return (double) 4 / 5;
449 #ifdef Py_UNICODE_WIDE
453 #ifdef Py_UNICODE_WIDE
457 return (double) 4000;
459 return (double) 40000;
485 #ifdef Py_UNICODE_WIDE
496 return (double) 5 / 2;
498 return (double) 5 / 6;
500 return (double) 5 / 8;
505 #ifdef Py_UNICODE_WIDE
520 #ifdef Py_UNICODE_WIDE
533 #ifdef Py_UNICODE_WIDE
539 return (double) 5000;
540 #ifdef Py_UNICODE_WIDE
544 return (double) 50000;
552 #ifdef Py_UNICODE_WIDE
557 #ifdef Py_UNICODE_WIDE
561 #ifdef Py_UNICODE_WIDE
565 return (double) 6000;
567 return (double) 60000;
575 #ifdef Py_UNICODE_WIDE
580 return (double) 7 / 2;
582 return (double) 7 / 8;
584 #ifdef Py_UNICODE_WIDE
588 #ifdef Py_UNICODE_WIDE
592 return (double) 7000;
594 return (double) 70000;
602 #ifdef Py_UNICODE_WIDE
607 #ifdef Py_UNICODE_WIDE
611 #ifdef Py_UNICODE_WIDE
615 return (double) 8000;
617 return (double) 80000;
625 #ifdef Py_UNICODE_WIDE
630 return (double) 9 / 2;
632 #ifdef Py_UNICODE_WIDE
636 #ifdef Py_UNICODE_WIDE
641 return (double) 9000;
643 return (double) 90000;
646 return (double) _PyUnicode_ToDigit(ch
);
650 int _PyUnicode_IsNumeric(Py_UNICODE ch
)
652 return _PyUnicode_ToNumeric(ch
) != -1.0;
655 #ifndef WANT_WCTYPE_FUNCTIONS
657 /* Returns 1 for Unicode characters having the bidirectional type
658 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
660 int _PyUnicode_IsWhitespace(register const Py_UNICODE ch
)
663 case 0x0009: /* HORIZONTAL TABULATION */
664 case 0x000A: /* LINE FEED */
665 case 0x000B: /* VERTICAL TABULATION */
666 case 0x000C: /* FORM FEED */
667 case 0x000D: /* CARRIAGE RETURN */
668 case 0x001C: /* FILE SEPARATOR */
669 case 0x001D: /* GROUP SEPARATOR */
670 case 0x001E: /* RECORD SEPARATOR */
671 case 0x001F: /* UNIT SEPARATOR */
672 case 0x0020: /* SPACE */
673 case 0x0085: /* NEXT LINE */
674 case 0x00A0: /* NO-BREAK SPACE */
675 case 0x1680: /* OGHAM SPACE MARK */
676 case 0x2000: /* EN QUAD */
677 case 0x2001: /* EM QUAD */
678 case 0x2002: /* EN SPACE */
679 case 0x2003: /* EM SPACE */
680 case 0x2004: /* THREE-PER-EM SPACE */
681 case 0x2005: /* FOUR-PER-EM SPACE */
682 case 0x2006: /* SIX-PER-EM SPACE */
683 case 0x2007: /* FIGURE SPACE */
684 case 0x2008: /* PUNCTUATION SPACE */
685 case 0x2009: /* THIN SPACE */
686 case 0x200A: /* HAIR SPACE */
687 case 0x200B: /* ZERO WIDTH SPACE */
688 case 0x2028: /* LINE SEPARATOR */
689 case 0x2029: /* PARAGRAPH SEPARATOR */
690 case 0x202F: /* NARROW NO-BREAK SPACE */
691 case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
692 case 0x3000: /* IDEOGRAPHIC SPACE */
699 /* Returns 1 for Unicode characters having the category 'Ll', 0
702 int _PyUnicode_IsLowercase(Py_UNICODE ch
)
704 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
706 return (ctype
->flags
& LOWER_MASK
) != 0;
709 /* Returns 1 for Unicode characters having the category 'Lu', 0
712 int _PyUnicode_IsUppercase(Py_UNICODE ch
)
714 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
716 return (ctype
->flags
& UPPER_MASK
) != 0;
719 /* Returns the uppercase Unicode characters corresponding to ch or just
720 ch if no uppercase mapping is known. */
722 Py_UNICODE
_PyUnicode_ToUppercase(Py_UNICODE ch
)
724 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
725 int delta
= ctype
->upper
;
726 if (ctype
->flags
& NODELTA_MASK
)
733 /* Returns the lowercase Unicode characters corresponding to ch or just
734 ch if no lowercase mapping is known. */
736 Py_UNICODE
_PyUnicode_ToLowercase(Py_UNICODE ch
)
738 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
739 int delta
= ctype
->lower
;
740 if (ctype
->flags
& NODELTA_MASK
)
747 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
748 'Lo' or 'Lm', 0 otherwise. */
750 int _PyUnicode_IsAlpha(Py_UNICODE ch
)
752 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
754 return (ctype
->flags
& ALPHA_MASK
) != 0;
/* Export the interfaces using the wchar_t type for portability. */
762 int _PyUnicode_IsWhitespace(Py_UNICODE ch
)
767 int _PyUnicode_IsLowercase(Py_UNICODE ch
)
772 int _PyUnicode_IsUppercase(Py_UNICODE ch
)
777 Py_UNICODE
_PyUnicode_ToLowercase(Py_UNICODE ch
)
782 Py_UNICODE
_PyUnicode_ToUppercase(Py_UNICODE ch
)
787 int _PyUnicode_IsAlpha(Py_UNICODE ch
)