2 Unicode character type helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
5 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
7 Copyright (c) Corporation for National Research Initiatives.
12 #include "unicodeobject.h"
/* Bit masks for the `flags` member of _PyUnicode_TypeRecord. */
/* Tested by _PyUnicode_IsAlpha(). */
14 #define ALPHA_MASK 0x01
/* Gates the `decimal` field in _PyUnicode_ToDecimalDigit(). */
15 #define DECIMAL_MASK 0x02
/* Gates the `digit` field in _PyUnicode_ToDigit(). */
16 #define DIGIT_MASK 0x04
/* Tested by _PyUnicode_IsLowercase(). */
17 #define LOWER_MASK 0x08
/* Not referenced in the visible part of this file. */
18 #define LINEBREAK_MASK 0x10
/* Not referenced in the visible part of this file. */
19 #define SPACE_MASK 0x20
/* Tested by _PyUnicode_IsTitlecase(). */
20 #define TITLE_MASK 0x40
/* Tested by _PyUnicode_IsUppercase(). */
21 #define UPPER_MASK 0x80
/* Fields of the per-character type record, _PyUnicode_TypeRecord.  The
   opening line of the declaration is not present in this listing. */
/* Read into a local `delta` by _PyUnicode_ToUppercase(). */
24 const Py_UNICODE upper
;
/* Read into a local `delta` by _PyUnicode_ToLowercase(). */
25 const Py_UNICODE lower
;
/* Presumably the titlecase counterpart of upper/lower; no reader is
   visible in this listing -- confirm against _PyUnicode_ToTitlecase(). */
26 const Py_UNICODE title
;
/* Returned by _PyUnicode_ToDecimalDigit() when DECIMAL_MASK is set. */
27 const unsigned char decimal
;
/* Returned by _PyUnicode_ToDigit() when DIGIT_MASK is set. */
28 const unsigned char digit
;
/* Bit set tested against the mask constants defined above the record. */
29 const unsigned short flags
;
30 } _PyUnicode_TypeRecord
;
32 #include "unicodetype_db.h"
/* Look up the type record for `code` through a two-stage table: index1 is
   indexed by the high bits (code >> SHIFT); that result, shifted left by
   SHIFT and combined with the low SHIFT bits of `code`, indexes index2.
   The final index selects an entry of _PyUnicode_TypeRecords, whose
   address is returned.  index1, index2, SHIFT and _PyUnicode_TypeRecords
   are not defined in the visible part of this file -- presumably they come
   from unicodetype_db.h (confirm).  The declaration of `index` and the code
   guarded by Py_UNICODE_WIDE are not visible in this listing. */
34 static const _PyUnicode_TypeRecord
*
35 gettyperecord(Py_UNICODE code
)
39 #ifdef Py_UNICODE_WIDE
45 index
= index1
[(code
>>SHIFT
)];
46 index
= index2
[(index
<<SHIFT
)+(code
&((1<<SHIFT
)-1))];
49 return &_PyUnicode_TypeRecords
[index
];
52 /* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or
53 type 'B', 0 otherwise. */
/* The case labels below are the code points this function singles out;
   every one of them also appears in the explicit case list of
   _PyUnicode_IsWhitespace().  The enclosing switch and the return
   statements are not visible in this listing. */
55 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch
)
58 case 0x000A: /* LINE FEED */
59 case 0x000D: /* CARRIAGE RETURN */
60 case 0x001C: /* FILE SEPARATOR */
61 case 0x001D: /* GROUP SEPARATOR */
62 case 0x001E: /* RECORD SEPARATOR */
63 case 0x0085: /* NEXT LINE */
64 case 0x2028: /* LINE SEPARATOR */
65 case 0x2029: /* PARAGRAPH SEPARATOR */
72 /* Returns the titlecase Unicode characters corresponding to ch or just
73 ch if no titlecase mapping is known. */
/* Starts by fetching the type record for ch via gettyperecord(); the
   statements that derive the result from that record are not visible in
   this listing. */
75 Py_UNICODE
_PyUnicode_ToTitlecase(register Py_UNICODE ch
)
77 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
91 /* Returns 1 for Unicode characters having the category 'Lt', 0
94 int _PyUnicode_IsTitlecase(Py_UNICODE ch
)
96 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
98 return (ctype
->flags
& TITLE_MASK
) != 0;
101 /* Returns the integer decimal (0-9) for Unicode characters having
102 this property, -1 otherwise. */
104 int _PyUnicode_ToDecimalDigit(Py_UNICODE ch
)
106 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
108 return (ctype
->flags
& DECIMAL_MASK
) ? ctype
->decimal
: -1;
/* Decimal-digit predicate built on _PyUnicode_ToDecimalDigit(), which
   returns -1 for characters whose record lacks DECIMAL_MASK; the `< 0`
   test below detects that case.  The return statements that follow the
   test are not visible in this listing. */
111 int _PyUnicode_IsDecimalDigit(Py_UNICODE ch
)
113 if (_PyUnicode_ToDecimalDigit(ch
) < 0)
118 /* Returns the integer digit (0-9) for Unicode characters having
119 this property, -1 otherwise. */
121 int _PyUnicode_ToDigit(Py_UNICODE ch
)
123 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
125 return (ctype
->flags
& DIGIT_MASK
) ? ctype
->digit
: -1;
/* Digit predicate built on _PyUnicode_ToDigit(), which returns -1 for
   characters whose record lacks DIGIT_MASK; the `< 0` test below detects
   that case.  The return statements that follow the test are not visible
   in this listing. */
128 int _PyUnicode_IsDigit(Py_UNICODE ch
)
130 if (_PyUnicode_ToDigit(ch
) < 0)
135 /* Returns the numeric value as double for Unicode characters having
136 this property, -1.0 otherwise. */
138 /* TODO: replace with unicodetype_db.h table */
/* Only the return statements and the Py_UNICODE_WIDE guards of this
   function survive in this listing; the case labels that select each
   value are not visible.  The visible returns yield fractions and large
   round values.  In each `(double) a / b` the cast applies to `a` before
   the division, so the division is carried out in floating point
   (e.g. 1 / 2 gives 0.5, and -1 / 2 gives -0.5). */
140 double _PyUnicode_ToNumeric(Py_UNICODE ch
)
144 return (double) -1 / 2;
147 #ifdef Py_UNICODE_WIDE
160 #ifdef Py_UNICODE_WIDE
173 #ifdef Py_UNICODE_WIDE
178 return (double) 1 / 2;
180 return (double) 1 / 3;
182 #ifdef Py_UNICODE_WIDE
185 return (double) 1 / 4;
187 return (double) 1 / 5;
189 return (double) 1 / 6;
191 return (double) 1 / 8;
206 #ifdef Py_UNICODE_WIDE
225 #ifdef Py_UNICODE_WIDE
238 #ifdef Py_UNICODE_WIDE
245 return (double) 1000;
248 #ifdef Py_UNICODE_WIDE
252 return (double) 10000;
261 return (double) 11 / 2;
275 return (double) 13 / 2;
287 return (double) 15 / 2;
301 return (double) 17 / 2;
322 #ifdef Py_UNICODE_WIDE
332 #ifdef Py_UNICODE_WIDE
335 return (double) 2 / 3;
337 return (double) 2 / 5;
344 #ifdef Py_UNICODE_WIDE
350 #ifdef Py_UNICODE_WIDE
354 return (double) 2000;
356 return (double) 20000;
384 #ifdef Py_UNICODE_WIDE
389 return (double) 3 / 2;
391 #ifdef Py_UNICODE_WIDE
394 return (double) 3 / 4;
396 return (double) 3 / 5;
398 return (double) 3 / 8;
402 #ifdef Py_UNICODE_WIDE
407 #ifdef Py_UNICODE_WIDE
412 return (double) 3000;
414 return (double) 30000;
442 #ifdef Py_UNICODE_WIDE
447 return (double) 4 / 5;
450 #ifdef Py_UNICODE_WIDE
454 #ifdef Py_UNICODE_WIDE
458 return (double) 4000;
460 return (double) 40000;
486 #ifdef Py_UNICODE_WIDE
497 return (double) 5 / 2;
499 return (double) 5 / 6;
501 return (double) 5 / 8;
506 #ifdef Py_UNICODE_WIDE
521 #ifdef Py_UNICODE_WIDE
534 #ifdef Py_UNICODE_WIDE
540 return (double) 5000;
541 #ifdef Py_UNICODE_WIDE
545 return (double) 50000;
553 #ifdef Py_UNICODE_WIDE
558 #ifdef Py_UNICODE_WIDE
562 #ifdef Py_UNICODE_WIDE
566 return (double) 6000;
568 return (double) 60000;
576 #ifdef Py_UNICODE_WIDE
581 return (double) 7 / 2;
583 return (double) 7 / 8;
585 #ifdef Py_UNICODE_WIDE
589 #ifdef Py_UNICODE_WIDE
593 return (double) 7000;
595 return (double) 70000;
603 #ifdef Py_UNICODE_WIDE
608 #ifdef Py_UNICODE_WIDE
612 #ifdef Py_UNICODE_WIDE
616 return (double) 8000;
618 return (double) 80000;
626 #ifdef Py_UNICODE_WIDE
631 return (double) 9 / 2;
633 #ifdef Py_UNICODE_WIDE
637 #ifdef Py_UNICODE_WIDE
642 return (double) 9000;
644 return (double) 90000;
/* Last visible return (presumably the switch default -- confirm): defer to
   _PyUnicode_ToDigit(), whose -1 result becomes -1.0 here. */
647 return (double) _PyUnicode_ToDigit(ch
);
651 int _PyUnicode_IsNumeric(Py_UNICODE ch
)
653 return _PyUnicode_ToNumeric(ch
) != -1.0;
656 #ifndef WANT_WCTYPE_FUNCTIONS
658 /* Returns 1 for Unicode characters having the bidirectional type
659 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
/* The case labels below are the code points this function singles out; the
   enclosing switch and the return statements are not visible in this
   listing.
   NOTE(review): U+200B ZERO WIDTH SPACE is in the list, but current Unicode
   data gives it general category Cf and no White_Space property -- confirm
   whether it should remain here. */
661 int _PyUnicode_IsWhitespace(register const Py_UNICODE ch
)
664 case 0x0009: /* HORIZONTAL TABULATION */
665 case 0x000A: /* LINE FEED */
666 case 0x000B: /* VERTICAL TABULATION */
667 case 0x000C: /* FORM FEED */
668 case 0x000D: /* CARRIAGE RETURN */
669 case 0x001C: /* FILE SEPARATOR */
670 case 0x001D: /* GROUP SEPARATOR */
671 case 0x001E: /* RECORD SEPARATOR */
672 case 0x001F: /* UNIT SEPARATOR */
673 case 0x0020: /* SPACE */
674 case 0x0085: /* NEXT LINE */
675 case 0x00A0: /* NO-BREAK SPACE */
676 case 0x1680: /* OGHAM SPACE MARK */
677 case 0x2000: /* EN QUAD */
678 case 0x2001: /* EM QUAD */
679 case 0x2002: /* EN SPACE */
680 case 0x2003: /* EM SPACE */
681 case 0x2004: /* THREE-PER-EM SPACE */
682 case 0x2005: /* FOUR-PER-EM SPACE */
683 case 0x2006: /* SIX-PER-EM SPACE */
684 case 0x2007: /* FIGURE SPACE */
685 case 0x2008: /* PUNCTUATION SPACE */
686 case 0x2009: /* THIN SPACE */
687 case 0x200A: /* HAIR SPACE */
688 case 0x200B: /* ZERO WIDTH SPACE */
689 case 0x2028: /* LINE SEPARATOR */
690 case 0x2029: /* PARAGRAPH SEPARATOR */
691 case 0x202F: /* NARROW NO-BREAK SPACE */
692 case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
693 case 0x3000: /* IDEOGRAPHIC SPACE */
700 /* Returns 1 for Unicode characters having the category 'Ll', 0
703 int _PyUnicode_IsLowercase(Py_UNICODE ch
)
705 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
707 return (ctype
->flags
& LOWER_MASK
) != 0;
710 /* Returns 1 for Unicode characters having the category 'Lu', 0
713 int _PyUnicode_IsUppercase(Py_UNICODE ch
)
715 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
717 return (ctype
->flags
& UPPER_MASK
) != 0;
720 /* Returns the uppercase Unicode characters corresponding to ch or just
721 ch if no uppercase mapping is known. */
/* Fetches the type record for ch and loads its `upper` field into a signed
   int named delta; how delta is applied to ch is not visible in this
   listing. */
723 Py_UNICODE
_PyUnicode_ToUppercase(Py_UNICODE ch
)
725 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
726 int delta
= ctype
->upper
;
732 /* Returns the lowercase Unicode characters corresponding to ch or just
733 ch if no lowercase mapping is known. */
/* Fetches the type record for ch and loads its `lower` field into a signed
   int named delta; how delta is applied to ch is not visible in this
   listing. */
735 Py_UNICODE
_PyUnicode_ToLowercase(Py_UNICODE ch
)
737 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
738 int delta
= ctype
->lower
;
744 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
745 'Lo' or 'Lm', 0 otherwise. */
747 int _PyUnicode_IsAlpha(Py_UNICODE ch
)
749 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
751 return (ctype
->flags
& ALPHA_MASK
) != 0;
756 /* Export the interfaces using the wchar_t type for portability. */
759 int _PyUnicode_IsWhitespace(Py_UNICODE ch
)
764 int _PyUnicode_IsLowercase(Py_UNICODE ch
)
769 int _PyUnicode_IsUppercase(Py_UNICODE ch
)
774 Py_UNICODE
_PyUnicode_ToLowercase(Py_UNICODE ch
)
779 Py_UNICODE
_PyUnicode_ToUppercase(Py_UNICODE ch
)
784 int _PyUnicode_IsAlpha(Py_UNICODE ch
)