2 Unicode character type helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
5 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
7 Copyright (c) Corporation for National Research Initiatives.
12 #include "unicodeobject.h"
/* Bit masks tested against the `flags` member of a _PyUnicode_TypeRecord.
   Each mask occupies a distinct bit; the highest one (0x800) still fits in
   the record's `unsigned short flags` field. */
#define ALPHA_MASK 0x01
#define DECIMAL_MASK 0x02
#define DIGIT_MASK 0x04
#define LOWER_MASK 0x08
#define LINEBREAK_MASK 0x10
#define SPACE_MASK 0x20
#define TITLE_MASK 0x40
#define UPPER_MASK 0x80
#define XID_START_MASK 0x100
#define XID_CONTINUE_MASK 0x200
#define PRINTABLE_MASK 0x400
/* When set, the To*case functions consult this bit before using the
   upper/lower/title members of the record. */
#define NODELTA_MASK 0x800
28 const Py_UNICODE upper
;
29 const Py_UNICODE lower
;
30 const Py_UNICODE title
;
31 const unsigned char decimal
;
32 const unsigned char digit
;
33 const unsigned short flags
;
34 } _PyUnicode_TypeRecord
;
36 #include "unicodetype_db.h"
38 static const _PyUnicode_TypeRecord
*
39 gettyperecord(Py_UNICODE code
)
43 #ifdef Py_UNICODE_WIDE
49 index
= index1
[(code
>>SHIFT
)];
50 index
= index2
[(index
<<SHIFT
)+(code
&((1<<SHIFT
)-1))];
53 return &_PyUnicode_TypeRecords
[index
];
56 /* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or
57 type 'B', 0 otherwise. */
59 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch
)
62 case 0x000A: /* LINE FEED */
63 case 0x000D: /* CARRIAGE RETURN */
64 case 0x001C: /* FILE SEPARATOR */
65 case 0x001D: /* GROUP SEPARATOR */
66 case 0x001E: /* RECORD SEPARATOR */
67 case 0x0085: /* NEXT LINE */
68 case 0x2028: /* LINE SEPARATOR */
69 case 0x2029: /* PARAGRAPH SEPARATOR */
76 /* Returns the titlecase Unicode characters corresponding to ch or just
77 ch if no titlecase mapping is known. */
79 Py_UNICODE
_PyUnicode_ToTitlecase(register Py_UNICODE ch
)
81 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
82 int delta
= ctype
->title
;
84 if (ctype
->flags
& NODELTA_MASK
)
93 /* Returns 1 for Unicode characters having the category 'Lt', 0
96 int _PyUnicode_IsTitlecase(Py_UNICODE ch
)
98 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
100 return (ctype
->flags
& TITLE_MASK
) != 0;
103 /* Returns 1 for Unicode characters having the XID_Start property, 0
106 int _PyUnicode_IsXidStart(Py_UNICODE ch
)
108 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
110 return (ctype
->flags
& XID_START_MASK
) != 0;
113 /* Returns 1 for Unicode characters having the XID_Continue property,
116 int _PyUnicode_IsXidContinue(Py_UNICODE ch
)
118 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
120 return (ctype
->flags
& XID_CONTINUE_MASK
) != 0;
123 /* Returns the integer decimal (0-9) for Unicode characters having
124 this property, -1 otherwise. */
126 int _PyUnicode_ToDecimalDigit(Py_UNICODE ch
)
128 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
130 return (ctype
->flags
& DECIMAL_MASK
) ? ctype
->decimal
: -1;
133 int _PyUnicode_IsDecimalDigit(Py_UNICODE ch
)
135 if (_PyUnicode_ToDecimalDigit(ch
) < 0)
140 /* Returns the integer digit (0-9) for Unicode characters having
141 this property, -1 otherwise. */
143 int _PyUnicode_ToDigit(Py_UNICODE ch
)
145 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
147 return (ctype
->flags
& DIGIT_MASK
) ? ctype
->digit
: -1;
150 int _PyUnicode_IsDigit(Py_UNICODE ch
)
152 if (_PyUnicode_ToDigit(ch
) < 0)
157 /* Returns the numeric value as double for Unicode characters having
158 this property, -1.0 otherwise. */
160 /* TODO: replace with unicodetype_db.h table */
162 double _PyUnicode_ToNumeric(Py_UNICODE ch
)
166 return (double) -1 / 2;
169 #ifdef Py_UNICODE_WIDE
182 #ifdef Py_UNICODE_WIDE
195 #ifdef Py_UNICODE_WIDE
200 return (double) 1 / 2;
202 return (double) 1 / 3;
204 #ifdef Py_UNICODE_WIDE
207 return (double) 1 / 4;
209 return (double) 1 / 5;
211 return (double) 1 / 6;
213 return (double) 1 / 8;
228 #ifdef Py_UNICODE_WIDE
247 #ifdef Py_UNICODE_WIDE
260 #ifdef Py_UNICODE_WIDE
267 return (double) 1000;
270 #ifdef Py_UNICODE_WIDE
274 return (double) 10000;
283 return (double) 11 / 2;
297 return (double) 13 / 2;
309 return (double) 15 / 2;
323 return (double) 17 / 2;
344 #ifdef Py_UNICODE_WIDE
354 #ifdef Py_UNICODE_WIDE
357 return (double) 2 / 3;
359 return (double) 2 / 5;
366 #ifdef Py_UNICODE_WIDE
372 #ifdef Py_UNICODE_WIDE
376 return (double) 2000;
378 return (double) 20000;
406 #ifdef Py_UNICODE_WIDE
411 return (double) 3 / 2;
413 #ifdef Py_UNICODE_WIDE
416 return (double) 3 / 4;
418 return (double) 3 / 5;
420 return (double) 3 / 8;
424 #ifdef Py_UNICODE_WIDE
429 #ifdef Py_UNICODE_WIDE
434 return (double) 3000;
436 return (double) 30000;
464 #ifdef Py_UNICODE_WIDE
469 return (double) 4 / 5;
472 #ifdef Py_UNICODE_WIDE
476 #ifdef Py_UNICODE_WIDE
480 return (double) 4000;
482 return (double) 40000;
508 #ifdef Py_UNICODE_WIDE
519 return (double) 5 / 2;
521 return (double) 5 / 6;
523 return (double) 5 / 8;
528 #ifdef Py_UNICODE_WIDE
543 #ifdef Py_UNICODE_WIDE
556 #ifdef Py_UNICODE_WIDE
562 return (double) 5000;
563 #ifdef Py_UNICODE_WIDE
567 return (double) 50000;
575 #ifdef Py_UNICODE_WIDE
580 #ifdef Py_UNICODE_WIDE
584 #ifdef Py_UNICODE_WIDE
588 return (double) 6000;
590 return (double) 60000;
598 #ifdef Py_UNICODE_WIDE
603 return (double) 7 / 2;
605 return (double) 7 / 8;
607 #ifdef Py_UNICODE_WIDE
611 #ifdef Py_UNICODE_WIDE
615 return (double) 7000;
617 return (double) 70000;
625 #ifdef Py_UNICODE_WIDE
630 #ifdef Py_UNICODE_WIDE
634 #ifdef Py_UNICODE_WIDE
638 return (double) 8000;
640 return (double) 80000;
648 #ifdef Py_UNICODE_WIDE
653 return (double) 9 / 2;
655 #ifdef Py_UNICODE_WIDE
659 #ifdef Py_UNICODE_WIDE
664 return (double) 9000;
666 return (double) 90000;
669 return (double) _PyUnicode_ToDigit(ch
);
673 int _PyUnicode_IsNumeric(Py_UNICODE ch
)
675 return _PyUnicode_ToNumeric(ch
) != -1.0;
678 /* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
680 All characters except those characters defined in the Unicode character
681 database as following categories are considered printable.
682 * Cc (Other, Control)
684 * Cs (Other, Surrogate)
685 * Co (Other, Private Use)
686 * Cn (Other, Not Assigned)
687 * Zl Separator, Line ('\u2028', LINE SEPARATOR)
688 * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
689 * Zs (Separator, Space) other than ASCII space('\x20').
691 int _PyUnicode_IsPrintable(Py_UNICODE ch
)
693 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
695 return (ctype
->flags
& PRINTABLE_MASK
) != 0;
698 #ifndef WANT_WCTYPE_FUNCTIONS
700 /* Returns 1 for Unicode characters having the bidirectional type
701 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
703 int _PyUnicode_IsWhitespace(register const Py_UNICODE ch
)
706 case 0x0009: /* HORIZONTAL TABULATION */
707 case 0x000A: /* LINE FEED */
708 case 0x000B: /* VERTICAL TABULATION */
709 case 0x000C: /* FORM FEED */
710 case 0x000D: /* CARRIAGE RETURN */
711 case 0x001C: /* FILE SEPARATOR */
712 case 0x001D: /* GROUP SEPARATOR */
713 case 0x001E: /* RECORD SEPARATOR */
714 case 0x001F: /* UNIT SEPARATOR */
715 case 0x0020: /* SPACE */
716 case 0x0085: /* NEXT LINE */
717 case 0x00A0: /* NO-BREAK SPACE */
718 case 0x1680: /* OGHAM SPACE MARK */
719 case 0x2000: /* EN QUAD */
720 case 0x2001: /* EM QUAD */
721 case 0x2002: /* EN SPACE */
722 case 0x2003: /* EM SPACE */
723 case 0x2004: /* THREE-PER-EM SPACE */
724 case 0x2005: /* FOUR-PER-EM SPACE */
725 case 0x2006: /* SIX-PER-EM SPACE */
726 case 0x2007: /* FIGURE SPACE */
727 case 0x2008: /* PUNCTUATION SPACE */
728 case 0x2009: /* THIN SPACE */
729 case 0x200A: /* HAIR SPACE */
730 case 0x200B: /* ZERO WIDTH SPACE */
731 case 0x2028: /* LINE SEPARATOR */
732 case 0x2029: /* PARAGRAPH SEPARATOR */
733 case 0x202F: /* NARROW NO-BREAK SPACE */
734 case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
735 case 0x3000: /* IDEOGRAPHIC SPACE */
742 /* Returns 1 for Unicode characters having the category 'Ll', 0
745 int _PyUnicode_IsLowercase(Py_UNICODE ch
)
747 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
749 return (ctype
->flags
& LOWER_MASK
) != 0;
752 /* Returns 1 for Unicode characters having the category 'Lu', 0
755 int _PyUnicode_IsUppercase(Py_UNICODE ch
)
757 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
759 return (ctype
->flags
& UPPER_MASK
) != 0;
762 /* Returns the uppercase Unicode characters corresponding to ch or just
763 ch if no uppercase mapping is known. */
765 Py_UNICODE
_PyUnicode_ToUppercase(Py_UNICODE ch
)
767 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
768 int delta
= ctype
->upper
;
769 if (ctype
->flags
& NODELTA_MASK
)
776 /* Returns the lowercase Unicode characters corresponding to ch or just
777 ch if no lowercase mapping is known. */
779 Py_UNICODE
_PyUnicode_ToLowercase(Py_UNICODE ch
)
781 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
782 int delta
= ctype
->lower
;
783 if (ctype
->flags
& NODELTA_MASK
)
790 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
791 'Lo' or 'Lm', 0 otherwise. */
793 int _PyUnicode_IsAlpha(Py_UNICODE ch
)
795 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
797 return (ctype
->flags
& ALPHA_MASK
) != 0;
802 /* Export the interfaces using the wchar_t type for portability
805 int _PyUnicode_IsWhitespace(Py_UNICODE ch
)
810 int _PyUnicode_IsLowercase(Py_UNICODE ch
)
815 int _PyUnicode_IsUppercase(Py_UNICODE ch
)
820 Py_UNICODE
_PyUnicode_ToLowercase(Py_UNICODE ch
)
825 Py_UNICODE
_PyUnicode_ToUppercase(Py_UNICODE ch
)
830 int _PyUnicode_IsAlpha(Py_UNICODE ch
)