2 Unicode character type helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
5 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
7 Copyright (c) Corporation for National Research Initiatives.
12 #include "unicodeobject.h"
14 #define ALPHA_MASK 0x01
15 #define DECIMAL_MASK 0x02
16 #define DIGIT_MASK 0x04
17 #define LOWER_MASK 0x08
18 #define LINEBREAK_MASK 0x10
19 #define SPACE_MASK 0x20
20 #define TITLE_MASK 0x40
21 #define UPPER_MASK 0x80
24 const Py_UNICODE upper
;
25 const Py_UNICODE lower
;
26 const Py_UNICODE title
;
27 const unsigned char decimal
;
28 const unsigned char digit
;
29 const unsigned short flags
;
30 } _PyUnicode_TypeRecord
;
32 #include "unicodetype_db.h"
34 static const _PyUnicode_TypeRecord
*
35 gettyperecord(Py_UNICODE code
)
39 #ifdef Py_UNICODE_WIDE
45 index
= index1
[(code
>>SHIFT
)];
46 index
= index2
[(index
<<SHIFT
)+(code
&((1<<SHIFT
)-1))];
49 return &_PyUnicode_TypeRecords
[index
];
52 /* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or
53 type 'B', 0 otherwise. */
55 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch
)
58 case 0x000A: /* LINE FEED */
59 case 0x000D: /* CARRIAGE RETURN */
60 case 0x001C: /* FILE SEPARATOR */
61 case 0x001D: /* GROUP SEPARATOR */
62 case 0x001E: /* RECORD SEPARATOR */
63 case 0x0085: /* NEXT LINE */
64 case 0x2028: /* LINE SEPARATOR */
65 case 0x2029: /* PARAGRAPH SEPARATOR */
72 /* Returns the titlecase Unicode characters corresponding to ch or just
73 ch if no titlecase mapping is known. */
75 Py_UNICODE
_PyUnicode_ToTitlecase(register Py_UNICODE ch
)
77 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
91 /* Returns 1 for Unicode characters having the category 'Lt', 0 otherwise. */
94 int _PyUnicode_IsTitlecase(Py_UNICODE ch
)
96 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
98 return (ctype
->flags
& TITLE_MASK
) != 0;
101 /* Returns the integer decimal (0-9) for Unicode characters having
102 this property, -1 otherwise. */
104 int _PyUnicode_ToDecimalDigit(Py_UNICODE ch
)
106 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
108 return (ctype
->flags
& DECIMAL_MASK
) ? ctype
->decimal
: -1;
111 int _PyUnicode_IsDecimalDigit(Py_UNICODE ch
)
113 if (_PyUnicode_ToDecimalDigit(ch
) < 0)
118 /* Returns the integer digit (0-9) for Unicode characters having
119 this property, -1 otherwise. */
121 int _PyUnicode_ToDigit(Py_UNICODE ch
)
123 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
125 return (ctype
->flags
& DIGIT_MASK
) ? ctype
->digit
: -1;
128 int _PyUnicode_IsDigit(Py_UNICODE ch
)
130 if (_PyUnicode_ToDigit(ch
) < 0)
135 /* Returns the numeric value as double for Unicode characters having
136 this property, -1.0 otherwise. */
138 /* TODO: replace with unicodetype_db.h table */
140 double _PyUnicode_ToNumeric(Py_UNICODE ch
)
153 return (double) 1 / 2;
155 return (double) 1 / 3;
157 return (double) 1 / 4;
159 return (double) 1 / 5;
161 return (double) 1 / 6;
163 return (double) 1 / 8;
186 return (double) 1000;
189 return (double) 10000;
241 return (double) 2 / 3;
243 return (double) 2 / 5;
257 return (double) 3 / 4;
259 return (double) 3 / 5;
261 return (double) 3 / 8;
272 return (double) 4 / 5;
281 return (double) 5 / 6;
283 return (double) 5 / 8;
292 return (double) 5000;
306 return (double) 7 / 8;
324 return (double) _PyUnicode_ToDigit(ch
);
328 int _PyUnicode_IsNumeric(Py_UNICODE ch
)
330 if (_PyUnicode_ToNumeric(ch
) < 0.0)
335 #ifndef WANT_WCTYPE_FUNCTIONS
337 /* Returns 1 for Unicode characters having the bidirectional type
338 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
340 int _PyUnicode_IsWhitespace(register const Py_UNICODE ch
)
343 case 0x0009: /* HORIZONTAL TABULATION */
344 case 0x000A: /* LINE FEED */
345 case 0x000B: /* VERTICAL TABULATION */
346 case 0x000C: /* FORM FEED */
347 case 0x000D: /* CARRIAGE RETURN */
348 case 0x001C: /* FILE SEPARATOR */
349 case 0x001D: /* GROUP SEPARATOR */
350 case 0x001E: /* RECORD SEPARATOR */
351 case 0x001F: /* UNIT SEPARATOR */
352 case 0x0020: /* SPACE */
353 case 0x0085: /* NEXT LINE */
354 case 0x00A0: /* NO-BREAK SPACE */
355 case 0x1680: /* OGHAM SPACE MARK */
356 case 0x2000: /* EN QUAD */
357 case 0x2001: /* EM QUAD */
358 case 0x2002: /* EN SPACE */
359 case 0x2003: /* EM SPACE */
360 case 0x2004: /* THREE-PER-EM SPACE */
361 case 0x2005: /* FOUR-PER-EM SPACE */
362 case 0x2006: /* SIX-PER-EM SPACE */
363 case 0x2007: /* FIGURE SPACE */
364 case 0x2008: /* PUNCTUATION SPACE */
365 case 0x2009: /* THIN SPACE */
366 case 0x200A: /* HAIR SPACE */
367 case 0x200B: /* ZERO WIDTH SPACE */
368 case 0x2028: /* LINE SEPARATOR */
369 case 0x2029: /* PARAGRAPH SEPARATOR */
370 case 0x202F: /* NARROW NO-BREAK SPACE */
371 case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
372 case 0x3000: /* IDEOGRAPHIC SPACE */
379 /* Returns 1 for Unicode characters having the category 'Ll', 0 otherwise. */
382 int _PyUnicode_IsLowercase(Py_UNICODE ch
)
384 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
386 return (ctype
->flags
& LOWER_MASK
) != 0;
389 /* Returns 1 for Unicode characters having the category 'Lu', 0 otherwise. */
392 int _PyUnicode_IsUppercase(Py_UNICODE ch
)
394 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
396 return (ctype
->flags
& UPPER_MASK
) != 0;
399 /* Returns the uppercase Unicode characters corresponding to ch or just
400 ch if no uppercase mapping is known. */
402 Py_UNICODE
_PyUnicode_ToUppercase(Py_UNICODE ch
)
404 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
405 int delta
= ctype
->upper
;
411 /* Returns the lowercase Unicode characters corresponding to ch or just
412 ch if no lowercase mapping is known. */
414 Py_UNICODE
_PyUnicode_ToLowercase(Py_UNICODE ch
)
416 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
417 int delta
= ctype
->lower
;
423 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
424 'Lo' or 'Lm', 0 otherwise. */
426 int _PyUnicode_IsAlpha(Py_UNICODE ch
)
428 const _PyUnicode_TypeRecord
*ctype
= gettyperecord(ch
);
430 return (ctype
->flags
& ALPHA_MASK
) != 0;
435 /* Export the interfaces using the wchar_t type for portability. */
438 int _PyUnicode_IsWhitespace(Py_UNICODE ch
)
443 int _PyUnicode_IsLowercase(Py_UNICODE ch
)
448 int _PyUnicode_IsUppercase(Py_UNICODE ch
)
453 Py_UNICODE
_PyUnicode_ToLowercase(Py_UNICODE ch
)
458 Py_UNICODE
_PyUnicode_ToUppercase(Py_UNICODE ch
)
463 int _PyUnicode_IsAlpha(Py_UNICODE ch
)