Merged revisions 78818 via svnmerge from
[python/dscho.git] / Objects / unicodectype.c
blob8c710e05b70159c94ba7ad2e1b3d8edd1033a176
/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/
11 #include "Python.h"
12 #include "unicodeobject.h"
/* Bit flags combined in the `flags` member of a _PyUnicode_TypeRecord.
   Each mask occupies its own bit, so they can be tested independently. */
#define ALPHA_MASK          0x01    /* tested by _PyUnicode_IsAlpha */
#define DECIMAL_MASK        0x02    /* `decimal` member is meaningful */
#define DIGIT_MASK          0x04    /* `digit` member is meaningful */
#define LOWER_MASK          0x08    /* tested by _PyUnicode_IsLowercase */
#define LINEBREAK_MASK      0x10
#define SPACE_MASK          0x20
#define TITLE_MASK          0x40    /* tested by _PyUnicode_IsTitlecase */
#define UPPER_MASK          0x80    /* tested by _PyUnicode_IsUppercase */
#define XID_START_MASK      0x100   /* tested by _PyUnicode_IsXidStart */
#define XID_CONTINUE_MASK   0x200   /* tested by _PyUnicode_IsXidContinue */
#define PRINTABLE_MASK      0x400   /* tested by _PyUnicode_IsPrintable */
#define NODELTA_MASK        0x800   /* case members hold code points, not deltas */
27 typedef struct {
28 const Py_UNICODE upper;
29 const Py_UNICODE lower;
30 const Py_UNICODE title;
31 const unsigned char decimal;
32 const unsigned char digit;
33 const unsigned short flags;
34 } _PyUnicode_TypeRecord;
36 #include "unicodetype_db.h"
38 static const _PyUnicode_TypeRecord *
39 gettyperecord(Py_UNICODE code)
41 int index;
43 #ifdef Py_UNICODE_WIDE
44 if (code >= 0x110000)
45 index = 0;
46 else
47 #endif
49 index = index1[(code>>SHIFT)];
50 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
53 return &_PyUnicode_TypeRecords[index];
56 /* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or
57 type 'B', 0 otherwise. */
59 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
61 switch (ch) {
62 case 0x000A: /* LINE FEED */
63 case 0x000D: /* CARRIAGE RETURN */
64 case 0x001C: /* FILE SEPARATOR */
65 case 0x001D: /* GROUP SEPARATOR */
66 case 0x001E: /* RECORD SEPARATOR */
67 case 0x0085: /* NEXT LINE */
68 case 0x2028: /* LINE SEPARATOR */
69 case 0x2029: /* PARAGRAPH SEPARATOR */
70 return 1;
71 default:
72 return 0;
76 /* Returns the titlecase Unicode characters corresponding to ch or just
77 ch if no titlecase mapping is known. */
79 Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
81 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
82 int delta = ctype->title;
84 if (ctype->flags & NODELTA_MASK)
85 return delta;
87 if (delta >= 32768)
88 delta -= 65536;
90 return ch + delta;
93 /* Returns 1 for Unicode characters having the category 'Lt', 0
94 otherwise. */
96 int _PyUnicode_IsTitlecase(Py_UNICODE ch)
98 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
100 return (ctype->flags & TITLE_MASK) != 0;
103 /* Returns 1 for Unicode characters having the XID_Start property, 0
104 otherwise. */
106 int _PyUnicode_IsXidStart(Py_UNICODE ch)
108 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
110 return (ctype->flags & XID_START_MASK) != 0;
113 /* Returns 1 for Unicode characters having the XID_Continue property,
114 0 otherwise. */
116 int _PyUnicode_IsXidContinue(Py_UNICODE ch)
118 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
120 return (ctype->flags & XID_CONTINUE_MASK) != 0;
123 /* Returns the integer decimal (0-9) for Unicode characters having
124 this property, -1 otherwise. */
126 int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
128 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
130 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
133 int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
135 if (_PyUnicode_ToDecimalDigit(ch) < 0)
136 return 0;
137 return 1;
140 /* Returns the integer digit (0-9) for Unicode characters having
141 this property, -1 otherwise. */
143 int _PyUnicode_ToDigit(Py_UNICODE ch)
145 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
147 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
150 int _PyUnicode_IsDigit(Py_UNICODE ch)
152 if (_PyUnicode_ToDigit(ch) < 0)
153 return 0;
154 return 1;
157 /* Returns the numeric value as double for Unicode characters having
158 this property, -1.0 otherwise. */
160 /* TODO: replace with unicodetype_db.h table */
162 double _PyUnicode_ToNumeric(Py_UNICODE ch)
164 switch (ch) {
165 case 0x0F33:
166 return (double) -1 / 2;
167 case 0x17F0:
168 case 0x3007:
169 #ifdef Py_UNICODE_WIDE
170 case 0x1018A:
171 #endif
172 return (double) 0;
173 case 0x09F4:
174 case 0x17F1:
175 case 0x215F:
176 case 0x2160:
177 case 0x2170:
178 case 0x3021:
179 case 0x3192:
180 case 0x3220:
181 case 0x3280:
182 #ifdef Py_UNICODE_WIDE
183 case 0x10107:
184 case 0x10142:
185 case 0x10158:
186 case 0x10159:
187 case 0x1015A:
188 case 0x10320:
189 case 0x103D1:
190 #endif
191 return (double) 1;
192 case 0x00BD:
193 case 0x0F2A:
194 case 0x2CFD:
195 #ifdef Py_UNICODE_WIDE
196 case 0x10141:
197 case 0x10175:
198 case 0x10176:
199 #endif
200 return (double) 1 / 2;
201 case 0x2153:
202 return (double) 1 / 3;
203 case 0x00BC:
204 #ifdef Py_UNICODE_WIDE
205 case 0x10140:
206 #endif
207 return (double) 1 / 4;
208 case 0x2155:
209 return (double) 1 / 5;
210 case 0x2159:
211 return (double) 1 / 6;
212 case 0x215B:
213 return (double) 1 / 8;
214 case 0x0BF0:
215 case 0x1372:
216 case 0x2169:
217 case 0x2179:
218 case 0x2469:
219 case 0x247D:
220 case 0x2491:
221 case 0x24FE:
222 case 0x277F:
223 case 0x2789:
224 case 0x2793:
225 case 0x3038:
226 case 0x3229:
227 case 0x3289:
228 #ifdef Py_UNICODE_WIDE
229 case 0x10110:
230 case 0x10149:
231 case 0x10150:
232 case 0x10157:
233 case 0x10160:
234 case 0x10161:
235 case 0x10162:
236 case 0x10163:
237 case 0x10164:
238 case 0x10322:
239 case 0x103D3:
240 case 0x10A44:
241 #endif
242 return (double) 10;
243 case 0x0BF1:
244 case 0x137B:
245 case 0x216D:
246 case 0x217D:
247 #ifdef Py_UNICODE_WIDE
248 case 0x10119:
249 case 0x1014B:
250 case 0x10152:
251 case 0x1016A:
252 case 0x103D5:
253 case 0x10A46:
254 #endif
255 return (double) 100;
256 case 0x0BF2:
257 case 0x216F:
258 case 0x217F:
259 case 0x2180:
260 #ifdef Py_UNICODE_WIDE
261 case 0x10122:
262 case 0x1014D:
263 case 0x10154:
264 case 0x10171:
265 case 0x10A47:
266 #endif
267 return (double) 1000;
268 case 0x137C:
269 case 0x2182:
270 #ifdef Py_UNICODE_WIDE
271 case 0x1012B:
272 case 0x10155:
273 #endif
274 return (double) 10000;
275 case 0x216A:
276 case 0x217A:
277 case 0x246A:
278 case 0x247E:
279 case 0x2492:
280 case 0x24EB:
281 return (double) 11;
282 case 0x0F2F:
283 return (double) 11 / 2;
284 case 0x216B:
285 case 0x217B:
286 case 0x246B:
287 case 0x247F:
288 case 0x2493:
289 case 0x24EC:
290 return (double) 12;
291 case 0x246C:
292 case 0x2480:
293 case 0x2494:
294 case 0x24ED:
295 return (double) 13;
296 case 0x0F30:
297 return (double) 13 / 2;
298 case 0x246D:
299 case 0x2481:
300 case 0x2495:
301 case 0x24EE:
302 return (double) 14;
303 case 0x246E:
304 case 0x2482:
305 case 0x2496:
306 case 0x24EF:
307 return (double) 15;
308 case 0x0F31:
309 return (double) 15 / 2;
310 case 0x09F9:
311 case 0x246F:
312 case 0x2483:
313 case 0x2497:
314 case 0x24F0:
315 return (double) 16;
316 case 0x16EE:
317 case 0x2470:
318 case 0x2484:
319 case 0x2498:
320 case 0x24F1:
321 return (double) 17;
322 case 0x0F32:
323 return (double) 17 / 2;
324 case 0x16EF:
325 case 0x2471:
326 case 0x2485:
327 case 0x2499:
328 case 0x24F2:
329 return (double) 18;
330 case 0x16F0:
331 case 0x2472:
332 case 0x2486:
333 case 0x249A:
334 case 0x24F3:
335 return (double) 19;
336 case 0x09F5:
337 case 0x17F2:
338 case 0x2161:
339 case 0x2171:
340 case 0x3022:
341 case 0x3193:
342 case 0x3221:
343 case 0x3281:
344 #ifdef Py_UNICODE_WIDE
345 case 0x10108:
346 case 0x1015B:
347 case 0x1015C:
348 case 0x1015D:
349 case 0x1015E:
350 case 0x103D2:
351 #endif
352 return (double) 2;
353 case 0x2154:
354 #ifdef Py_UNICODE_WIDE
355 case 0x10177:
356 #endif
357 return (double) 2 / 3;
358 case 0x2156:
359 return (double) 2 / 5;
360 case 0x1373:
361 case 0x2473:
362 case 0x2487:
363 case 0x249B:
364 case 0x24F4:
365 case 0x3039:
366 #ifdef Py_UNICODE_WIDE
367 case 0x10111:
368 case 0x103D4:
369 case 0x10A45:
370 #endif
371 return (double) 20;
372 #ifdef Py_UNICODE_WIDE
373 case 0x1011A:
374 return (double) 200;
375 case 0x10123:
376 return (double) 2000;
377 case 0x1012C:
378 return (double) 20000;
379 #endif
380 case 0x3251:
381 return (double) 21;
382 case 0x3252:
383 return (double) 22;
384 case 0x3253:
385 return (double) 23;
386 case 0x3254:
387 return (double) 24;
388 case 0x3255:
389 return (double) 25;
390 case 0x3256:
391 return (double) 26;
392 case 0x3257:
393 return (double) 27;
394 case 0x3258:
395 return (double) 28;
396 case 0x3259:
397 return (double) 29;
398 case 0x09F6:
399 case 0x17F3:
400 case 0x2162:
401 case 0x2172:
402 case 0x3023:
403 case 0x3194:
404 case 0x3222:
405 case 0x3282:
406 #ifdef Py_UNICODE_WIDE
407 case 0x10109:
408 #endif
409 return (double) 3;
410 case 0x0F2B:
411 return (double) 3 / 2;
412 case 0x00BE:
413 #ifdef Py_UNICODE_WIDE
414 case 0x10178:
415 #endif
416 return (double) 3 / 4;
417 case 0x2157:
418 return (double) 3 / 5;
419 case 0x215C:
420 return (double) 3 / 8;
421 case 0x1374:
422 case 0x303A:
423 case 0x325A:
424 #ifdef Py_UNICODE_WIDE
425 case 0x10112:
426 case 0x10165:
427 #endif
428 return (double) 30;
429 #ifdef Py_UNICODE_WIDE
430 case 0x1011B:
431 case 0x1016B:
432 return (double) 300;
433 case 0x10124:
434 return (double) 3000;
435 case 0x1012D:
436 return (double) 30000;
437 #endif
438 case 0x325B:
439 return (double) 31;
440 case 0x325C:
441 return (double) 32;
442 case 0x325D:
443 return (double) 33;
444 case 0x325E:
445 return (double) 34;
446 case 0x325F:
447 return (double) 35;
448 case 0x32B1:
449 return (double) 36;
450 case 0x32B2:
451 return (double) 37;
452 case 0x32B3:
453 return (double) 38;
454 case 0x32B4:
455 return (double) 39;
456 case 0x09F7:
457 case 0x17F4:
458 case 0x2163:
459 case 0x2173:
460 case 0x3024:
461 case 0x3195:
462 case 0x3223:
463 case 0x3283:
464 #ifdef Py_UNICODE_WIDE
465 case 0x1010A:
466 #endif
467 return (double) 4;
468 case 0x2158:
469 return (double) 4 / 5;
470 case 0x1375:
471 case 0x32B5:
472 #ifdef Py_UNICODE_WIDE
473 case 0x10113:
474 #endif
475 return (double) 40;
476 #ifdef Py_UNICODE_WIDE
477 case 0x1011C:
478 return (double) 400;
479 case 0x10125:
480 return (double) 4000;
481 case 0x1012E:
482 return (double) 40000;
483 #endif
484 case 0x32B6:
485 return (double) 41;
486 case 0x32B7:
487 return (double) 42;
488 case 0x32B8:
489 return (double) 43;
490 case 0x32B9:
491 return (double) 44;
492 case 0x32BA:
493 return (double) 45;
494 case 0x32BB:
495 return (double) 46;
496 case 0x32BC:
497 return (double) 47;
498 case 0x32BD:
499 return (double) 48;
500 case 0x32BE:
501 return (double) 49;
502 case 0x17F5:
503 case 0x2164:
504 case 0x2174:
505 case 0x3025:
506 case 0x3224:
507 case 0x3284:
508 #ifdef Py_UNICODE_WIDE
509 case 0x1010B:
510 case 0x10143:
511 case 0x10148:
512 case 0x1014F:
513 case 0x1015F:
514 case 0x10173:
515 case 0x10321:
516 #endif
517 return (double) 5;
518 case 0x0F2C:
519 return (double) 5 / 2;
520 case 0x215A:
521 return (double) 5 / 6;
522 case 0x215D:
523 return (double) 5 / 8;
524 case 0x1376:
525 case 0x216C:
526 case 0x217C:
527 case 0x32BF:
528 #ifdef Py_UNICODE_WIDE
529 case 0x10114:
530 case 0x10144:
531 case 0x1014A:
532 case 0x10151:
533 case 0x10166:
534 case 0x10167:
535 case 0x10168:
536 case 0x10169:
537 case 0x10174:
538 case 0x10323:
539 #endif
540 return (double) 50;
541 case 0x216E:
542 case 0x217E:
543 #ifdef Py_UNICODE_WIDE
544 case 0x1011D:
545 case 0x10145:
546 case 0x1014C:
547 case 0x10153:
548 case 0x1016C:
549 case 0x1016D:
550 case 0x1016E:
551 case 0x1016F:
552 case 0x10170:
553 #endif
554 return (double) 500;
555 case 0x2181:
556 #ifdef Py_UNICODE_WIDE
557 case 0x10126:
558 case 0x10146:
559 case 0x1014E:
560 case 0x10172:
561 #endif
562 return (double) 5000;
563 #ifdef Py_UNICODE_WIDE
564 case 0x1012F:
565 case 0x10147:
566 case 0x10156:
567 return (double) 50000;
568 #endif
569 case 0x17F6:
570 case 0x2165:
571 case 0x2175:
572 case 0x3026:
573 case 0x3225:
574 case 0x3285:
575 #ifdef Py_UNICODE_WIDE
576 case 0x1010C:
577 #endif
578 return (double) 6;
579 case 0x1377:
580 #ifdef Py_UNICODE_WIDE
581 case 0x10115:
582 #endif
583 return (double) 60;
584 #ifdef Py_UNICODE_WIDE
585 case 0x1011E:
586 return (double) 600;
587 case 0x10127:
588 return (double) 6000;
589 case 0x10130:
590 return (double) 60000;
591 #endif
592 case 0x17F7:
593 case 0x2166:
594 case 0x2176:
595 case 0x3027:
596 case 0x3226:
597 case 0x3286:
598 #ifdef Py_UNICODE_WIDE
599 case 0x1010D:
600 #endif
601 return (double) 7;
602 case 0x0F2D:
603 return (double) 7 / 2;
604 case 0x215E:
605 return (double) 7 / 8;
606 case 0x1378:
607 #ifdef Py_UNICODE_WIDE
608 case 0x10116:
609 #endif
610 return (double) 70;
611 #ifdef Py_UNICODE_WIDE
612 case 0x1011F:
613 return (double) 700;
614 case 0x10128:
615 return (double) 7000;
616 case 0x10131:
617 return (double) 70000;
618 #endif
619 case 0x17F8:
620 case 0x2167:
621 case 0x2177:
622 case 0x3028:
623 case 0x3227:
624 case 0x3287:
625 #ifdef Py_UNICODE_WIDE
626 case 0x1010E:
627 #endif
628 return (double) 8;
629 case 0x1379:
630 #ifdef Py_UNICODE_WIDE
631 case 0x10117:
632 #endif
633 return (double) 80;
634 #ifdef Py_UNICODE_WIDE
635 case 0x10120:
636 return (double) 800;
637 case 0x10129:
638 return (double) 8000;
639 case 0x10132:
640 return (double) 80000;
641 #endif
642 case 0x17F9:
643 case 0x2168:
644 case 0x2178:
645 case 0x3029:
646 case 0x3228:
647 case 0x3288:
648 #ifdef Py_UNICODE_WIDE
649 case 0x1010F:
650 #endif
651 return (double) 9;
652 case 0x0F2E:
653 return (double) 9 / 2;
654 case 0x137A:
655 #ifdef Py_UNICODE_WIDE
656 case 0x10118:
657 #endif
658 return (double) 90;
659 #ifdef Py_UNICODE_WIDE
660 case 0x10121:
661 case 0x1034A:
662 return (double) 900;
663 case 0x1012A:
664 return (double) 9000;
665 case 0x10133:
666 return (double) 90000;
667 #endif
668 default:
669 return (double) _PyUnicode_ToDigit(ch);
673 int _PyUnicode_IsNumeric(Py_UNICODE ch)
675 return _PyUnicode_ToNumeric(ch) != -1.0;
678 /* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
679 0 otherwise.
680 All characters except those characters defined in the Unicode character
681 database as following categories are considered printable.
682 * Cc (Other, Control)
683 * Cf (Other, Format)
684 * Cs (Other, Surrogate)
685 * Co (Other, Private Use)
686 * Cn (Other, Not Assigned)
687 * Zl Separator, Line ('\u2028', LINE SEPARATOR)
688 * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
689 * Zs (Separator, Space) other than ASCII space('\x20').
691 int _PyUnicode_IsPrintable(Py_UNICODE ch)
693 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
695 return (ctype->flags & PRINTABLE_MASK) != 0;
698 #ifndef WANT_WCTYPE_FUNCTIONS
700 /* Returns 1 for Unicode characters having the bidirectional type
701 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
703 int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
705 switch (ch) {
706 case 0x0009: /* HORIZONTAL TABULATION */
707 case 0x000A: /* LINE FEED */
708 case 0x000B: /* VERTICAL TABULATION */
709 case 0x000C: /* FORM FEED */
710 case 0x000D: /* CARRIAGE RETURN */
711 case 0x001C: /* FILE SEPARATOR */
712 case 0x001D: /* GROUP SEPARATOR */
713 case 0x001E: /* RECORD SEPARATOR */
714 case 0x001F: /* UNIT SEPARATOR */
715 case 0x0020: /* SPACE */
716 case 0x0085: /* NEXT LINE */
717 case 0x00A0: /* NO-BREAK SPACE */
718 case 0x1680: /* OGHAM SPACE MARK */
719 case 0x2000: /* EN QUAD */
720 case 0x2001: /* EM QUAD */
721 case 0x2002: /* EN SPACE */
722 case 0x2003: /* EM SPACE */
723 case 0x2004: /* THREE-PER-EM SPACE */
724 case 0x2005: /* FOUR-PER-EM SPACE */
725 case 0x2006: /* SIX-PER-EM SPACE */
726 case 0x2007: /* FIGURE SPACE */
727 case 0x2008: /* PUNCTUATION SPACE */
728 case 0x2009: /* THIN SPACE */
729 case 0x200A: /* HAIR SPACE */
730 case 0x200B: /* ZERO WIDTH SPACE */
731 case 0x2028: /* LINE SEPARATOR */
732 case 0x2029: /* PARAGRAPH SEPARATOR */
733 case 0x202F: /* NARROW NO-BREAK SPACE */
734 case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
735 case 0x3000: /* IDEOGRAPHIC SPACE */
736 return 1;
737 default:
738 return 0;
742 /* Returns 1 for Unicode characters having the category 'Ll', 0
743 otherwise. */
745 int _PyUnicode_IsLowercase(Py_UNICODE ch)
747 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
749 return (ctype->flags & LOWER_MASK) != 0;
752 /* Returns 1 for Unicode characters having the category 'Lu', 0
753 otherwise. */
755 int _PyUnicode_IsUppercase(Py_UNICODE ch)
757 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
759 return (ctype->flags & UPPER_MASK) != 0;
762 /* Returns the uppercase Unicode characters corresponding to ch or just
763 ch if no uppercase mapping is known. */
765 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
767 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
768 int delta = ctype->upper;
769 if (ctype->flags & NODELTA_MASK)
770 return delta;
771 if (delta >= 32768)
772 delta -= 65536;
773 return ch + delta;
776 /* Returns the lowercase Unicode characters corresponding to ch or just
777 ch if no lowercase mapping is known. */
779 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
781 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
782 int delta = ctype->lower;
783 if (ctype->flags & NODELTA_MASK)
784 return delta;
785 if (delta >= 32768)
786 delta -= 65536;
787 return ch + delta;
790 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
791 'Lo' or 'Lm', 0 otherwise. */
793 int _PyUnicode_IsAlpha(Py_UNICODE ch)
795 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
797 return (ctype->flags & ALPHA_MASK) != 0;
800 #else
802 /* Export the interfaces using the wchar_t type for portability
803 reasons: */
805 int _PyUnicode_IsWhitespace(Py_UNICODE ch)
807 return iswspace(ch);
810 int _PyUnicode_IsLowercase(Py_UNICODE ch)
812 return iswlower(ch);
815 int _PyUnicode_IsUppercase(Py_UNICODE ch)
817 return iswupper(ch);
820 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
822 return towlower(ch);
825 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
827 return towupper(ch);
830 int _PyUnicode_IsAlpha(Py_UNICODE ch)
832 return iswalpha(ch);
835 #endif