The _lsprof module could crash the interpreter if it was given an external
[python.git] / Objects / unicodectype.c
blobebfb5b701243d851956cd4ab1211d040239d3a0c
/*
   Unicode character type helpers.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

*/
11 #include "Python.h"
12 #include "unicodeobject.h"
/* Bit flags stored in the flags field of a _PyUnicode_TypeRecord. */
#define ALPHA_MASK 0x01         /* tested by _PyUnicode_IsAlpha() */
#define DECIMAL_MASK 0x02       /* the record's decimal field is meaningful */
#define DIGIT_MASK 0x04         /* the record's digit field is meaningful */
#define LOWER_MASK 0x08         /* tested by _PyUnicode_IsLowercase() */
#define LINEBREAK_MASK 0x10     /* _PyUnicode_IsLinebreak() uses a switch instead */
#define SPACE_MASK 0x20         /* _PyUnicode_IsWhitespace() uses a switch instead */
#define TITLE_MASK 0x40         /* tested by _PyUnicode_IsTitlecase() */
#define UPPER_MASK 0x80         /* tested by _PyUnicode_IsUppercase() */
#define NODELTA_MASK 0x100      /* case fields hold results, not offsets */
24 typedef struct {
25 const Py_UNICODE upper;
26 const Py_UNICODE lower;
27 const Py_UNICODE title;
28 const unsigned char decimal;
29 const unsigned char digit;
30 const unsigned short flags;
31 } _PyUnicode_TypeRecord;
33 #include "unicodetype_db.h"
35 static const _PyUnicode_TypeRecord *
36 gettyperecord(Py_UNICODE code)
38 int index;
40 #ifdef Py_UNICODE_WIDE
41 if (code >= 0x110000)
42 index = 0;
43 else
44 #endif
46 index = index1[(code>>SHIFT)];
47 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
50 return &_PyUnicode_TypeRecords[index];
53 /* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or
54 type 'B', 0 otherwise. */
56 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
58 switch (ch) {
59 case 0x000A: /* LINE FEED */
60 case 0x000D: /* CARRIAGE RETURN */
61 case 0x001C: /* FILE SEPARATOR */
62 case 0x001D: /* GROUP SEPARATOR */
63 case 0x001E: /* RECORD SEPARATOR */
64 case 0x0085: /* NEXT LINE */
65 case 0x2028: /* LINE SEPARATOR */
66 case 0x2029: /* PARAGRAPH SEPARATOR */
67 return 1;
68 default:
69 return 0;
73 /* Returns the titlecase Unicode characters corresponding to ch or just
74 ch if no titlecase mapping is known. */
76 Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
78 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
79 int delta;
81 if (ctype->title)
82 delta = ctype->title;
83 else
84 delta = ctype->upper;
86 if (ctype->flags & NODELTA_MASK)
87 return delta;
89 if (delta >= 32768)
90 delta -= 65536;
92 return ch + delta;
95 /* Returns 1 for Unicode characters having the category 'Lt', 0
96 otherwise. */
98 int _PyUnicode_IsTitlecase(Py_UNICODE ch)
100 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
102 return (ctype->flags & TITLE_MASK) != 0;
105 /* Returns the integer decimal (0-9) for Unicode characters having
106 this property, -1 otherwise. */
108 int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
110 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
112 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
115 int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
117 if (_PyUnicode_ToDecimalDigit(ch) < 0)
118 return 0;
119 return 1;
122 /* Returns the integer digit (0-9) for Unicode characters having
123 this property, -1 otherwise. */
125 int _PyUnicode_ToDigit(Py_UNICODE ch)
127 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
129 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
132 int _PyUnicode_IsDigit(Py_UNICODE ch)
134 if (_PyUnicode_ToDigit(ch) < 0)
135 return 0;
136 return 1;
139 /* Returns the numeric value as double for Unicode characters having
140 this property, -1.0 otherwise. */
142 /* TODO: replace with unicodetype_db.h table */
144 double _PyUnicode_ToNumeric(Py_UNICODE ch)
146 switch (ch) {
147 case 0x0F33:
148 return (double) -1 / 2;
149 case 0x17F0:
150 case 0x3007:
151 #ifdef Py_UNICODE_WIDE
152 case 0x1018A:
153 #endif
154 return (double) 0;
155 case 0x09F4:
156 case 0x17F1:
157 case 0x215F:
158 case 0x2160:
159 case 0x2170:
160 case 0x3021:
161 case 0x3192:
162 case 0x3220:
163 case 0x3280:
164 #ifdef Py_UNICODE_WIDE
165 case 0x10107:
166 case 0x10142:
167 case 0x10158:
168 case 0x10159:
169 case 0x1015A:
170 case 0x10320:
171 case 0x103D1:
172 #endif
173 return (double) 1;
174 case 0x00BD:
175 case 0x0F2A:
176 case 0x2CFD:
177 #ifdef Py_UNICODE_WIDE
178 case 0x10141:
179 case 0x10175:
180 case 0x10176:
181 #endif
182 return (double) 1 / 2;
183 case 0x2153:
184 return (double) 1 / 3;
185 case 0x00BC:
186 #ifdef Py_UNICODE_WIDE
187 case 0x10140:
188 #endif
189 return (double) 1 / 4;
190 case 0x2155:
191 return (double) 1 / 5;
192 case 0x2159:
193 return (double) 1 / 6;
194 case 0x215B:
195 return (double) 1 / 8;
196 case 0x0BF0:
197 case 0x1372:
198 case 0x2169:
199 case 0x2179:
200 case 0x2469:
201 case 0x247D:
202 case 0x2491:
203 case 0x24FE:
204 case 0x277F:
205 case 0x2789:
206 case 0x2793:
207 case 0x3038:
208 case 0x3229:
209 case 0x3289:
210 #ifdef Py_UNICODE_WIDE
211 case 0x10110:
212 case 0x10149:
213 case 0x10150:
214 case 0x10157:
215 case 0x10160:
216 case 0x10161:
217 case 0x10162:
218 case 0x10163:
219 case 0x10164:
220 case 0x10322:
221 case 0x103D3:
222 case 0x10A44:
223 #endif
224 return (double) 10;
225 case 0x0BF1:
226 case 0x137B:
227 case 0x216D:
228 case 0x217D:
229 #ifdef Py_UNICODE_WIDE
230 case 0x10119:
231 case 0x1014B:
232 case 0x10152:
233 case 0x1016A:
234 case 0x103D5:
235 case 0x10A46:
236 #endif
237 return (double) 100;
238 case 0x0BF2:
239 case 0x216F:
240 case 0x217F:
241 case 0x2180:
242 #ifdef Py_UNICODE_WIDE
243 case 0x10122:
244 case 0x1014D:
245 case 0x10154:
246 case 0x10171:
247 case 0x10A47:
248 #endif
249 return (double) 1000;
250 case 0x137C:
251 case 0x2182:
252 #ifdef Py_UNICODE_WIDE
253 case 0x1012B:
254 case 0x10155:
255 #endif
256 return (double) 10000;
257 case 0x216A:
258 case 0x217A:
259 case 0x246A:
260 case 0x247E:
261 case 0x2492:
262 case 0x24EB:
263 return (double) 11;
264 case 0x0F2F:
265 return (double) 11 / 2;
266 case 0x216B:
267 case 0x217B:
268 case 0x246B:
269 case 0x247F:
270 case 0x2493:
271 case 0x24EC:
272 return (double) 12;
273 case 0x246C:
274 case 0x2480:
275 case 0x2494:
276 case 0x24ED:
277 return (double) 13;
278 case 0x0F30:
279 return (double) 13 / 2;
280 case 0x246D:
281 case 0x2481:
282 case 0x2495:
283 case 0x24EE:
284 return (double) 14;
285 case 0x246E:
286 case 0x2482:
287 case 0x2496:
288 case 0x24EF:
289 return (double) 15;
290 case 0x0F31:
291 return (double) 15 / 2;
292 case 0x09F9:
293 case 0x246F:
294 case 0x2483:
295 case 0x2497:
296 case 0x24F0:
297 return (double) 16;
298 case 0x16EE:
299 case 0x2470:
300 case 0x2484:
301 case 0x2498:
302 case 0x24F1:
303 return (double) 17;
304 case 0x0F32:
305 return (double) 17 / 2;
306 case 0x16EF:
307 case 0x2471:
308 case 0x2485:
309 case 0x2499:
310 case 0x24F2:
311 return (double) 18;
312 case 0x16F0:
313 case 0x2472:
314 case 0x2486:
315 case 0x249A:
316 case 0x24F3:
317 return (double) 19;
318 case 0x09F5:
319 case 0x17F2:
320 case 0x2161:
321 case 0x2171:
322 case 0x3022:
323 case 0x3193:
324 case 0x3221:
325 case 0x3281:
326 #ifdef Py_UNICODE_WIDE
327 case 0x10108:
328 case 0x1015B:
329 case 0x1015C:
330 case 0x1015D:
331 case 0x1015E:
332 case 0x103D2:
333 #endif
334 return (double) 2;
335 case 0x2154:
336 #ifdef Py_UNICODE_WIDE
337 case 0x10177:
338 #endif
339 return (double) 2 / 3;
340 case 0x2156:
341 return (double) 2 / 5;
342 case 0x1373:
343 case 0x2473:
344 case 0x2487:
345 case 0x249B:
346 case 0x24F4:
347 case 0x3039:
348 #ifdef Py_UNICODE_WIDE
349 case 0x10111:
350 case 0x103D4:
351 case 0x10A45:
352 #endif
353 return (double) 20;
354 #ifdef Py_UNICODE_WIDE
355 case 0x1011A:
356 return (double) 200;
357 case 0x10123:
358 return (double) 2000;
359 case 0x1012C:
360 return (double) 20000;
361 #endif
362 case 0x3251:
363 return (double) 21;
364 case 0x3252:
365 return (double) 22;
366 case 0x3253:
367 return (double) 23;
368 case 0x3254:
369 return (double) 24;
370 case 0x3255:
371 return (double) 25;
372 case 0x3256:
373 return (double) 26;
374 case 0x3257:
375 return (double) 27;
376 case 0x3258:
377 return (double) 28;
378 case 0x3259:
379 return (double) 29;
380 case 0x09F6:
381 case 0x17F3:
382 case 0x2162:
383 case 0x2172:
384 case 0x3023:
385 case 0x3194:
386 case 0x3222:
387 case 0x3282:
388 #ifdef Py_UNICODE_WIDE
389 case 0x10109:
390 #endif
391 return (double) 3;
392 case 0x0F2B:
393 return (double) 3 / 2;
394 case 0x00BE:
395 #ifdef Py_UNICODE_WIDE
396 case 0x10178:
397 #endif
398 return (double) 3 / 4;
399 case 0x2157:
400 return (double) 3 / 5;
401 case 0x215C:
402 return (double) 3 / 8;
403 case 0x1374:
404 case 0x303A:
405 case 0x325A:
406 #ifdef Py_UNICODE_WIDE
407 case 0x10112:
408 case 0x10165:
409 #endif
410 return (double) 30;
411 #ifdef Py_UNICODE_WIDE
412 case 0x1011B:
413 case 0x1016B:
414 return (double) 300;
415 case 0x10124:
416 return (double) 3000;
417 case 0x1012D:
418 return (double) 30000;
419 #endif
420 case 0x325B:
421 return (double) 31;
422 case 0x325C:
423 return (double) 32;
424 case 0x325D:
425 return (double) 33;
426 case 0x325E:
427 return (double) 34;
428 case 0x325F:
429 return (double) 35;
430 case 0x32B1:
431 return (double) 36;
432 case 0x32B2:
433 return (double) 37;
434 case 0x32B3:
435 return (double) 38;
436 case 0x32B4:
437 return (double) 39;
438 case 0x09F7:
439 case 0x17F4:
440 case 0x2163:
441 case 0x2173:
442 case 0x3024:
443 case 0x3195:
444 case 0x3223:
445 case 0x3283:
446 #ifdef Py_UNICODE_WIDE
447 case 0x1010A:
448 #endif
449 return (double) 4;
450 case 0x2158:
451 return (double) 4 / 5;
452 case 0x1375:
453 case 0x32B5:
454 #ifdef Py_UNICODE_WIDE
455 case 0x10113:
456 #endif
457 return (double) 40;
458 #ifdef Py_UNICODE_WIDE
459 case 0x1011C:
460 return (double) 400;
461 case 0x10125:
462 return (double) 4000;
463 case 0x1012E:
464 return (double) 40000;
465 #endif
466 case 0x32B6:
467 return (double) 41;
468 case 0x32B7:
469 return (double) 42;
470 case 0x32B8:
471 return (double) 43;
472 case 0x32B9:
473 return (double) 44;
474 case 0x32BA:
475 return (double) 45;
476 case 0x32BB:
477 return (double) 46;
478 case 0x32BC:
479 return (double) 47;
480 case 0x32BD:
481 return (double) 48;
482 case 0x32BE:
483 return (double) 49;
484 case 0x17F5:
485 case 0x2164:
486 case 0x2174:
487 case 0x3025:
488 case 0x3224:
489 case 0x3284:
490 #ifdef Py_UNICODE_WIDE
491 case 0x1010B:
492 case 0x10143:
493 case 0x10148:
494 case 0x1014F:
495 case 0x1015F:
496 case 0x10173:
497 case 0x10321:
498 #endif
499 return (double) 5;
500 case 0x0F2C:
501 return (double) 5 / 2;
502 case 0x215A:
503 return (double) 5 / 6;
504 case 0x215D:
505 return (double) 5 / 8;
506 case 0x1376:
507 case 0x216C:
508 case 0x217C:
509 case 0x32BF:
510 #ifdef Py_UNICODE_WIDE
511 case 0x10114:
512 case 0x10144:
513 case 0x1014A:
514 case 0x10151:
515 case 0x10166:
516 case 0x10167:
517 case 0x10168:
518 case 0x10169:
519 case 0x10174:
520 case 0x10323:
521 #endif
522 return (double) 50;
523 case 0x216E:
524 case 0x217E:
525 #ifdef Py_UNICODE_WIDE
526 case 0x1011D:
527 case 0x10145:
528 case 0x1014C:
529 case 0x10153:
530 case 0x1016C:
531 case 0x1016D:
532 case 0x1016E:
533 case 0x1016F:
534 case 0x10170:
535 #endif
536 return (double) 500;
537 case 0x2181:
538 #ifdef Py_UNICODE_WIDE
539 case 0x10126:
540 case 0x10146:
541 case 0x1014E:
542 case 0x10172:
543 #endif
544 return (double) 5000;
545 #ifdef Py_UNICODE_WIDE
546 case 0x1012F:
547 case 0x10147:
548 case 0x10156:
549 return (double) 50000;
550 #endif
551 case 0x17F6:
552 case 0x2165:
553 case 0x2175:
554 case 0x3026:
555 case 0x3225:
556 case 0x3285:
557 #ifdef Py_UNICODE_WIDE
558 case 0x1010C:
559 #endif
560 return (double) 6;
561 case 0x1377:
562 #ifdef Py_UNICODE_WIDE
563 case 0x10115:
564 #endif
565 return (double) 60;
566 #ifdef Py_UNICODE_WIDE
567 case 0x1011E:
568 return (double) 600;
569 case 0x10127:
570 return (double) 6000;
571 case 0x10130:
572 return (double) 60000;
573 #endif
574 case 0x17F7:
575 case 0x2166:
576 case 0x2176:
577 case 0x3027:
578 case 0x3226:
579 case 0x3286:
580 #ifdef Py_UNICODE_WIDE
581 case 0x1010D:
582 #endif
583 return (double) 7;
584 case 0x0F2D:
585 return (double) 7 / 2;
586 case 0x215E:
587 return (double) 7 / 8;
588 case 0x1378:
589 #ifdef Py_UNICODE_WIDE
590 case 0x10116:
591 #endif
592 return (double) 70;
593 #ifdef Py_UNICODE_WIDE
594 case 0x1011F:
595 return (double) 700;
596 case 0x10128:
597 return (double) 7000;
598 case 0x10131:
599 return (double) 70000;
600 #endif
601 case 0x17F8:
602 case 0x2167:
603 case 0x2177:
604 case 0x3028:
605 case 0x3227:
606 case 0x3287:
607 #ifdef Py_UNICODE_WIDE
608 case 0x1010E:
609 #endif
610 return (double) 8;
611 case 0x1379:
612 #ifdef Py_UNICODE_WIDE
613 case 0x10117:
614 #endif
615 return (double) 80;
616 #ifdef Py_UNICODE_WIDE
617 case 0x10120:
618 return (double) 800;
619 case 0x10129:
620 return (double) 8000;
621 case 0x10132:
622 return (double) 80000;
623 #endif
624 case 0x17F9:
625 case 0x2168:
626 case 0x2178:
627 case 0x3029:
628 case 0x3228:
629 case 0x3288:
630 #ifdef Py_UNICODE_WIDE
631 case 0x1010F:
632 #endif
633 return (double) 9;
634 case 0x0F2E:
635 return (double) 9 / 2;
636 case 0x137A:
637 #ifdef Py_UNICODE_WIDE
638 case 0x10118:
639 #endif
640 return (double) 90;
641 #ifdef Py_UNICODE_WIDE
642 case 0x10121:
643 case 0x1034A:
644 return (double) 900;
645 case 0x1012A:
646 return (double) 9000;
647 case 0x10133:
648 return (double) 90000;
649 #endif
650 default:
651 return (double) _PyUnicode_ToDigit(ch);
655 int _PyUnicode_IsNumeric(Py_UNICODE ch)
657 return _PyUnicode_ToNumeric(ch) != -1.0;
660 #ifndef WANT_WCTYPE_FUNCTIONS
662 /* Returns 1 for Unicode characters having the bidirectional type
663 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
665 int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
667 switch (ch) {
668 case 0x0009: /* HORIZONTAL TABULATION */
669 case 0x000A: /* LINE FEED */
670 case 0x000B: /* VERTICAL TABULATION */
671 case 0x000C: /* FORM FEED */
672 case 0x000D: /* CARRIAGE RETURN */
673 case 0x001C: /* FILE SEPARATOR */
674 case 0x001D: /* GROUP SEPARATOR */
675 case 0x001E: /* RECORD SEPARATOR */
676 case 0x001F: /* UNIT SEPARATOR */
677 case 0x0020: /* SPACE */
678 case 0x0085: /* NEXT LINE */
679 case 0x00A0: /* NO-BREAK SPACE */
680 case 0x1680: /* OGHAM SPACE MARK */
681 case 0x2000: /* EN QUAD */
682 case 0x2001: /* EM QUAD */
683 case 0x2002: /* EN SPACE */
684 case 0x2003: /* EM SPACE */
685 case 0x2004: /* THREE-PER-EM SPACE */
686 case 0x2005: /* FOUR-PER-EM SPACE */
687 case 0x2006: /* SIX-PER-EM SPACE */
688 case 0x2007: /* FIGURE SPACE */
689 case 0x2008: /* PUNCTUATION SPACE */
690 case 0x2009: /* THIN SPACE */
691 case 0x200A: /* HAIR SPACE */
692 case 0x200B: /* ZERO WIDTH SPACE */
693 case 0x2028: /* LINE SEPARATOR */
694 case 0x2029: /* PARAGRAPH SEPARATOR */
695 case 0x202F: /* NARROW NO-BREAK SPACE */
696 case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
697 case 0x3000: /* IDEOGRAPHIC SPACE */
698 return 1;
699 default:
700 return 0;
704 /* Returns 1 for Unicode characters having the category 'Ll', 0
705 otherwise. */
707 int _PyUnicode_IsLowercase(Py_UNICODE ch)
709 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
711 return (ctype->flags & LOWER_MASK) != 0;
714 /* Returns 1 for Unicode characters having the category 'Lu', 0
715 otherwise. */
717 int _PyUnicode_IsUppercase(Py_UNICODE ch)
719 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
721 return (ctype->flags & UPPER_MASK) != 0;
724 /* Returns the uppercase Unicode characters corresponding to ch or just
725 ch if no uppercase mapping is known. */
727 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
729 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
730 int delta = ctype->upper;
731 if (ctype->flags & NODELTA_MASK)
732 return delta;
733 if (delta >= 32768)
734 delta -= 65536;
735 return ch + delta;
738 /* Returns the lowercase Unicode characters corresponding to ch or just
739 ch if no lowercase mapping is known. */
741 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
743 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
744 int delta = ctype->lower;
745 if (ctype->flags & NODELTA_MASK)
746 return delta;
747 if (delta >= 32768)
748 delta -= 65536;
749 return ch + delta;
752 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
753 'Lo' or 'Lm', 0 otherwise. */
755 int _PyUnicode_IsAlpha(Py_UNICODE ch)
757 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
759 return (ctype->flags & ALPHA_MASK) != 0;
762 #else
/* Export the interfaces using the wchar_t type for portability
   reasons:  */
767 int _PyUnicode_IsWhitespace(Py_UNICODE ch)
769 return iswspace(ch);
772 int _PyUnicode_IsLowercase(Py_UNICODE ch)
774 return iswlower(ch);
777 int _PyUnicode_IsUppercase(Py_UNICODE ch)
779 return iswupper(ch);
782 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
784 return towlower(ch);
787 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
789 return towupper(ch);
792 int _PyUnicode_IsAlpha(Py_UNICODE ch)
794 return iswalpha(ch);
797 #endif