Issue #6629: Fix a data corruption issue in the new `io` package, which could
[python.git] / Objects / unicodectype.c
blob2afafb8da25c13bf067e70846d26aa51291c39a8
1 /*
2 Unicode character type helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
5 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
7 Copyright (c) Corporation for National Research Initiatives.
9 */
11 #include "Python.h"
12 #include "unicodeobject.h"
/* Bit masks for the `flags` member of _PyUnicode_TypeRecord.  Each mask
   occupies a distinct bit, so they can be OR-ed together freely. */
#define ALPHA_MASK      (1 << 0)    /* tested by _PyUnicode_IsAlpha */
#define DECIMAL_MASK    (1 << 1)    /* `decimal` member is meaningful */
#define DIGIT_MASK      (1 << 2)    /* `digit` member is meaningful */
#define LOWER_MASK      (1 << 3)    /* tested by _PyUnicode_IsLowercase */
#define LINEBREAK_MASK  (1 << 4)
#define SPACE_MASK      (1 << 5)
#define TITLE_MASK      (1 << 6)    /* tested by _PyUnicode_IsTitlecase */
#define UPPER_MASK      (1 << 7)    /* tested by _PyUnicode_IsUppercase */
#define NODELTA_MASK    (1 << 8)    /* upper/lower/title hold the mapped
                                       character itself, not an offset */
24 typedef struct {
25 const Py_UNICODE upper;
26 const Py_UNICODE lower;
27 const Py_UNICODE title;
28 const unsigned char decimal;
29 const unsigned char digit;
30 const unsigned short flags;
31 } _PyUnicode_TypeRecord;
33 #include "unicodetype_db.h"
35 static const _PyUnicode_TypeRecord *
36 gettyperecord(Py_UNICODE code)
38 int index;
40 #ifdef Py_UNICODE_WIDE
41 if (code >= 0x110000)
42 index = 0;
43 else
44 #endif
46 index = index1[(code>>SHIFT)];
47 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
50 return &_PyUnicode_TypeRecords[index];
53 /* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or
54 type 'B', 0 otherwise. */
56 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
58 switch (ch) {
59 case 0x000A: /* LINE FEED */
60 case 0x000D: /* CARRIAGE RETURN */
61 case 0x001C: /* FILE SEPARATOR */
62 case 0x001D: /* GROUP SEPARATOR */
63 case 0x001E: /* RECORD SEPARATOR */
64 case 0x0085: /* NEXT LINE */
65 case 0x2028: /* LINE SEPARATOR */
66 case 0x2029: /* PARAGRAPH SEPARATOR */
67 return 1;
68 default:
69 return 0;
73 /* Returns the titlecase Unicode characters corresponding to ch or just
74 ch if no titlecase mapping is known. */
76 Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
78 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
79 int delta = ctype->title;
81 if (ctype->flags & NODELTA_MASK)
82 return delta;
84 if (delta >= 32768)
85 delta -= 65536;
87 return ch + delta;
90 /* Returns 1 for Unicode characters having the category 'Lt', 0
91 otherwise. */
93 int _PyUnicode_IsTitlecase(Py_UNICODE ch)
95 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
97 return (ctype->flags & TITLE_MASK) != 0;
100 /* Returns the integer decimal (0-9) for Unicode characters having
101 this property, -1 otherwise. */
103 int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
105 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
107 return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
110 int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
112 if (_PyUnicode_ToDecimalDigit(ch) < 0)
113 return 0;
114 return 1;
117 /* Returns the integer digit (0-9) for Unicode characters having
118 this property, -1 otherwise. */
120 int _PyUnicode_ToDigit(Py_UNICODE ch)
122 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
124 return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
127 int _PyUnicode_IsDigit(Py_UNICODE ch)
129 if (_PyUnicode_ToDigit(ch) < 0)
130 return 0;
131 return 1;
134 /* Returns the numeric value as double for Unicode characters having
135 this property, -1.0 otherwise. */
137 /* TODO: replace with unicodetype_db.h table */
139 double _PyUnicode_ToNumeric(Py_UNICODE ch)
141 switch (ch) {
142 case 0x0F33:
143 return (double) -1 / 2;
144 case 0x17F0:
145 case 0x3007:
146 #ifdef Py_UNICODE_WIDE
147 case 0x1018A:
148 #endif
149 return (double) 0;
150 case 0x09F4:
151 case 0x17F1:
152 case 0x215F:
153 case 0x2160:
154 case 0x2170:
155 case 0x3021:
156 case 0x3192:
157 case 0x3220:
158 case 0x3280:
159 #ifdef Py_UNICODE_WIDE
160 case 0x10107:
161 case 0x10142:
162 case 0x10158:
163 case 0x10159:
164 case 0x1015A:
165 case 0x10320:
166 case 0x103D1:
167 #endif
168 return (double) 1;
169 case 0x00BD:
170 case 0x0F2A:
171 case 0x2CFD:
172 #ifdef Py_UNICODE_WIDE
173 case 0x10141:
174 case 0x10175:
175 case 0x10176:
176 #endif
177 return (double) 1 / 2;
178 case 0x2153:
179 return (double) 1 / 3;
180 case 0x00BC:
181 #ifdef Py_UNICODE_WIDE
182 case 0x10140:
183 #endif
184 return (double) 1 / 4;
185 case 0x2155:
186 return (double) 1 / 5;
187 case 0x2159:
188 return (double) 1 / 6;
189 case 0x215B:
190 return (double) 1 / 8;
191 case 0x0BF0:
192 case 0x1372:
193 case 0x2169:
194 case 0x2179:
195 case 0x2469:
196 case 0x247D:
197 case 0x2491:
198 case 0x24FE:
199 case 0x277F:
200 case 0x2789:
201 case 0x2793:
202 case 0x3038:
203 case 0x3229:
204 case 0x3289:
205 #ifdef Py_UNICODE_WIDE
206 case 0x10110:
207 case 0x10149:
208 case 0x10150:
209 case 0x10157:
210 case 0x10160:
211 case 0x10161:
212 case 0x10162:
213 case 0x10163:
214 case 0x10164:
215 case 0x10322:
216 case 0x103D3:
217 case 0x10A44:
218 #endif
219 return (double) 10;
220 case 0x0BF1:
221 case 0x137B:
222 case 0x216D:
223 case 0x217D:
224 #ifdef Py_UNICODE_WIDE
225 case 0x10119:
226 case 0x1014B:
227 case 0x10152:
228 case 0x1016A:
229 case 0x103D5:
230 case 0x10A46:
231 #endif
232 return (double) 100;
233 case 0x0BF2:
234 case 0x216F:
235 case 0x217F:
236 case 0x2180:
237 #ifdef Py_UNICODE_WIDE
238 case 0x10122:
239 case 0x1014D:
240 case 0x10154:
241 case 0x10171:
242 case 0x10A47:
243 #endif
244 return (double) 1000;
245 case 0x137C:
246 case 0x2182:
247 #ifdef Py_UNICODE_WIDE
248 case 0x1012B:
249 case 0x10155:
250 #endif
251 return (double) 10000;
252 case 0x216A:
253 case 0x217A:
254 case 0x246A:
255 case 0x247E:
256 case 0x2492:
257 case 0x24EB:
258 return (double) 11;
259 case 0x0F2F:
260 return (double) 11 / 2;
261 case 0x216B:
262 case 0x217B:
263 case 0x246B:
264 case 0x247F:
265 case 0x2493:
266 case 0x24EC:
267 return (double) 12;
268 case 0x246C:
269 case 0x2480:
270 case 0x2494:
271 case 0x24ED:
272 return (double) 13;
273 case 0x0F30:
274 return (double) 13 / 2;
275 case 0x246D:
276 case 0x2481:
277 case 0x2495:
278 case 0x24EE:
279 return (double) 14;
280 case 0x246E:
281 case 0x2482:
282 case 0x2496:
283 case 0x24EF:
284 return (double) 15;
285 case 0x0F31:
286 return (double) 15 / 2;
287 case 0x09F9:
288 case 0x246F:
289 case 0x2483:
290 case 0x2497:
291 case 0x24F0:
292 return (double) 16;
293 case 0x16EE:
294 case 0x2470:
295 case 0x2484:
296 case 0x2498:
297 case 0x24F1:
298 return (double) 17;
299 case 0x0F32:
300 return (double) 17 / 2;
301 case 0x16EF:
302 case 0x2471:
303 case 0x2485:
304 case 0x2499:
305 case 0x24F2:
306 return (double) 18;
307 case 0x16F0:
308 case 0x2472:
309 case 0x2486:
310 case 0x249A:
311 case 0x24F3:
312 return (double) 19;
313 case 0x09F5:
314 case 0x17F2:
315 case 0x2161:
316 case 0x2171:
317 case 0x3022:
318 case 0x3193:
319 case 0x3221:
320 case 0x3281:
321 #ifdef Py_UNICODE_WIDE
322 case 0x10108:
323 case 0x1015B:
324 case 0x1015C:
325 case 0x1015D:
326 case 0x1015E:
327 case 0x103D2:
328 #endif
329 return (double) 2;
330 case 0x2154:
331 #ifdef Py_UNICODE_WIDE
332 case 0x10177:
333 #endif
334 return (double) 2 / 3;
335 case 0x2156:
336 return (double) 2 / 5;
337 case 0x1373:
338 case 0x2473:
339 case 0x2487:
340 case 0x249B:
341 case 0x24F4:
342 case 0x3039:
343 #ifdef Py_UNICODE_WIDE
344 case 0x10111:
345 case 0x103D4:
346 case 0x10A45:
347 #endif
348 return (double) 20;
349 #ifdef Py_UNICODE_WIDE
350 case 0x1011A:
351 return (double) 200;
352 case 0x10123:
353 return (double) 2000;
354 case 0x1012C:
355 return (double) 20000;
356 #endif
357 case 0x3251:
358 return (double) 21;
359 case 0x3252:
360 return (double) 22;
361 case 0x3253:
362 return (double) 23;
363 case 0x3254:
364 return (double) 24;
365 case 0x3255:
366 return (double) 25;
367 case 0x3256:
368 return (double) 26;
369 case 0x3257:
370 return (double) 27;
371 case 0x3258:
372 return (double) 28;
373 case 0x3259:
374 return (double) 29;
375 case 0x09F6:
376 case 0x17F3:
377 case 0x2162:
378 case 0x2172:
379 case 0x3023:
380 case 0x3194:
381 case 0x3222:
382 case 0x3282:
383 #ifdef Py_UNICODE_WIDE
384 case 0x10109:
385 #endif
386 return (double) 3;
387 case 0x0F2B:
388 return (double) 3 / 2;
389 case 0x00BE:
390 #ifdef Py_UNICODE_WIDE
391 case 0x10178:
392 #endif
393 return (double) 3 / 4;
394 case 0x2157:
395 return (double) 3 / 5;
396 case 0x215C:
397 return (double) 3 / 8;
398 case 0x1374:
399 case 0x303A:
400 case 0x325A:
401 #ifdef Py_UNICODE_WIDE
402 case 0x10112:
403 case 0x10165:
404 #endif
405 return (double) 30;
406 #ifdef Py_UNICODE_WIDE
407 case 0x1011B:
408 case 0x1016B:
409 return (double) 300;
410 case 0x10124:
411 return (double) 3000;
412 case 0x1012D:
413 return (double) 30000;
414 #endif
415 case 0x325B:
416 return (double) 31;
417 case 0x325C:
418 return (double) 32;
419 case 0x325D:
420 return (double) 33;
421 case 0x325E:
422 return (double) 34;
423 case 0x325F:
424 return (double) 35;
425 case 0x32B1:
426 return (double) 36;
427 case 0x32B2:
428 return (double) 37;
429 case 0x32B3:
430 return (double) 38;
431 case 0x32B4:
432 return (double) 39;
433 case 0x09F7:
434 case 0x17F4:
435 case 0x2163:
436 case 0x2173:
437 case 0x3024:
438 case 0x3195:
439 case 0x3223:
440 case 0x3283:
441 #ifdef Py_UNICODE_WIDE
442 case 0x1010A:
443 #endif
444 return (double) 4;
445 case 0x2158:
446 return (double) 4 / 5;
447 case 0x1375:
448 case 0x32B5:
449 #ifdef Py_UNICODE_WIDE
450 case 0x10113:
451 #endif
452 return (double) 40;
453 #ifdef Py_UNICODE_WIDE
454 case 0x1011C:
455 return (double) 400;
456 case 0x10125:
457 return (double) 4000;
458 case 0x1012E:
459 return (double) 40000;
460 #endif
461 case 0x32B6:
462 return (double) 41;
463 case 0x32B7:
464 return (double) 42;
465 case 0x32B8:
466 return (double) 43;
467 case 0x32B9:
468 return (double) 44;
469 case 0x32BA:
470 return (double) 45;
471 case 0x32BB:
472 return (double) 46;
473 case 0x32BC:
474 return (double) 47;
475 case 0x32BD:
476 return (double) 48;
477 case 0x32BE:
478 return (double) 49;
479 case 0x17F5:
480 case 0x2164:
481 case 0x2174:
482 case 0x3025:
483 case 0x3224:
484 case 0x3284:
485 #ifdef Py_UNICODE_WIDE
486 case 0x1010B:
487 case 0x10143:
488 case 0x10148:
489 case 0x1014F:
490 case 0x1015F:
491 case 0x10173:
492 case 0x10321:
493 #endif
494 return (double) 5;
495 case 0x0F2C:
496 return (double) 5 / 2;
497 case 0x215A:
498 return (double) 5 / 6;
499 case 0x215D:
500 return (double) 5 / 8;
501 case 0x1376:
502 case 0x216C:
503 case 0x217C:
504 case 0x32BF:
505 #ifdef Py_UNICODE_WIDE
506 case 0x10114:
507 case 0x10144:
508 case 0x1014A:
509 case 0x10151:
510 case 0x10166:
511 case 0x10167:
512 case 0x10168:
513 case 0x10169:
514 case 0x10174:
515 case 0x10323:
516 #endif
517 return (double) 50;
518 case 0x216E:
519 case 0x217E:
520 #ifdef Py_UNICODE_WIDE
521 case 0x1011D:
522 case 0x10145:
523 case 0x1014C:
524 case 0x10153:
525 case 0x1016C:
526 case 0x1016D:
527 case 0x1016E:
528 case 0x1016F:
529 case 0x10170:
530 #endif
531 return (double) 500;
532 case 0x2181:
533 #ifdef Py_UNICODE_WIDE
534 case 0x10126:
535 case 0x10146:
536 case 0x1014E:
537 case 0x10172:
538 #endif
539 return (double) 5000;
540 #ifdef Py_UNICODE_WIDE
541 case 0x1012F:
542 case 0x10147:
543 case 0x10156:
544 return (double) 50000;
545 #endif
546 case 0x17F6:
547 case 0x2165:
548 case 0x2175:
549 case 0x3026:
550 case 0x3225:
551 case 0x3285:
552 #ifdef Py_UNICODE_WIDE
553 case 0x1010C:
554 #endif
555 return (double) 6;
556 case 0x1377:
557 #ifdef Py_UNICODE_WIDE
558 case 0x10115:
559 #endif
560 return (double) 60;
561 #ifdef Py_UNICODE_WIDE
562 case 0x1011E:
563 return (double) 600;
564 case 0x10127:
565 return (double) 6000;
566 case 0x10130:
567 return (double) 60000;
568 #endif
569 case 0x17F7:
570 case 0x2166:
571 case 0x2176:
572 case 0x3027:
573 case 0x3226:
574 case 0x3286:
575 #ifdef Py_UNICODE_WIDE
576 case 0x1010D:
577 #endif
578 return (double) 7;
579 case 0x0F2D:
580 return (double) 7 / 2;
581 case 0x215E:
582 return (double) 7 / 8;
583 case 0x1378:
584 #ifdef Py_UNICODE_WIDE
585 case 0x10116:
586 #endif
587 return (double) 70;
588 #ifdef Py_UNICODE_WIDE
589 case 0x1011F:
590 return (double) 700;
591 case 0x10128:
592 return (double) 7000;
593 case 0x10131:
594 return (double) 70000;
595 #endif
596 case 0x17F8:
597 case 0x2167:
598 case 0x2177:
599 case 0x3028:
600 case 0x3227:
601 case 0x3287:
602 #ifdef Py_UNICODE_WIDE
603 case 0x1010E:
604 #endif
605 return (double) 8;
606 case 0x1379:
607 #ifdef Py_UNICODE_WIDE
608 case 0x10117:
609 #endif
610 return (double) 80;
611 #ifdef Py_UNICODE_WIDE
612 case 0x10120:
613 return (double) 800;
614 case 0x10129:
615 return (double) 8000;
616 case 0x10132:
617 return (double) 80000;
618 #endif
619 case 0x17F9:
620 case 0x2168:
621 case 0x2178:
622 case 0x3029:
623 case 0x3228:
624 case 0x3288:
625 #ifdef Py_UNICODE_WIDE
626 case 0x1010F:
627 #endif
628 return (double) 9;
629 case 0x0F2E:
630 return (double) 9 / 2;
631 case 0x137A:
632 #ifdef Py_UNICODE_WIDE
633 case 0x10118:
634 #endif
635 return (double) 90;
636 #ifdef Py_UNICODE_WIDE
637 case 0x10121:
638 case 0x1034A:
639 return (double) 900;
640 case 0x1012A:
641 return (double) 9000;
642 case 0x10133:
643 return (double) 90000;
644 #endif
645 default:
646 return (double) _PyUnicode_ToDigit(ch);
650 int _PyUnicode_IsNumeric(Py_UNICODE ch)
652 return _PyUnicode_ToNumeric(ch) != -1.0;
655 #ifndef WANT_WCTYPE_FUNCTIONS
657 /* Returns 1 for Unicode characters having the bidirectional type
658 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
660 int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
662 switch (ch) {
663 case 0x0009: /* HORIZONTAL TABULATION */
664 case 0x000A: /* LINE FEED */
665 case 0x000B: /* VERTICAL TABULATION */
666 case 0x000C: /* FORM FEED */
667 case 0x000D: /* CARRIAGE RETURN */
668 case 0x001C: /* FILE SEPARATOR */
669 case 0x001D: /* GROUP SEPARATOR */
670 case 0x001E: /* RECORD SEPARATOR */
671 case 0x001F: /* UNIT SEPARATOR */
672 case 0x0020: /* SPACE */
673 case 0x0085: /* NEXT LINE */
674 case 0x00A0: /* NO-BREAK SPACE */
675 case 0x1680: /* OGHAM SPACE MARK */
676 case 0x2000: /* EN QUAD */
677 case 0x2001: /* EM QUAD */
678 case 0x2002: /* EN SPACE */
679 case 0x2003: /* EM SPACE */
680 case 0x2004: /* THREE-PER-EM SPACE */
681 case 0x2005: /* FOUR-PER-EM SPACE */
682 case 0x2006: /* SIX-PER-EM SPACE */
683 case 0x2007: /* FIGURE SPACE */
684 case 0x2008: /* PUNCTUATION SPACE */
685 case 0x2009: /* THIN SPACE */
686 case 0x200A: /* HAIR SPACE */
687 case 0x200B: /* ZERO WIDTH SPACE */
688 case 0x2028: /* LINE SEPARATOR */
689 case 0x2029: /* PARAGRAPH SEPARATOR */
690 case 0x202F: /* NARROW NO-BREAK SPACE */
691 case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
692 case 0x3000: /* IDEOGRAPHIC SPACE */
693 return 1;
694 default:
695 return 0;
699 /* Returns 1 for Unicode characters having the category 'Ll', 0
700 otherwise. */
702 int _PyUnicode_IsLowercase(Py_UNICODE ch)
704 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
706 return (ctype->flags & LOWER_MASK) != 0;
709 /* Returns 1 for Unicode characters having the category 'Lu', 0
710 otherwise. */
712 int _PyUnicode_IsUppercase(Py_UNICODE ch)
714 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
716 return (ctype->flags & UPPER_MASK) != 0;
719 /* Returns the uppercase Unicode characters corresponding to ch or just
720 ch if no uppercase mapping is known. */
722 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
724 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
725 int delta = ctype->upper;
726 if (ctype->flags & NODELTA_MASK)
727 return delta;
728 if (delta >= 32768)
729 delta -= 65536;
730 return ch + delta;
733 /* Returns the lowercase Unicode characters corresponding to ch or just
734 ch if no lowercase mapping is known. */
736 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
738 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
739 int delta = ctype->lower;
740 if (ctype->flags & NODELTA_MASK)
741 return delta;
742 if (delta >= 32768)
743 delta -= 65536;
744 return ch + delta;
747 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
748 'Lo' or 'Lm', 0 otherwise. */
750 int _PyUnicode_IsAlpha(Py_UNICODE ch)
752 const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
754 return (ctype->flags & ALPHA_MASK) != 0;
757 #else
759 /* Export the interfaces using the wchar_t type for portability
760 reasons: */
762 int _PyUnicode_IsWhitespace(Py_UNICODE ch)
764 return iswspace(ch);
767 int _PyUnicode_IsLowercase(Py_UNICODE ch)
769 return iswlower(ch);
772 int _PyUnicode_IsUppercase(Py_UNICODE ch)
774 return iswupper(ch);
777 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
779 return towlower(ch);
782 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
784 return towupper(ch);
787 int _PyUnicode_IsAlpha(Py_UNICODE ch)
789 return iswalpha(ch);
792 #endif