Objects/unicodectype.c

   1 /*
   2    Unicode character type helpers.
   3
   4    Written by Marc-Andre Lemburg (mal@lemburg.com).
   5    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   6
   7    Copyright (c) Corporation for National Research Initiatives.
   8
   9 */
  10
  11 #include "Python.h"
  12 #include "unicodeobject.h"
  13
  14 #define ALPHA_MASK 0x01
  15 #define DECIMAL_MASK 0x02
  16 #define DIGIT_MASK 0x04
  17 #define LOWER_MASK 0x08
  18 #define LINEBREAK_MASK 0x10
  19 #define SPACE_MASK 0x20
  20 #define TITLE_MASK 0x40
  21 #define UPPER_MASK 0x80
  22
  23 typedef struct {
  24     const Py_UNICODE upper;
  25     const Py_UNICODE lower;
  26     const Py_UNICODE title;
  27     const unsigned char decimal;
  28     const unsigned char digit;
  29     const unsigned short flags;
  30 } _PyUnicode_TypeRecord;
  31
  32 #include "unicodetype_db.h"
  33
  34 static const _PyUnicode_TypeRecord *
  35 gettyperecord(Py_UNICODE code)
  36 {
  37     int index;
  38
  39 #ifdef Py_UNICODE_WIDE
  40     if (code >= 0x110000)
  41         index = 0;
  42     else
  43 #endif
  44     {
  45         index = index1[(code>>SHIFT)];
  46         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
  47     }
  48
  49     return &_PyUnicode_TypeRecords[index];
  50 }
  51
  52 /* Returns 1 for Unicode characters having the category 'Zl', 'Zp' or
  53    type 'B', 0 otherwise. */
  54
  55 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
  56 {
  57     switch (ch) {
  58     case 0x000A: /* LINE FEED */
  59     case 0x000D: /* CARRIAGE RETURN */
  60     case 0x001C: /* FILE SEPARATOR */
  61     case 0x001D: /* GROUP SEPARATOR */
  62     case 0x001E: /* RECORD SEPARATOR */
  63     case 0x0085: /* NEXT LINE */
  64     case 0x2028: /* LINE SEPARATOR */
  65     case 0x2029: /* PARAGRAPH SEPARATOR */
  66         return 1;
  67     default:
  68         return 0;
  69     }
  70 }
  71
  72 /* Returns the titlecase Unicode characters corresponding to ch or just
  73    ch if no titlecase mapping is known. */
  74
  75 Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
  76 {
  77     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
  78     int delta;
  79
  80     if (ctype->title)
  81         delta = ctype->title;
  82     else
  83         delta = ctype->upper;
  84
  85     if (delta >= 32768)
  86             delta -= 65536;
  87
  88     return ch + delta;
  89 }
  90
  91 /* Returns 1 for Unicode characters having the category 'Lt', 0
  92    otherwise. */
  93
  94 int _PyUnicode_IsTitlecase(Py_UNICODE ch)
  95 {
  96     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
  97
  98     return (ctype->flags & TITLE_MASK) != 0;
  99 }
 100
 101 /* Returns the integer decimal (0-9) for Unicode characters having
 102    this property, -1 otherwise. */
 103
 104 int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
 105 {
 106     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 107
 108     return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
 109 }
 110
 111 int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
 112 {
 113     if (_PyUnicode_ToDecimalDigit(ch) < 0)
 114         return 0;
 115     return 1;
 116 }
 117
 118 /* Returns the integer digit (0-9) for Unicode characters having
 119    this property, -1 otherwise. */
 120
 121 int _PyUnicode_ToDigit(Py_UNICODE ch)
 122 {
 123     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 124
 125     return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
 126 }
 127
 128 int _PyUnicode_IsDigit(Py_UNICODE ch)
 129 {
 130     if (_PyUnicode_ToDigit(ch) < 0)
 131         return 0;
 132     return 1;
 133 }
 134
 135 /* Returns the numeric value as double for Unicode characters having
 136    this property, -1.0 otherwise. */
 137
 138 /* TODO: replace with unicodetype_db.h table */
 139
 140 double _PyUnicode_ToNumeric(Py_UNICODE ch)
 141 {
 142     switch (ch) {
 143     case 0x3007:
 144         return (double) 0;
 145     case 0x09F4:
 146     case 0x215F:
 147     case 0x2160:
 148     case 0x2170:
 149     case 0x3021:
 150     case 0x3280:
 151         return (double) 1;
 152     case 0x00BD:
 153         return (double) 1 / 2;
 154     case 0x2153:
 155         return (double) 1 / 3;
 156     case 0x00BC:
 157         return (double) 1 / 4;
 158     case 0x2155:
 159         return (double) 1 / 5;
 160     case 0x2159:
 161         return (double) 1 / 6;
 162     case 0x215B:
 163         return (double) 1 / 8;
 164     case 0x0BF0:
 165     case 0x1372:
 166     case 0x2169:
 167     case 0x2179:
 168     case 0x2469:
 169     case 0x247D:
 170     case 0x2491:
 171     case 0x277F:
 172     case 0x2789:
 173     case 0x2793:
 174     case 0x3038:
 175     case 0x3289:
 176         return (double) 10;
 177     case 0x0BF1:
 178     case 0x137B:
 179     case 0x216D:
 180     case 0x217D:
 181         return (double) 100;
 182     case 0x0BF2:
 183     case 0x216F:
 184     case 0x217F:
 185     case 0x2180:
 186         return (double) 1000;
 187     case 0x137C:
 188     case 0x2182:
 189         return (double) 10000;
 190     case 0x216A:
 191     case 0x217A:
 192     case 0x246A:
 193     case 0x247E:
 194     case 0x2492:
 195         return (double) 11;
 196     case 0x216B:
 197     case 0x217B:
 198     case 0x246B:
 199     case 0x247F:
 200     case 0x2493:
 201         return (double) 12;
 202     case 0x246C:
 203     case 0x2480:
 204     case 0x2494:
 205         return (double) 13;
 206     case 0x246D:
 207     case 0x2481:
 208     case 0x2495:
 209         return (double) 14;
 210     case 0x246E:
 211     case 0x2482:
 212     case 0x2496:
 213         return (double) 15;
 214     case 0x09F9:
 215     case 0x246F:
 216     case 0x2483:
 217     case 0x2497:
 218         return (double) 16;
 219     case 0x16EE:
 220     case 0x2470:
 221     case 0x2484:
 222     case 0x2498:
 223         return (double) 17;
 224     case 0x16EF:
 225     case 0x2471:
 226     case 0x2485:
 227     case 0x2499:
 228         return (double) 18;
 229     case 0x16F0:
 230     case 0x2472:
 231     case 0x2486:
 232     case 0x249A:
 233         return (double) 19;
 234     case 0x09F5:
 235     case 0x2161:
 236     case 0x2171:
 237     case 0x3022:
 238     case 0x3281:
 239         return (double) 2;
 240     case 0x2154:
 241         return (double) 2 / 3;
 242     case 0x2156:
 243         return (double) 2 / 5;
 244     case 0x1373:
 245     case 0x2473:
 246     case 0x2487:
 247     case 0x249B:
 248     case 0x3039:
 249         return (double) 20;
 250     case 0x09F6:
 251     case 0x2162:
 252     case 0x2172:
 253     case 0x3023:
 254     case 0x3282:
 255         return (double) 3;
 256     case 0x00BE:
 257         return (double) 3 / 4;
 258     case 0x2157:
 259         return (double) 3 / 5;
 260     case 0x215C:
 261         return (double) 3 / 8;
 262     case 0x1374:
 263     case 0x303A:
 264         return (double) 30;
 265     case 0x09F7:
 266     case 0x2163:
 267     case 0x2173:
 268     case 0x3024:
 269     case 0x3283:
 270         return (double) 4;
 271     case 0x2158:
 272         return (double) 4 / 5;
 273     case 0x1375:
 274         return (double) 40;
 275     case 0x2164:
 276     case 0x2174:
 277     case 0x3025:
 278     case 0x3284:
 279         return (double) 5;
 280     case 0x215A:
 281         return (double) 5 / 6;
 282     case 0x215D:
 283         return (double) 5 / 8;
 284     case 0x1376:
 285     case 0x216C:
 286     case 0x217C:
 287         return (double) 50;
 288     case 0x216E:
 289     case 0x217E:
 290         return (double) 500;
 291     case 0x2181:
 292         return (double) 5000;
 293     case 0x2165:
 294     case 0x2175:
 295     case 0x3026:
 296     case 0x3285:
 297         return (double) 6;
 298     case 0x1377:
 299         return (double) 60;
 300     case 0x2166:
 301     case 0x2176:
 302     case 0x3027:
 303     case 0x3286:
 304         return (double) 7;
 305     case 0x215E:
 306         return (double) 7 / 8;
 307     case 0x1378:
 308         return (double) 70;
 309     case 0x2167:
 310     case 0x2177:
 311     case 0x3028:
 312     case 0x3287:
 313         return (double) 8;
 314     case 0x1379:
 315         return (double) 80;
 316     case 0x2168:
 317     case 0x2178:
 318     case 0x3029:
 319     case 0x3288:
 320         return (double) 9;
 321     case 0x137A:
 322         return (double) 90;
 323     default:
 324         return (double) _PyUnicode_ToDigit(ch);
 325     }
 326 }
 327
 328 int _PyUnicode_IsNumeric(Py_UNICODE ch)
 329 {
 330     if (_PyUnicode_ToNumeric(ch) < 0.0)
 331         return 0;
 332     return 1;
 333 }
 334
 335 #ifndef WANT_WCTYPE_FUNCTIONS
 336
 337 /* Returns 1 for Unicode characters having the bidirectional type
 338    'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
 339
 340 int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
 341 {
 342     switch (ch) {
 343     case 0x0009: /* HORIZONTAL TABULATION */
 344     case 0x000A: /* LINE FEED */
 345     case 0x000B: /* VERTICAL TABULATION */
 346     case 0x000C: /* FORM FEED */
 347     case 0x000D: /* CARRIAGE RETURN */
 348     case 0x001C: /* FILE SEPARATOR */
 349     case 0x001D: /* GROUP SEPARATOR */
 350     case 0x001E: /* RECORD SEPARATOR */
 351     case 0x001F: /* UNIT SEPARATOR */
 352     case 0x0020: /* SPACE */
 353     case 0x0085: /* NEXT LINE */
 354     case 0x00A0: /* NO-BREAK SPACE */
 355     case 0x1680: /* OGHAM SPACE MARK */
 356     case 0x2000: /* EN QUAD */
 357     case 0x2001: /* EM QUAD */
 358     case 0x2002: /* EN SPACE */
 359     case 0x2003: /* EM SPACE */
 360     case 0x2004: /* THREE-PER-EM SPACE */
 361     case 0x2005: /* FOUR-PER-EM SPACE */
 362     case 0x2006: /* SIX-PER-EM SPACE */
 363     case 0x2007: /* FIGURE SPACE */
 364     case 0x2008: /* PUNCTUATION SPACE */
 365     case 0x2009: /* THIN SPACE */
 366     case 0x200A: /* HAIR SPACE */
 367     case 0x200B: /* ZERO WIDTH SPACE */
 368     case 0x2028: /* LINE SEPARATOR */
 369     case 0x2029: /* PARAGRAPH SEPARATOR */
 370     case 0x202F: /* NARROW NO-BREAK SPACE */
 371     case 0x205F: /* MEDIUM MATHEMATICAL SPACE */
 372     case 0x3000: /* IDEOGRAPHIC SPACE */
 373         return 1;
 374     default:
 375         return 0;
 376     }
 377 }
 378
 379 /* Returns 1 for Unicode characters having the category 'Ll', 0
 380    otherwise. */
 381
 382 int _PyUnicode_IsLowercase(Py_UNICODE ch)
 383 {
 384     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 385
 386     return (ctype->flags & LOWER_MASK) != 0;
 387 }
 388
 389 /* Returns 1 for Unicode characters having the category 'Lu', 0
 390    otherwise. */
 391
 392 int _PyUnicode_IsUppercase(Py_UNICODE ch)
 393 {
 394     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 395
 396     return (ctype->flags & UPPER_MASK) != 0;
 397 }
 398
 399 /* Returns the uppercase Unicode characters corresponding to ch or just
 400    ch if no uppercase mapping is known. */
 401
 402 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
 403 {
 404     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 405     int delta = ctype->upper;
 406     if (delta >= 32768)
 407             delta -= 65536;
 408     return ch + delta;
 409 }
 410
 411 /* Returns the lowercase Unicode characters corresponding to ch or just
 412    ch if no lowercase mapping is known. */
 413
 414 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
 415 {
 416     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 417     int delta = ctype->lower;
 418     if (delta >= 32768)
 419             delta -= 65536;
 420     return ch + delta;
 421 }
 422
 423 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
 424    'Lo' or 'Lm',  0 otherwise. */
 425
 426 int _PyUnicode_IsAlpha(Py_UNICODE ch)
 427 {
 428     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 429
 430     return (ctype->flags & ALPHA_MASK) != 0;
 431 }
 432
 433 #else
 434
 435 /* Export the interfaces using the wchar_t type for portability
 436    reasons:  */
 437
 438 int _PyUnicode_IsWhitespace(Py_UNICODE ch)
 439 {
 440     return iswspace(ch);
 441 }
 442
 443 int _PyUnicode_IsLowercase(Py_UNICODE ch)
 444 {
 445     return iswlower(ch);
 446 }
 447
 448 int _PyUnicode_IsUppercase(Py_UNICODE ch)
 449 {
 450     return iswupper(ch);
 451 }
 452
 453 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
 454 {
 455     return towlower(ch);
 456 }
 457
 458 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
 459 {
 460     return towupper(ch);
 461 }
 462
 463 int _PyUnicode_IsAlpha(Py_UNICODE ch)
 464 {
 465     return iswalpha(ch);
 466 }
 467
 468 #endif