src/string/primitives.c

   1 /*
   2 Copyright (C) 2006-2009, Parrot Foundation.
   3 $Id$
   4
   5 =head1 NAME
   6
src/string/primitives.c - String Primitives
   8
   9 =head1 DESCRIPTION
  10
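This file collects together the string functions that call into the ICU
API, along with a few low-level helpers that do not depend on ICU
(escape-sequence decoding and quote stripping).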
  13
  14 =head2 Functions
  15
  16 =over 4
  17
  18 =cut
  19
  20 */
  21
  22 /* HEADERIZER HFILE: include/parrot/string_primitives.h */
  23
  24 #include "parrot/parrot.h"
  25 #if PARROT_HAS_ICU
  26 #  include <unicode/ucnv.h>
  27 #  include <unicode/utypes.h>
  28 #  include <unicode/uchar.h>
  29 #  include <unicode/ustring.h>
  30 #else
  31 #  include <ctype.h>
  32 #endif
  33
  34 /*
  35
  36 =item C<void string_set_data_directory(PARROT_INTERP, const char *dir)>
  37
  38 Set the directory where ICU finds its data files (encodings, locales,
  39 etc.).
  40
  41 =cut
  42
  43 */
  44
  45 PARROT_EXPORT
  46 void
  47 string_set_data_directory(PARROT_INTERP, ARGIN(const char *dir))
  48 {
  49     ASSERT_ARGS(string_set_data_directory)
  50 #if PARROT_HAS_ICU
  51     u_setDataDirectory(dir);
  52
  53     /* Since u_setDataDirectory doesn't have a result code, we'll spot
  54        check that everything is okay by making sure that '9' had decimal
  55        value 9. Using 57 rather than '9' so that the encoding of this
  56        source code file isn't an issue.... (Don't want to get bitten by
  57        EBCDIC.) */
  58
  59     if (!u_isdigit(57) || (u_charDigitValue(57) != 9))
  60         Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_ICU_ERROR,
  61             "string_set_data_directory: ICU data files not found"
  62             "(apparently) for directory [%s]", dir);
  63 #else
  64     UNUSED(dir);
  65
  66     Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_ICU_ERROR,
  67         "string_set_data_directory: parrot compiled without ICU support");
  68 #endif
  69 }
  70
  71 /*
  72
  73 =item C<Parrot_UInt4 string_unescape_one(PARROT_INTERP, UINTVAL *offset, const
  74 STRING *string)>
  75
  76 Unescape a single character. We assume that we're at the start of a
  77 sequence, right after the \.
  78
  79 =cut
  80
  81 */
  82
  83 PARROT_EXPORT
  84 Parrot_UInt4
  85 string_unescape_one(PARROT_INTERP, ARGMOD(UINTVAL *offset),
  86         ARGIN(const STRING *string))
  87 {
  88     ASSERT_ARGS(string_unescape_one)
  89     UINTVAL workchar  = 0;
  90     UINTVAL charcount = 0;
  91     const UINTVAL len = Parrot_str_byte_length(interp, string);
  92     const unsigned char * const buf = (unsigned char *)string->strstart;
  93
  94     /* Well, not right now */
  95     UINTVAL codepoint = buf[*offset];
  96     ++*offset;
  97
  98     switch (codepoint) {
  99       case 'x':
 100         codepoint = buf[*offset];
 101         if (codepoint >= '0' && codepoint <= '9') {
 102             workchar = codepoint - '0';
 103         }
 104         else if (codepoint >= 'a' && codepoint <= 'f') {
 105             workchar = codepoint - 'a' + 10;
 106         }
 107         else if (codepoint >= 'A' && codepoint <= 'F') {
 108             workchar = codepoint - 'A' + 10;
 109         }
 110         else if (codepoint == '{') {
 111             int i;
 112             ++*offset;
 113             workchar = 0;
 114             for (i = 0; i < 8 && *offset < len; ++i, ++*offset) {
 115                 codepoint = buf[*offset];
 116                 if (codepoint == '}') {
 117                     ++*offset;
 118                     return workchar;
 119                 }
 120                 workchar *= 16;
 121                 if (codepoint >= '0' && codepoint <= '9') {
 122                     workchar += codepoint - '0';
 123                 }
 124                 else if (codepoint >= 'a' && codepoint <= 'f') {
 125                     workchar += codepoint - 'a' + 10;
 126                 }
 127                 else if (codepoint >= 'A' && codepoint <= 'F') {
 128                     workchar += codepoint - 'A' + 10;
 129                 }
 130                 else {
 131                     Parrot_ex_throw_from_c_args(interp, NULL,
 132                             EXCEPTION_UNIMPLEMENTED,
 133                             "Illegal escape sequence inside {}");
 134                 }
 135             }
 136             if (*offset == len)
 137                 Parrot_ex_throw_from_c_args(interp, NULL,
 138                         EXCEPTION_UNIMPLEMENTED,
 139                         "Illegal escape sequence no '}'");
 140         }
 141         else {
 142             Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
 143                     "Illegal escape sequence in");
 144         }
 145
 146         ++*offset;
 147         if (*offset < len) {
 148             workchar *= 16;
 149             codepoint = buf[*offset];
 150             if (codepoint >= '0' && codepoint <= '9') {
 151                 workchar += codepoint - '0';
 152             }
 153             else if (codepoint >= 'a' && codepoint <= 'f') {
 154                 workchar += codepoint - 'a' + 10;
 155             }
 156             else if (codepoint >= 'A' && codepoint <= 'F') {
 157                 workchar += codepoint - 'A' + 10;
 158             }
 159             else {
 160                 return workchar;
 161             }
 162         }
 163         else {
 164             return workchar;
 165         }
 166         ++*offset;
 167         return workchar;
 168       case 'c':
 169         codepoint = buf[*offset];
 170         if (codepoint >= 'A' && codepoint <= 'Z') {
 171             workchar = codepoint - 'A' + 1;
 172         }
 173         else {
 174             Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_UNIMPLEMENTED,
 175                     "Illegal escape sequence");
 176         }
 177
 178         ++*offset;
 179         return workchar;
 180       case 'u':
 181         workchar = 0;
 182         for (charcount = 0; charcount < 4; charcount++) {
 183             if (*offset < len) {
 184                 workchar *= 16;
 185                 codepoint = buf[*offset];
 186                 if (codepoint >= '0' && codepoint <= '9') {
 187                     workchar += codepoint - '0';
 188                 }
 189                 else if (codepoint >= 'a' && codepoint <= 'f') {
 190                     workchar += codepoint - 'a' + 10;
 191                 }
 192                 else if (codepoint >= 'A' && codepoint <= 'F') {
 193                     workchar += codepoint - 'A' + 10;
 194                 }
 195                 else {
 196                     Parrot_ex_throw_from_c_args(interp, NULL,
 197                             EXCEPTION_UNIMPLEMENTED,
 198                             "Illegal escape sequence in uxxx escape");
 199                 }
 200             }
 201             else {
 202                 Parrot_ex_throw_from_c_args(interp, NULL,
 203                         EXCEPTION_UNIMPLEMENTED,
 204                         "Illegal escape sequence in uxxx escape - too short");
 205             }
 206
 207             ++*offset;
 208         }
 209         return workchar;
 210       case 'U':
 211         workchar = 0;
 212         for (charcount = 0; charcount < 8; charcount++) {
 213             if (*offset < len) {
 214                 workchar *= 16;
 215                 codepoint = buf[*offset];
 216                 if (codepoint >= '0' && codepoint <= '9') {
 217                     workchar += codepoint - '0';
 218                 }
 219                 else if (codepoint >= 'a' && codepoint <= 'f') {
 220                     workchar += codepoint - 'a' + 10;
 221                 }
 222                 else if (codepoint >= 'A' && codepoint <= 'F') {
 223                     workchar += codepoint - 'A' + 10;
 224                 }
 225                 else {
 226                     Parrot_ex_throw_from_c_args(interp, NULL,
 227                             EXCEPTION_UNIMPLEMENTED,
 228                             "Illegal escape sequence in Uxxx escape");
 229                 }
 230             }
 231             else {
 232                 Parrot_ex_throw_from_c_args(interp, NULL,
 233                         EXCEPTION_UNIMPLEMENTED,
 234                         "Illegal escape sequence in uxxx escape - too short");
 235             }
 236
 237             ++*offset;
 238         }
 239         return workchar;
 240       case '0':
 241       case '1':
 242       case '2':
 243       case '3':
 244       case '4':
 245       case '5':
 246       case '6':
 247       case '7':
 248         workchar = codepoint - '0';
 249         if (*offset < len) {
 250             workchar *= 8;
 251             codepoint = buf[*offset];
 252             if (codepoint >= '0' && codepoint <= '7') {
 253                 workchar += codepoint - '0';
 254             }
 255             else {
 256                 return workchar;
 257             }
 258         }
 259         else {
 260             return workchar;
 261         }
 262         ++*offset;
 263         if (*offset < len) {
 264             workchar *= 8;
 265             codepoint = buf[*offset];
 266             if (codepoint >= '0' && codepoint <= '7') {
 267                 workchar += codepoint - '0';
 268             }
 269             else {
 270                 return workchar;
 271             }
 272         }
 273         else {
 274             return workchar;
 275         }
 276         ++*offset;
 277         return workchar;
 278       case 'a':
 279         return 7; /* bell */
 280       case 'b':
 281         return 8; /* bs */
 282       case 't':
 283         return 9;
 284       case 'n':
 285         return 10;
 286       case 'v':
 287         return 11;
 288       case 'f':
 289         return 12;
 290       case 'r':
 291         return 13;
 292       case 'e':
 293         return 27;
 294       case 92: /* \ */
 295         return 92;
 296       case '"':
 297         return '"';
 298       default:
 299         return codepoint;  /* any not special return the char */
 300     }
 301 }
 302
 303 /*
 304
 305 =back
 306
 307 =head2 Character Property Functions
 308
 309 =over 4
 310
 311 =item C<INTVAL Parrot_char_digit_value(PARROT_INTERP, UINTVAL character)>
 312
 313 Returns the decimal digit value of the specified character if it is a decimal
 314 digit character. If not, then -1 is returned.
 315
 316 Note that as currently written, C<Parrot_char_digit_value()> can
 317 correctly return the decimal digit value of characters for which
 318 C<Parrot_char_is_digit()> returns false.
 319
 320 =cut
 321
 322 */
 323
 324 PARROT_EXPORT
 325 PARROT_CONST_FUNCTION
 326 INTVAL
 327 Parrot_char_digit_value(SHIM_INTERP, UINTVAL character)
 328 {
 329     ASSERT_ARGS(Parrot_char_digit_value)
 330 #if PARROT_HAS_ICU
 331     return u_charDigitValue(character);
 332 #else
 333     if ((character >= 0x30) && (character <= 0x39))
 334         return character - 0x30;
 335     return -1;
 336 #endif
 337 }
 338
 339 /*
 340
 341 =item C<char * str_dup_remove_quotes(const char *old)>
 342
 343 Duplicates a C string (minus the wrapping quotes).  Similar to strdup(),
 344 except it dies if it runs out of memory.
 345
 346 =cut
 347
 348 */
 349
 350 PARROT_EXPORT
 351 PARROT_MALLOC
 352 PARROT_CANNOT_RETURN_NULL
 353 char *
 354 str_dup_remove_quotes(ARGIN(const char *old))
 355 {
 356     ASSERT_ARGS(str_dup_remove_quotes)
 357     const size_t oldlen = strlen(old) + 1;
 358
 359     /* 2 for the beginning and ending quote chars */
 360     const size_t newlen = oldlen - 2;
 361     char * const copy   = (char *)mem_internal_allocate(newlen);
 362
 363     memcpy(copy, old + 1, newlen);
 364     copy[newlen - 1] = 0;
 365
 366     return copy;
 367 }
 368
 369 /*
 370
 371 =back
 372
 373 =head1 SEE ALSO
 374
 375 =over 4
 376
 377 =item F<include/parrot/string_primitives.h>
 378
 379 =item F<include/parrot/string.h>
 380
 381 =item F<src/string.c>
 382
 383 =back
 384
 385 =cut
 386
 387 */
 388
 389
 390 /*
 391  * Local variables:
 392  *   c-file-style: "parrot"
 393  * End:
 394  * vim: expandtab shiftwidth=4:
 395  */