src/string_primitives.c

   1 /*
   2 Copyright (C) 2006-2007, The Perl Foundation.
   3 $Id$
   4
   5 =head1 NAME
   6
   7 src/string_primitives.c - String Primitives
   8
   9 =head1 DESCRIPTION
  10
  11 This file collects together all the functions that call into the ICU
  12 API.
  13
  14 =head2 Functions
  15
  16 =over 4
  17
  18 =cut
  19
  20 */
  21
  22 /* HEADERIZER HFILE: include/parrot/string_primitives.h */
  23
  24 #include "parrot/parrot.h"
  25 #if PARROT_HAS_ICU
  26 #  include <unicode/ucnv.h>
  27 #  include <unicode/utypes.h>
  28 #  include <unicode/uchar.h>
  29 #  include <unicode/ustring.h>
  30 #else
  31 #  include <ctype.h>
  32 #endif
  33
  34 /*
  35
  36 =item C<void string_set_data_directory>
  37
  38 Set the directory where ICU finds its data files (encodings, locales,
  39 etc.).
  40
  41 =cut
  42
  43 */
  44
  45 PARROT_API
  46 void
  47 string_set_data_directory(PARROT_INTERP, ARGIN(const char *dir))
  48 {
  49 #if PARROT_HAS_ICU
  50     u_setDataDirectory(dir);
  51
  52     /* Since u_setDataDirectory doesn't have a result code, we'll spot
  53        check that everything is okay by making sure that '9' had decimal
  54        value 9. Using 57 rather than '9' so that the encoding of this
  55        source code file isn't an issue.... (Don't want to get bitten by
  56        EBCDIC.) */
  57
  58     if (!u_isdigit(57) || (u_charDigitValue(57) != 9)) {
  59             real_exception(interp, NULL, ICU_ERROR,
  60                 "string_set_data_directory: ICU data files not found"
  61                 "(apparently) for directory [%s]", dir);
  62     }
  63 #else
  64     UNUSED(dir);
  65
  66     real_exception(interp, NULL, ICU_ERROR,
  67         "string_set_data_directory: parrot compiled without ICU support");
  68 #endif
  69 }
  70
  71 /*
  72
  73 =item C<Parrot_UInt4 string_unescape_one>
  74
  75 Unescape a single character. We assume that we're at the start of a
  76 sequence, right after the \.
  77
  78 =cut
  79
  80 */
  81
  82 PARROT_API
  83 Parrot_UInt4
  84 string_unescape_one(PARROT_INTERP, ARGMOD(UINTVAL *offset),
  85         ARGMOD(STRING *string))
  86 {
  87     UINTVAL workchar = 0;
  88     UINTVAL charcount = 0;
  89     const UINTVAL len = string_length(interp, string);
  90     /* Well, not right now */
  91     UINTVAL codepoint = CHARSET_GET_BYTE(interp, string, *offset);
  92     ++*offset;
  93     switch (codepoint) {
  94         case 'x':
  95             codepoint = CHARSET_GET_BYTE(interp, string, *offset);
  96             if (codepoint >= '0' && codepoint <= '9') {
  97                 workchar = codepoint - '0';
  98             }
  99             else if (codepoint >= 'a' && codepoint <= 'f') {
 100                 workchar = codepoint - 'a' + 10;
 101             }
 102             else if (codepoint >= 'A' && codepoint <= 'F') {
 103                 workchar = codepoint - 'A' + 10;
 104             }
 105             else if (codepoint == '{') {
 106                 int i;
 107                 ++*offset;
 108                 workchar = 0;
 109                 for (i = 0; i < 8 && *offset < len; ++i, ++*offset) {
 110                     codepoint = CHARSET_GET_BYTE(interp, string, *offset);
 111                     if (codepoint == '}') {
 112                         ++*offset;
 113                         return workchar;
 114                     }
 115                     workchar *= 16;
 116                     if (codepoint >= '0' && codepoint <= '9') {
 117                         workchar += codepoint - '0';
 118                     }
 119                     else if (codepoint >= 'a' && codepoint <= 'f') {
 120                         workchar += codepoint - 'a' + 10;
 121                     }
 122                     else if (codepoint >= 'A' && codepoint <= 'F') {
 123                         workchar += codepoint - 'A' + 10;
 124                     }
 125                     else {
 126                         real_exception(interp, NULL, UNIMPLEMENTED,
 127                                 "Illegal escape sequence inside {}");
 128                     }
 129                 }
 130                 if (*offset == len)
 131                     real_exception(interp, NULL, UNIMPLEMENTED,
 132                             "Illegal escape sequence no '}'");
 133             }
 134             else {
 135                 real_exception(interp, NULL, UNIMPLEMENTED, "Illegal escape sequence in");
 136             }
 137             ++*offset;
 138             if (*offset < len) {
 139                 workchar *= 16;
 140                 codepoint = CHARSET_GET_BYTE(interp, string, *offset);
 141                 if (codepoint >= '0' && codepoint <= '9') {
 142                     workchar += codepoint - '0';
 143                 }
 144                 else if (codepoint >= 'a' && codepoint <= 'f') {
 145                     workchar += codepoint - 'a' + 10;
 146                 }
 147                 else if (codepoint >= 'A' && codepoint <= 'F') {
 148                     workchar += codepoint - 'A' + 10;
 149                 }
 150                 else {
 151                     return workchar;
 152                 }
 153             }
 154             else {
 155                 return workchar;
 156             }
 157             ++*offset;
 158             return workchar;
 159         case 'c':
 160             codepoint = CHARSET_GET_BYTE(interp, string, *offset);
 161             if (codepoint >= 'A' && codepoint <= 'Z') {
 162                 workchar = codepoint - 'A' + 1;
 163             }
 164             else {
 165                 real_exception(interp, NULL, UNIMPLEMENTED, "Illegal escape sequence");
 166             }
 167             ++*offset;
 168             return workchar;
 169         case 'u':
 170             workchar = 0;
 171             for (charcount = 0; charcount < 4; charcount++) {
 172                 if (*offset < len) {
 173                     workchar *= 16;
 174                     codepoint = CHARSET_GET_BYTE(interp, string, *offset);
 175                     if (codepoint >= '0' && codepoint <= '9') {
 176                         workchar += codepoint - '0';
 177                     }
 178                     else if (codepoint >= 'a' && codepoint <= 'f') {
 179                         workchar += codepoint - 'a' + 10;
 180                     }
 181                     else if (codepoint >= 'A' && codepoint <= 'F') {
 182                         workchar += codepoint - 'A' + 10;
 183                     }
 184                     else {
 185                         real_exception(interp, NULL, UNIMPLEMENTED,
 186                                 "Illegal escape sequence in uxxx escape");
 187                     }
 188                 }
 189                 else {
 190                     real_exception(interp, NULL, UNIMPLEMENTED,
 191                         "Illegal escape sequence in uxxx escape - too short");
 192                 }
 193                 ++*offset;
 194             }
 195             return workchar;
 196         case 'U':
 197             workchar = 0;
 198             for (charcount = 0; charcount < 8; charcount++) {
 199                 if (*offset < len) {
 200                     workchar *= 16;
 201                     codepoint = CHARSET_GET_BYTE(interp, string, *offset);
 202                     if (codepoint >= '0' && codepoint <= '9') {
 203                         workchar += codepoint - '0';
 204                     }
 205                     else if (codepoint >= 'a' && codepoint <= 'f') {
 206                         workchar += codepoint - 'a' + 10;
 207                     }
 208                     else if (codepoint >= 'A' && codepoint <= 'F') {
 209                         workchar += codepoint - 'A' + 10;
 210                     }
 211                     else {
 212                         real_exception(interp, NULL, UNIMPLEMENTED,
 213                                 "Illegal escape sequence in Uxxx escape");
 214                     }
 215                 }
 216                 else {
 217                     real_exception(interp, NULL, UNIMPLEMENTED,
 218                         "Illegal escape sequence in uxxx escape - too short");
 219                 }
 220                 ++*offset;
 221             }
 222             return workchar;
 223         case '0':
 224         case '1':
 225         case '2':
 226         case '3':
 227         case '4':
 228         case '5':
 229         case '6':
 230         case '7':
 231             workchar = codepoint - '0';
 232             if (*offset < len) {
 233                 workchar *= 8;
 234                 codepoint = CHARSET_GET_BYTE(interp, string, *offset);
 235                 if (codepoint >= '0' && codepoint <= '7') {
 236                     workchar += codepoint - '0';
 237                 }
 238                 else {
 239                     return workchar;
 240                 }
 241             }
 242             else {
 243                 return workchar;
 244             }
 245             ++*offset;
 246             if (*offset < len) {
 247                 workchar *= 8;
 248                 codepoint = CHARSET_GET_BYTE(interp, string, *offset);
 249                 if (codepoint >= '0' && codepoint <= '7') {
 250                     workchar += codepoint - '0';
 251                 }
 252                 else {
 253                     return workchar;
 254                 }
 255             }
 256             else {
 257                 return workchar;
 258             }
 259             ++*offset;
 260             return workchar;
 261         case 'a':
 262             return 7; /* bell */
 263         case 'b':
 264             return 8; /* bs */
 265         case 't':
 266             return 9;
 267         case 'n':
 268             return 10;
 269         case 'v':
 270             return 11;
 271         case 'f':
 272             return 12;
 273         case 'r':
 274             return 13;
 275         case 'e':
 276             return 27;
 277         case 92: /* \ */
 278             return 92;
 279         case '"':
 280             return '"';
 281         default:
 282             return codepoint;  /* any not special return the char */
 283     }
 284 }
 285
 286 /*
 287
 288 =back
 289
 290 =head2 Character Property Functions
 291
 292 =over 4
 293
 294 =item C<INTVAL Parrot_char_digit_value>
 295
 296 Returns the decimal digit value of the specified character if it is a decimal
 297 digit character. If not, then -1 is returned.
 298
 299 Note that as currently written, C<Parrot_char_digit_value()> can
 300 correctly return the decimal digit value of characters for which
 301 C<Parrot_char_is_digit()> returns false.
 302
 303 =cut
 304
 305 */
 306
 307 PARROT_API
 308 PARROT_CONST_FUNCTION
 309 INTVAL
 310 Parrot_char_digit_value(SHIM_INTERP, UINTVAL character)
 311 {
 312 #if PARROT_HAS_ICU
 313     return u_charDigitValue(character);
 314 #else
 315     if ((character >= 0x30) || (character <= 0x39))
 316         return character - 0x30;
 317     return -1;
 318 #endif
 319 }
 320
 321 /*
 322
 323 =item C<char * str_dup>
 324
 325 Duplicate a C string.  Just like strdup(), except it dies if it runs
 326 out of memory.
 327
 328 =cut
 329
 330 */
 331
 332 PARROT_API
 333 PARROT_MALLOC
 334 PARROT_CANNOT_RETURN_NULL
 335 char *
 336 str_dup(ARGIN(const char *old))
 337 {
 338     const size_t bytes = strlen(old) + 1;
 339     char * const copy = (char *)mem_sys_allocate(bytes);
 340     memcpy(copy, old, bytes);
 341 #ifdef MEMDEBUG
 342     debug(interp, 1, "line %d str_dup %s [%x]\n", line, old, copy);
 343 #endif
 344     return copy;
 345 }
 346
 347 /*
 348
 349 =back
 350
 351 =head1 SEE ALSO
 352
 353 =over 4
 354
 355 =item F<include/parrot/string_primitives.h>
 356
 357 =item F<include/parrot/string.h>
 358
 359 =item F<src/string.c>
 360
 361 =back
 362
 363 =cut
 364
 365 */
 366
 367
 368 /*
 369  * Local variables:
 370  *   c-file-style: "parrot"
 371  * End:
 372  * vim: expandtab shiftwidth=4:
 373  */