quote.c

   1 /* ----------------------------------------------------------------------- *
   2  *
   3  *   Copyright 1996-2009 The NASM Authors - All Rights Reserved
   4  *   See the file AUTHORS included with the NASM distribution for
   5  *   the specific copyright holders.
   6  *
   7  *   Redistribution and use in source and binary forms, with or without
   8  *   modification, are permitted provided that the following
   9  *   conditions are met:
  10  *
  11  *   * Redistributions of source code must retain the above copyright
  12  *     notice, this list of conditions and the following disclaimer.
  13  *   * Redistributions in binary form must reproduce the above
  14  *     copyright notice, this list of conditions and the following
  15  *     disclaimer in the documentation and/or other materials provided
  16  *     with the distribution.
  17  *
  18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  19  *     CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  20  *     INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  21  *     MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  22  *     DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  23  *     CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24  *     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  25  *     NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  26  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  *     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28  *     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  29  *     OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  30  *     EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31  *
  32  * ----------------------------------------------------------------------- */
  33
  34 /*
  35  * quote.c
  36  */
  37
  38 #include "compiler.h"
  39
  40 #include <stdlib.h>
  41
  42 #include "nasmlib.h"
  43 #include "quote.h"
  44
  45 #define numvalue(c)  ((c)>='a' ? (c)-'a'+10 : (c)>='A' ? (c)-'A'+10 : (c)-'0')
  46
  47 char *nasm_quote(char *str, size_t len)
  48 {
  49     char c, c1, *p, *q, *nstr, *ep;
  50     unsigned char uc;
  51     bool sq_ok, dq_ok;
  52     size_t qlen;
  53
  54     sq_ok = dq_ok = true;
  55     ep = str+len;
  56     qlen = 0;                   /* Length if we need `...` quotes */
  57     for (p = str; p < ep; p++) {
  58         c = *p;
  59         switch (c) {
  60         case '\'':
  61             sq_ok = false;
  62             qlen++;
  63             break;
  64         case '\"':
  65             dq_ok = false;
  66             qlen++;
  67             break;
  68         case '`':
  69         case '\\':
  70             qlen += 2;
  71             break;
  72         default:
  73             if (c < ' ' || c > '~') {
  74                 sq_ok = dq_ok = false;
  75                 switch (c) {
  76                 case '\a':
  77                 case '\b':
  78                 case '\t':
  79                 case '\n':
  80                 case '\v':
  81                 case '\f':
  82                 case '\r':
  83                 case 27:
  84                     qlen += 2;
  85                     break;
  86                 default:
  87                     c1 = (p+1 < ep) ? p[1] : 0;
  88                     if (c1 >= '0' && c1 <= '7')
  89                         uc = 0377; /* Must use the full form */
  90                     else
  91                         uc = c;
  92                     if (uc > 077)
  93                         qlen++;
  94                     if (uc > 07)
  95                         qlen++;
  96                     qlen += 2;
  97                     break;
  98                 }
  99             } else {
 100                 qlen++;
 101             }
 102             break;
 103         }
 104     }
 105
 106     if (sq_ok || dq_ok) {
 107         /* Use '...' or "..." */
 108         nstr = nasm_malloc(len+3);
 109         nstr[0] = nstr[len+1] = sq_ok ? '\'' : '\"';
 110         nstr[len+2] = '\0';
 111         memcpy(nstr+1, str, len);
 112     } else {
 113         /* Need to use `...` quoted syntax */
 114         nstr = nasm_malloc(qlen+3);
 115         q = nstr;
 116         *q++ = '`';
 117         for (p = str; p < ep; p++) {
 118             c = *p;
 119             switch (c) {
 120             case '`':
 121             case '\\':
 122                 *q++ = '\\';
 123                 *q++ = c;
 124                 break;
 125             case 7:
 126                 *q++ = '\\';
 127                 *q++ = 'a';
 128                 break;
 129             case 8:
 130                 *q++ = '\\';
 131                 *q++ = 'b';
 132                 break;
 133             case 9:
 134                 *q++ = '\\';
 135                 *q++ = 't';
 136                 break;
 137             case 10:
 138                 *q++ = '\\';
 139                 *q++ = 'n';
 140                 break;
 141             case 11:
 142                 *q++ = '\\';
 143                 *q++ = 'v';
 144                 break;
 145             case 12:
 146                 *q++ = '\\';
 147                 *q++ = 'f';
 148                 break;
 149             case 13:
 150                 *q++ = '\\';
 151                 *q++ = 'r';
 152                 break;
 153             case 27:
 154                 *q++ = '\\';
 155                 *q++ = 'e';
 156                 break;
 157             default:
 158                 if (c < ' ' || c > '~') {
 159                     c1 = (p+1 < ep) ? p[1] : 0;
 160                     if (c1 >= '0' && c1 <= '7')
 161                         uc = 0377; /* Must use the full form */
 162                     else
 163                         uc = c;
 164                     *q++ = '\\';
 165                     if (uc > 077)
 166                         *q++ = ((unsigned char)c >> 6) + '0';
 167                     if (uc > 07)
 168                         *q++ = (((unsigned char)c >> 3) & 7) + '0';
 169                     *q++ = ((unsigned char)c & 7) + '0';
 170                     break;
 171                 } else {
 172                     *q++ = c;
 173                 }
 174                 break;
 175             }
 176         }
 177         *q++ = '`';
 178         *q++ = '\0';
 179         nasm_assert((size_t)(q-nstr) == qlen+3);
 180     }
 181     return nstr;
 182 }
 183
 184 static char *emit_utf8(char *q, int32_t v)
 185 {
 186     if (v < 0) {
 187         /* Impossible - do nothing */
 188     } else if (v <= 0x7f) {
 189         *q++ = v;
 190     } else if (v <= 0x000007ff) {
 191         *q++ = 0xc0 | (v >> 6);
 192         *q++ = 0x80 | (v & 63);
 193     } else if (v <= 0x0000ffff) {
 194         *q++ = 0xe0 | (v >> 12);
 195         *q++ = 0x80 | ((v >> 6) & 63);
 196         *q++ = 0x80 | (v & 63);
 197     } else if (v <= 0x001fffff) {
 198         *q++ = 0xf0 | (v >> 18);
 199         *q++ = 0x80 | ((v >> 12) & 63);
 200         *q++ = 0x80 | ((v >> 6) & 63);
 201         *q++ = 0x80 | (v & 63);
 202     } else if (v <= 0x03ffffff) {
 203         *q++ = 0xf8 | (v >> 24);
 204         *q++ = 0x80 | ((v >> 18) & 63);
 205         *q++ = 0x80 | ((v >> 12) & 63);
 206         *q++ = 0x80 | ((v >> 6) & 63);
 207         *q++ = 0x80 | (v & 63);
 208     } else {
 209         *q++ = 0xfc | (v >> 30);
 210         *q++ = 0x80 | ((v >> 24) & 63);
 211         *q++ = 0x80 | ((v >> 18) & 63);
 212         *q++ = 0x80 | ((v >> 12) & 63);
 213         *q++ = 0x80 | ((v >> 6) & 63);
 214         *q++ = 0x80 | (v & 63);
 215     }
 216     return q;
 217 }
 218
 219 /*
 220  * Do an *in-place* dequoting of the specified string, returning the
 221  * resulting length (which may be containing embedded nulls.)
 222  *
 223  * In-place replacement is possible since the unquoted length is always
 224  * shorter than or equal to the quoted length.
 225  *
 226  * *ep points to the final quote, or to the null if improperly quoted.
 227  */
 228 size_t nasm_unquote(char *str, char **ep)
 229 {
 230     char bq;
 231     char *p, *q;
 232     char *escp = NULL;
 233     char c;
 234     enum unq_state {
 235         st_start,
 236         st_backslash,
 237         st_hex,
 238         st_oct,
 239         st_ucs,
 240     } state;
 241     int ndig = 0;
 242     int32_t nval = 0;
 243
 244     p = q = str;
 245
 246     bq = *p++;
 247     if (!bq)
 248         return 0;
 249
 250     switch (bq) {
 251     case '\'':
 252     case '\"':
 253         /* '...' or "..." string */
 254         while ((c = *p) && c != bq) {
 255             p++;
 256             *q++ = c;
 257         }
 258         *q = '\0';
 259         break;
 260
 261     case '`':
 262         /* `...` string */
 263         state = st_start;
 264
 265         while ((c = *p)) {
 266             p++;
 267             switch (state) {
 268             case st_start:
 269                 switch (c) {
 270                 case '\\':
 271                     state = st_backslash;
 272                     break;
 273                 case '`':
 274                     p--;
 275                     goto out;
 276                 default:
 277                     *q++ = c;
 278                     break;
 279                 }
 280                 break;
 281
 282             case st_backslash:
 283                 state = st_start;
 284                 escp = p;       /* Beginning of argument sequence */
 285                 nval = 0;
 286                 switch (c) {
 287                 case 'a':
 288                     *q++ = 7;
 289                     break;
 290                 case 'b':
 291                     *q++ = 8;
 292                     break;
 293                 case 'e':
 294                     *q++ = 27;
 295                     break;
 296                 case 'f':
 297                     *q++ = 12;
 298                     break;
 299                 case 'n':
 300                     *q++ = 10;
 301                     break;
 302                 case 'r':
 303                     *q++ = 13;
 304                     break;
 305                 case 't':
 306                     *q++ = 9;
 307                     break;
 308                 case 'u':
 309                     state = st_ucs;
 310                     ndig = 4;
 311                     break;
 312                 case 'U':
 313                     state = st_ucs;
 314                     ndig = 8;
 315                     break;
 316                 case 'v':
 317                     *q++ = 11;
 318                     break;
 319                 case 'x':
 320                 case 'X':
 321                     state = st_hex;
 322                     ndig = 2;
 323                     break;
 324                 case '0':
 325                 case '1':
 326                 case '2':
 327                 case '3':
 328                 case '4':
 329                 case '5':
 330                 case '6':
 331                 case '7':
 332                     state = st_oct;
 333                     ndig = 2;   /* Up to two more digits */
 334                     nval = c - '0';
 335                     break;
 336                 default:
 337                     *q++ = c;
 338                     break;
 339                 }
 340                 break;
 341
 342             case st_oct:
 343                 if (c >= '0' && c <= '7') {
 344                     nval = (nval << 3) + (c - '0');
 345                     if (!--ndig) {
 346                         *q++ = nval;
 347                         state = st_start;
 348                     }
 349                 } else {
 350                     p--;        /* Process this character again */
 351                     *q++ = nval;
 352                     state = st_start;
 353                 }
 354                 break;
 355
 356             case st_hex:
 357                 if ((c >= '0' && c <= '9') ||
 358                     (c >= 'A' && c <= 'F') ||
 359                     (c >= 'a' && c <= 'f')) {
 360                     nval = (nval << 4) + numvalue(c);
 361                     if (!--ndig) {
 362                         *q++ = nval;
 363                         state = st_start;
 364                     }
 365                 } else {
 366                     p--;        /* Process this character again */
 367                     *q++ = (p > escp) ? nval : escp[-1];
 368                     state = st_start;
 369                 }
 370                 break;
 371
 372             case st_ucs:
 373                 if ((c >= '0' && c <= '9') ||
 374                     (c >= 'A' && c <= 'F') ||
 375                     (c >= 'a' && c <= 'f')) {
 376                     nval = (nval << 4) + numvalue(c);
 377                     if (!--ndig) {
 378                         q = emit_utf8(q, nval);
 379                         state = st_start;
 380                     }
 381                 } else {
 382                     p--;        /* Process this character again */
 383                     if (p > escp)
 384                         q = emit_utf8(q, nval);
 385                     else
 386                         *q++ = escp[-1];
 387                     state = st_start;
 388                 }
 389                 break;
 390             }
 391         }
 392         switch (state) {
 393         case st_start:
 394         case st_backslash:
 395             break;
 396         case st_oct:
 397             *q++ = nval;
 398             break;
 399         case st_hex:
 400             *q++ = (p > escp) ? nval : escp[-1];
 401             break;
 402         case st_ucs:
 403             if (p > escp)
 404                 q = emit_utf8(q, nval);
 405             else
 406                 *q++ = escp[-1];
 407             break;
 408         }
 409     out:
 410         break;
 411
 412     default:
 413         /* Not a quoted string, just return the input... */
 414         p = q = strchr(str, '\0');
 415         break;
 416     }
 417
 418     if (ep)
 419         *ep = p;
 420     return q-str;
 421 }
 422
 423 /*
 424  * Find the end of a quoted string; returns the pointer to the terminating
 425  * character (either the ending quote or the null character, if unterminated.)
 426  */
 427 char *nasm_skip_string(char *str)
 428 {
 429     char bq;
 430     char *p;
 431     char c;
 432     enum unq_state {
 433         st_start,
 434         st_backslash,
 435     } state;
 436
 437     bq = str[0];
 438     if (bq == '\'' || bq == '\"') {
 439         /* '...' or "..." string */
 440         for (p = str+1; *p && *p != bq; p++)
 441             ;
 442         return p;
 443     } else if (bq == '`') {
 444         /* `...` string */
 445         p = str+1;
 446         state = st_start;
 447
 448         while ((c = *p++)) {
 449             switch (state) {
 450             case st_start:
 451                 switch (c) {
 452                 case '\\':
 453                     state = st_backslash;
 454                     break;
 455                 case '`':
 456                     return p-1; /* Found the end */
 457                 default:
 458                     break;
 459                 }
 460                 break;
 461
 462             case st_backslash:
 463                 /*
 464                  * Note: for the purpose of finding the end of the string,
 465                  * all successor states to st_backslash are functionally
 466                  * equivalent to st_start, since either a backslash or
 467                  * a backquote will force a return to the st_start state.
 468                  */
 469                 state = st_start;
 470                 break;
 471             }
 472         }
 473         return p;               /* Unterminated string... */
 474     } else {
 475         return str;             /* Not a string... */
 476     }
 477 }