quote.c

   1 /* ----------------------------------------------------------------------- *
   2  *
   3  *   Copyright 1996-2009 The NASM Authors - All Rights Reserved
   4  *   See the file AUTHORS included with the NASM distribution for
   5  *   the specific copyright holders.
   6  *
   7  *   Redistribution and use in source and binary forms, with or without
   8  *   modification, are permitted provided that the following
   9  *   conditions are met:
  10  *
  11  *   * Redistributions of source code must retain the above copyright
  12  *     notice, this list of conditions and the following disclaimer.
  13  *   * Redistributions in binary form must reproduce the above
  14  *     copyright notice, this list of conditions and the following
  15  *     disclaimer in the documentation and/or other materials provided
  16  *     with the distribution.
  17  *
  18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  19  *     CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  20  *     INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  21  *     MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  22  *     DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  23  *     CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  24  *     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  25  *     NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  26  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27  *     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28  *     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  29  *     OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  30  *     EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31  *
  32  * ----------------------------------------------------------------------- */
  33
  34 /*
  35  * quote.c
  36  */
  37
  38 #include "compiler.h"
  39
  40 #include <stdlib.h>
  41
  42 #include "nasmlib.h"
  43 #include "quote.h"
  44
  45 #define numvalue(c)  ((c)>='a' ? (c)-'a'+10 : (c)>='A' ? (c)-'A'+10 : (c)-'0')
  46
  47 char *nasm_quote(char *str, size_t len)
  48 {
  49     char c, c1, *p, *q, *nstr, *ep;
  50     unsigned char uc;
  51     bool sq_ok, dq_ok;
  52     size_t qlen;
  53
  54     sq_ok = dq_ok = true;
  55     ep = str+len;
  56     qlen = 0;                   /* Length if we need `...` quotes */
  57     for (p = str; p < ep; p++) {
  58         c = *p;
  59         switch (c) {
  60         case '\'':
  61             sq_ok = false;
  62             qlen++;
  63             break;
  64         case '\"':
  65             dq_ok = false;
  66             qlen++;
  67             break;
  68         case '`':
  69         case '\\':
  70             qlen += 2;
  71             break;
  72         default:
  73             if (c < ' ' || c > '~') {
  74                 sq_ok = dq_ok = false;
  75                 switch (c) {
  76                 case '\a':
  77                 case '\b':
  78                 case '\t':
  79                 case '\n':
  80                 case '\v':
  81                 case '\f':
  82                 case '\r':
  83                 case 27:
  84                     qlen += 2;
  85                     break;
  86                 default:
  87                     c1 = (p+1 < ep) ? p[1] : 0;
  88                     if (c1 >= '0' && c1 <= '7')
  89                         uc = 0377; /* Must use the full form */
  90                     else
  91                         uc = c;
  92                     if (uc > 077)
  93                         qlen++;
  94                     if (uc > 07)
  95                         qlen++;
  96                     qlen += 2;
  97                     break;
  98                 }
  99             } else {
 100                 qlen++;
 101             }
 102             break;
 103         }
 104     }
 105
 106     if (sq_ok || dq_ok) {
 107         /* Use '...' or "..." */
 108         nstr = nasm_malloc(len+3);
 109         nstr[0] = nstr[len+1] = sq_ok ? '\'' : '\"';
 110         nstr[len+2] = '\0';
 111         if (len > 0)
 112             memcpy(nstr+1, str, len);
 113     } else {
 114         /* Need to use `...` quoted syntax */
 115         nstr = nasm_malloc(qlen+3);
 116         q = nstr;
 117         *q++ = '`';
 118         for (p = str; p < ep; p++) {
 119             c = *p;
 120             switch (c) {
 121             case '`':
 122             case '\\':
 123                 *q++ = '\\';
 124                 *q++ = c;
 125                 break;
 126             case 7:
 127                 *q++ = '\\';
 128                 *q++ = 'a';
 129                 break;
 130             case 8:
 131                 *q++ = '\\';
 132                 *q++ = 'b';
 133                 break;
 134             case 9:
 135                 *q++ = '\\';
 136                 *q++ = 't';
 137                 break;
 138             case 10:
 139                 *q++ = '\\';
 140                 *q++ = 'n';
 141                 break;
 142             case 11:
 143                 *q++ = '\\';
 144                 *q++ = 'v';
 145                 break;
 146             case 12:
 147                 *q++ = '\\';
 148                 *q++ = 'f';
 149                 break;
 150             case 13:
 151                 *q++ = '\\';
 152                 *q++ = 'r';
 153                 break;
 154             case 27:
 155                 *q++ = '\\';
 156                 *q++ = 'e';
 157                 break;
 158             default:
 159                 if (c < ' ' || c > '~') {
 160                     c1 = (p+1 < ep) ? p[1] : 0;
 161                     if (c1 >= '0' && c1 <= '7')
 162                         uc = 0377; /* Must use the full form */
 163                     else
 164                         uc = c;
 165                     *q++ = '\\';
 166                     if (uc > 077)
 167                         *q++ = ((unsigned char)c >> 6) + '0';
 168                     if (uc > 07)
 169                         *q++ = (((unsigned char)c >> 3) & 7) + '0';
 170                     *q++ = ((unsigned char)c & 7) + '0';
 171                     break;
 172                 } else {
 173                     *q++ = c;
 174                 }
 175                 break;
 176             }
 177         }
 178         *q++ = '`';
 179         *q++ = '\0';
 180         nasm_assert((size_t)(q-nstr) == qlen+3);
 181     }
 182     return nstr;
 183 }
 184
 185 static char *emit_utf8(char *q, int32_t v)
 186 {
 187     if (v < 0) {
 188         /* Impossible - do nothing */
 189     } else if (v <= 0x7f) {
 190         *q++ = v;
 191     } else if (v <= 0x000007ff) {
 192         *q++ = 0xc0 | (v >> 6);
 193         *q++ = 0x80 | (v & 63);
 194     } else if (v <= 0x0000ffff) {
 195         *q++ = 0xe0 | (v >> 12);
 196         *q++ = 0x80 | ((v >> 6) & 63);
 197         *q++ = 0x80 | (v & 63);
 198     } else if (v <= 0x001fffff) {
 199         *q++ = 0xf0 | (v >> 18);
 200         *q++ = 0x80 | ((v >> 12) & 63);
 201         *q++ = 0x80 | ((v >> 6) & 63);
 202         *q++ = 0x80 | (v & 63);
 203     } else if (v <= 0x03ffffff) {
 204         *q++ = 0xf8 | (v >> 24);
 205         *q++ = 0x80 | ((v >> 18) & 63);
 206         *q++ = 0x80 | ((v >> 12) & 63);
 207         *q++ = 0x80 | ((v >> 6) & 63);
 208         *q++ = 0x80 | (v & 63);
 209     } else {
 210         *q++ = 0xfc | (v >> 30);
 211         *q++ = 0x80 | ((v >> 24) & 63);
 212         *q++ = 0x80 | ((v >> 18) & 63);
 213         *q++ = 0x80 | ((v >> 12) & 63);
 214         *q++ = 0x80 | ((v >> 6) & 63);
 215         *q++ = 0x80 | (v & 63);
 216     }
 217     return q;
 218 }
 219
 220 /*
 221  * Do an *in-place* dequoting of the specified string, returning the
 222  * resulting length (which may be containing embedded nulls.)
 223  *
 224  * In-place replacement is possible since the unquoted length is always
 225  * shorter than or equal to the quoted length.
 226  *
 227  * *ep points to the final quote, or to the null if improperly quoted.
 228  */
 229 size_t nasm_unquote(char *str, char **ep)
 230 {
 231     char bq;
 232     char *p, *q;
 233     char *escp = NULL;
 234     char c;
 235     enum unq_state {
 236         st_start,
 237         st_backslash,
 238         st_hex,
 239         st_oct,
 240         st_ucs,
 241     } state;
 242     int ndig = 0;
 243     int32_t nval = 0;
 244
 245     p = q = str;
 246
 247     bq = *p++;
 248     if (!bq)
 249         return 0;
 250
 251     switch (bq) {
 252     case '\'':
 253     case '\"':
 254         /* '...' or "..." string */
 255         while ((c = *p) && c != bq) {
 256             p++;
 257             *q++ = c;
 258         }
 259         *q = '\0';
 260         break;
 261
 262     case '`':
 263         /* `...` string */
 264         state = st_start;
 265
 266         while ((c = *p)) {
 267             p++;
 268             switch (state) {
 269             case st_start:
 270                 switch (c) {
 271                 case '\\':
 272                     state = st_backslash;
 273                     break;
 274                 case '`':
 275                     p--;
 276                     goto out;
 277                 default:
 278                     *q++ = c;
 279                     break;
 280                 }
 281                 break;
 282
 283             case st_backslash:
 284                 state = st_start;
 285                 escp = p;       /* Beginning of argument sequence */
 286                 nval = 0;
 287                 switch (c) {
 288                 case 'a':
 289                     *q++ = 7;
 290                     break;
 291                 case 'b':
 292                     *q++ = 8;
 293                     break;
 294                 case 'e':
 295                     *q++ = 27;
 296                     break;
 297                 case 'f':
 298                     *q++ = 12;
 299                     break;
 300                 case 'n':
 301                     *q++ = 10;
 302                     break;
 303                 case 'r':
 304                     *q++ = 13;
 305                     break;
 306                 case 't':
 307                     *q++ = 9;
 308                     break;
 309                 case 'u':
 310                     state = st_ucs;
 311                     ndig = 4;
 312                     break;
 313                 case 'U':
 314                     state = st_ucs;
 315                     ndig = 8;
 316                     break;
 317                 case 'v':
 318                     *q++ = 11;
 319                     break;
 320                 case 'x':
 321                 case 'X':
 322                     state = st_hex;
 323                     ndig = 2;
 324                     break;
 325                 case '0':
 326                 case '1':
 327                 case '2':
 328                 case '3':
 329                 case '4':
 330                 case '5':
 331                 case '6':
 332                 case '7':
 333                     state = st_oct;
 334                     ndig = 2;   /* Up to two more digits */
 335                     nval = c - '0';
 336                     break;
 337                 default:
 338                     *q++ = c;
 339                     break;
 340                 }
 341                 break;
 342
 343             case st_oct:
 344                 if (c >= '0' && c <= '7') {
 345                     nval = (nval << 3) + (c - '0');
 346                     if (!--ndig) {
 347                         *q++ = nval;
 348                         state = st_start;
 349                     }
 350                 } else {
 351                     p--;        /* Process this character again */
 352                     *q++ = nval;
 353                     state = st_start;
 354                 }
 355                 break;
 356
 357             case st_hex:
 358                 if ((c >= '0' && c <= '9') ||
 359                     (c >= 'A' && c <= 'F') ||
 360                     (c >= 'a' && c <= 'f')) {
 361                     nval = (nval << 4) + numvalue(c);
 362                     if (!--ndig) {
 363                         *q++ = nval;
 364                         state = st_start;
 365                     }
 366                 } else {
 367                     p--;        /* Process this character again */
 368                     *q++ = (p > escp) ? nval : escp[-1];
 369                     state = st_start;
 370                 }
 371                 break;
 372
 373             case st_ucs:
 374                 if ((c >= '0' && c <= '9') ||
 375                     (c >= 'A' && c <= 'F') ||
 376                     (c >= 'a' && c <= 'f')) {
 377                     nval = (nval << 4) + numvalue(c);
 378                     if (!--ndig) {
 379                         q = emit_utf8(q, nval);
 380                         state = st_start;
 381                     }
 382                 } else {
 383                     p--;        /* Process this character again */
 384                     if (p > escp)
 385                         q = emit_utf8(q, nval);
 386                     else
 387                         *q++ = escp[-1];
 388                     state = st_start;
 389                 }
 390                 break;
 391             }
 392         }
 393         switch (state) {
 394         case st_start:
 395         case st_backslash:
 396             break;
 397         case st_oct:
 398             *q++ = nval;
 399             break;
 400         case st_hex:
 401             *q++ = (p > escp) ? nval : escp[-1];
 402             break;
 403         case st_ucs:
 404             if (p > escp)
 405                 q = emit_utf8(q, nval);
 406             else
 407                 *q++ = escp[-1];
 408             break;
 409         }
 410     out:
 411         break;
 412
 413     default:
 414         /* Not a quoted string, just return the input... */
 415         p = q = strchr(str, '\0');
 416         break;
 417     }
 418
 419     if (ep)
 420         *ep = p;
 421     return q-str;
 422 }
 423
 424 /*
 425  * Find the end of a quoted string; returns the pointer to the terminating
 426  * character (either the ending quote or the null character, if unterminated.)
 427  */
 428 char *nasm_skip_string(char *str)
 429 {
 430     char bq;
 431     char *p;
 432     char c;
 433     enum unq_state {
 434         st_start,
 435         st_backslash,
 436     } state;
 437
 438     bq = str[0];
 439     if (bq == '\'' || bq == '\"') {
 440         /* '...' or "..." string */
 441         for (p = str+1; *p && *p != bq; p++)
 442             ;
 443         return p;
 444     } else if (bq == '`') {
 445         /* `...` string */
 446         p = str+1;
 447         state = st_start;
 448
 449         while ((c = *p++)) {
 450             switch (state) {
 451             case st_start:
 452                 switch (c) {
 453                 case '\\':
 454                     state = st_backslash;
 455                     break;
 456                 case '`':
 457                     return p-1; /* Found the end */
 458                 default:
 459                     break;
 460                 }
 461                 break;
 462
 463             case st_backslash:
 464                 /*
 465                  * Note: for the purpose of finding the end of the string,
 466                  * all successor states to st_backslash are functionally
 467                  * equivalent to st_start, since either a backslash or
 468                  * a backquote will force a return to the st_start state.
 469                  */
 470                 state = st_start;
 471                 break;
 472             }
 473         }
 474         return p;               /* Unterminated string... */
 475     } else {
 476         return str;             /* Not a string... */
 477     }
 478 }