m4/syntax.c

   1 /* GNU m4 -- A simple macro processor
   2    Copyright (C) 1989-1994, 2002, 2004, 2006-2010, 2013-2014, 2017 Free
   3    Software Foundation, Inc.
   4
   5    This file is part of GNU M4.
   6
   7    GNU M4 is free software: you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation, either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    GNU M4 is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 */
  20
  21 #include <config.h>
  22
  23 #include "m4private.h"
  24
  25 /* Define this to see runtime debug info.  Implied by DEBUG.  */
  26 /*#define DEBUG_SYNTAX */
  27
  28 /* THE SYNTAX TABLE
  29
  30    The input is read character by character and grouped together
  31    according to a syntax table.  The character groups are (definitions
  32    are all in m4module.h, those marked with a * are not yet in use):
  33
  34    Basic (all characters fall in one of these mutually exclusive bins)
  35    M4_SYNTAX_IGNORE     *Character to be deleted from input as if not present
  36    M4_SYNTAX_OTHER      Any character with no special meaning to m4
  37    M4_SYNTAX_SPACE      Whitespace (ignored when leading macro arguments)
  38    M4_SYNTAX_OPEN       Open list of macro arguments
  39    M4_SYNTAX_CLOSE      Close list of macro arguments
  40    M4_SYNTAX_COMMA      Separates macro arguments
  41    M4_SYNTAX_ACTIVE     This character is a macro name by itself
  42    M4_SYNTAX_ESCAPE     Use this character to prefix all macro names
  43
  44    M4_SYNTAX_ALPHA      Alphabetic characters (can start macro names)
  45    M4_SYNTAX_NUM        Numeric characters (can form macro names)
  46
  47    M4_SYNTAX_LQUOTE     A single character left quote
  48    M4_SYNTAX_BCOMM      A single character begin comment delimiter
  49
  50    Attribute (these are context sensitive, and exist in addition to basic)
  51    M4_SYNTAX_RQUOTE     A single character right quote
  52    M4_SYNTAX_ECOMM      A single character end comment delimiter
  53    M4_SYNTAX_DOLLAR     Indicates macro argument in user macros
  54    M4_SYNTAX_LBRACE     *Indicates start of extended macro argument
  55    M4_SYNTAX_RBRACE     *Indicates end of extended macro argument
  56
  57    Besides adding new facilities, the use of a syntax table will reduce
  58    the number of calls to next_token ().  Now groups of OTHER, NUM and
  59    SPACE characters can be returned as a single token, since next_token
  60    () knows they have no special syntactical meaning to m4.  This is,
  61    however, only possible if only single character quotes and
  62    comments are used, because otherwise the quote and comment characters
  63    will not show up in the syntax-table.
  64
  65    Having a syntax table allows new facilities.  The new builtin
  66    "changesyntax" allows the user to change the category of any
  67    character.
  68
  69    By default, '\n' is both ECOMM and SPACE, depending on the context.
  70    Hence we have basic categories (mutually exclusive, can introduce a
  71    context, and can be empty sets), and attribute categories
  72    (additive, only recognized in context, and will never be empty).
  73
  74    The precedence as implemented by next_token () is:
  75
  76    M4_SYNTAX_IGNORE     *Filtered out below next_token ()
  77    M4_SYNTAX_ESCAPE     Reads macro name iff set, else next character
  78    M4_SYNTAX_ALPHA      Reads M4_SYNTAX_ALPHA and M4_SYNTAX_NUM as macro name
  79    M4_SYNTAX_LQUOTE     Reads all until balanced M4_SYNTAX_RQUOTE
  80    M4_SYNTAX_BCOMM      Reads all until M4_SYNTAX_ECOMM
  81
  82    M4_SYNTAX_OTHER  }   Reads all M4_SYNTAX_OTHER, M4_SYNTAX_NUM
  83    M4_SYNTAX_NUM    }
  84
  85    M4_SYNTAX_SPACE      Reads all M4_SYNTAX_SPACE, depending on buffering
  86    M4_SYNTAX_ACTIVE     Returns a single char as a macro name
  87
  88    M4_SYNTAX_OPEN   }   Returned as a single char
  89    M4_SYNTAX_CLOSE  }
  90    M4_SYNTAX_COMMA  }
  91
  92    M4_SYNTAX_RQUOTE and M4_SYNTAX_ECOMM are context-sensitive, and
  93    close out M4_SYNTAX_LQUOTE and M4_SYNTAX_BCOMM, respectively.
  94    Also, M4_SYNTAX_DOLLAR, M4_SYNTAX_LBRACE, and M4_SYNTAX_RBRACE are
  95    context-sensitive, only mattering when expanding macro definitions.
  96
  97    There are several optimizations that can be performed depending on
  98    known states of the syntax table.  For example, when searching for
  99    quotes, if there is only a single start quote and end quote
 100    delimiter, we can use memchr2 and search a word at a time, instead
 101    of performing a table lookup a byte at a time.  The is_single_*
 102    flags track whether quotes and comments have a single delimiter
 103    (always the case if changequote/changecom were used, and
 104    potentially the case after changesyntax).  Since we frequently need
 105    to access quotes, we store the oldest valid quote outside the
 106    lookup table; the suspect flag tracks whether a cleanup pass is
 107    needed to restore our invariants.  On the other hand, coalescing
 108    multiple M4_SYNTAX_OTHER bytes could form a delimiter, so many
 109    optimizations must be disabled if a multi-byte delimiter exists;
 110    this is handled by m4__safe_quotes.  Meanwhile, quotes and comments
 111    can be disabled if the leading delimiter is length 0.  */
 112
 113 static int add_syntax_attribute         (m4_syntax_table *, char, int);
 114 static int remove_syntax_attribute      (m4_syntax_table *, char, int);
 115 static void set_quote_age               (m4_syntax_table *, bool, bool);
 116
 117 m4_syntax_table *
 118 m4_syntax_create (void)
 119 {
 120   m4_syntax_table *syntax = (m4_syntax_table *) xzalloc (sizeof *syntax);
 121   int ch;
 122
 123   /* Set up default table.  This table never changes during operation,
 124      and contains no context attributes.  */
 125   for (ch = UCHAR_MAX + 1; --ch >= 0; )
 126     switch (ch)
 127       {
 128       case '(':
 129         syntax->orig[ch] = M4_SYNTAX_OPEN;
 130         break;
 131       case ')':
 132         syntax->orig[ch] = M4_SYNTAX_CLOSE;
 133         break;
 134       case ',':
 135         syntax->orig[ch] = M4_SYNTAX_COMMA;
 136         break;
 137       case '`':
 138         syntax->orig[ch] = M4_SYNTAX_LQUOTE;
 139         break;
 140       case '#':
 141         syntax->orig[ch] = M4_SYNTAX_BCOMM;
 142         break;
 143       default:
 144         if (isspace (ch))
 145           syntax->orig[ch] = M4_SYNTAX_SPACE;
 146         else if (isalpha (ch) || ch == '_')
 147           syntax->orig[ch] = M4_SYNTAX_ALPHA;
 148         else if (isdigit (ch))
 149           syntax->orig[ch] = M4_SYNTAX_NUM;
 150         else
 151           syntax->orig[ch] = M4_SYNTAX_OTHER;
 152       }
 153
 154   /* Set up current table to match default.  */
 155   m4_reset_syntax (syntax);
 156   syntax->cached_simple.str1 = syntax->cached_lquote;
 157   syntax->cached_simple.len1 = 1;
 158   syntax->cached_simple.str2 = syntax->cached_rquote;
 159   syntax->cached_simple.len2 = 1;
 160   return syntax;
 161 }
 162
 163 void
 164 m4_syntax_delete (m4_syntax_table *syntax)
 165 {
 166   assert (syntax);
 167
 168   free (syntax->quote.str1);
 169   free (syntax->quote.str2);
 170   free (syntax->comm.str1);
 171   free (syntax->comm.str2);
 172   free (syntax);
 173 }
 174
 175 int
 176 m4_syntax_code (char ch)
 177 {
 178   int code;
 179
 180   switch (ch)
 181     {
 182       /* Sorted according to the order of M4_SYNTAX_* in m4module.h.  */
 183       /* FIXME - revisit the ignore syntax attribute.  */
 184     case 'I': case 'i': code = M4_SYNTAX_IGNORE; break;
 185       /* Basic categories.  */
 186     case '@':           code = M4_SYNTAX_ESCAPE; break;
 187     case 'W': case 'w': code = M4_SYNTAX_ALPHA;  break;
 188     case 'L': case 'l': code = M4_SYNTAX_LQUOTE; break;
 189     case 'B': case 'b': code = M4_SYNTAX_BCOMM;  break;
 190     case 'A': case 'a': code = M4_SYNTAX_ACTIVE; break;
 191     case 'D': case 'd': code = M4_SYNTAX_NUM;    break;
 192     case 'S': case 's': code = M4_SYNTAX_SPACE;  break;
 193     case '(':           code = M4_SYNTAX_OPEN;   break;
 194     case ')':           code = M4_SYNTAX_CLOSE;  break;
 195     case ',':           code = M4_SYNTAX_COMMA;  break;
 196     case 'O': case 'o': code = M4_SYNTAX_OTHER;  break;
 197       /* Context categories.  */
 198     case '$':           code = M4_SYNTAX_DOLLAR; break;
 199     case '{':           code = M4_SYNTAX_LBRACE; break;
 200     case '}':           code = M4_SYNTAX_RBRACE; break;
 201     case 'R': case 'r': code = M4_SYNTAX_RQUOTE; break;
 202     case 'E': case 'e': code = M4_SYNTAX_ECOMM;  break;
 203
 204     default: code = -1;  break;
 205     }
 206
 207   return code;
 208 }
 209
 210
 211 \f
 212 /* Functions to manipulate the syntax table.  */
 213 static int
 214 add_syntax_attribute (m4_syntax_table *syntax, char ch, int code)
 215 {
 216   int c = to_uchar (ch);
 217   if (code & M4_SYNTAX_MASKS)
 218     {
 219       syntax->table[c] |= code;
 220       syntax->suspect = true;
 221     }
 222   else
 223     {
 224       if ((code & (M4_SYNTAX_SUSPECT)) != 0
 225           || m4_has_syntax (syntax, c, M4_SYNTAX_SUSPECT))
 226         syntax->suspect = true;
 227       syntax->table[c] = ((syntax->table[c] & M4_SYNTAX_MASKS) | code);
 228     }
 229
 230 #ifdef DEBUG_SYNTAX
 231   xfprintf(stderr, "Set syntax %o %c = %04X\n", c, isprint(c) ? c : '-',
 232            syntax->table[c]);
 233 #endif
 234
 235   return syntax->table[c];
 236 }
 237
 238 static int
 239 remove_syntax_attribute (m4_syntax_table *syntax, char ch, int code)
 240 {
 241   int c = to_uchar (ch);
 242   assert (code & M4_SYNTAX_MASKS);
 243   syntax->table[c] &= ~code;
 244   syntax->suspect = true;
 245
 246 #ifdef DEBUG_SYNTAX
 247   xfprintf(stderr, "Unset syntax %o %c = %04X\n", c, isprint(c) ? c : '-',
 248            syntax->table[c]);
 249 #endif
 250
 251   return syntax->table[c];
 252 }
 253
 254 /* Add the set CHARS of length LEN to syntax category CODE, removing
 255    them from whatever category they used to be in.  */
 256 static void
 257 add_syntax_set (m4_syntax_table *syntax, const char *chars, size_t len,
 258                 int code)
 259 {
 260   while (len--)
 261     add_syntax_attribute (syntax, *chars++, code);
 262 }
 263
 264 /* Remove the set CHARS of length LEN from syntax category CODE,
 265    adding them to category M4_SYNTAX_OTHER instead.  */
 266 static void
 267 subtract_syntax_set (m4_syntax_table *syntax, const char *chars, size_t len,
 268                      int code)
 269 {
 270   while (len--)
 271     {
 272       char ch = *chars++;
 273       if ((code & M4_SYNTAX_MASKS) != 0)
 274         remove_syntax_attribute (syntax, ch, code);
 275       else if (m4_has_syntax (syntax, ch, code))
 276         add_syntax_attribute (syntax, ch, M4_SYNTAX_OTHER);
 277     }
 278 }
 279
 280 /* Make the set CHARS of length LEN become syntax category CODE,
 281    removing CHARS from any other categories, and sending all bytes in
 282    the category but not in CHARS to category M4_SYNTAX_OTHER
 283    instead.  */
 284 static void
 285 set_syntax_set (m4_syntax_table *syntax, const char *chars, size_t len,
 286                 int code)
 287 {
 288   int ch;
 289   /* Explicit set of characters to install with this category; all
 290      other characters that used to have the category get reset to
 291      OTHER.  */
 292   for (ch = UCHAR_MAX + 1; --ch >= 0; )
 293     {
 294       if ((code & M4_SYNTAX_MASKS) != 0)
 295         remove_syntax_attribute (syntax, ch, code);
 296       else if (m4_has_syntax (syntax, ch, code))
 297         add_syntax_attribute (syntax, ch, M4_SYNTAX_OTHER);
 298     }
 299   while (len--)
 300     {
 301       ch = *chars++;
 302       add_syntax_attribute (syntax, ch, code);
 303     }
 304 }
 305
 306 /* Reset syntax category CODE to its default state, sending all other
 307    characters in the category back to their default state.  */
 308 static void
 309 reset_syntax_set (m4_syntax_table *syntax, int code)
 310 {
 311   int ch;
 312   for (ch = UCHAR_MAX + 1; --ch >= 0; )
 313     {
 314       /* Reset the category back to its default state.  All other
 315          characters that used to have this category get reset to
 316          their default state as well.  */
 317       if (code == M4_SYNTAX_RQUOTE)
 318         {
 319           if (ch == '\'')
 320             add_syntax_attribute (syntax, ch, code);
 321           else
 322             remove_syntax_attribute (syntax, ch, code);
 323         }
 324       else if (code == M4_SYNTAX_ECOMM)
 325         {
 326           if (ch == '\n')
 327             add_syntax_attribute (syntax, ch, code);
 328           else
 329             remove_syntax_attribute (syntax, ch, code);
 330         }
 331       else if (code == M4_SYNTAX_DOLLAR)
 332         {
 333           if (ch == '$')
 334             add_syntax_attribute (syntax, ch, code);
 335           else
 336             remove_syntax_attribute (syntax, ch, code);
 337         }
 338       else if (code == M4_SYNTAX_LBRACE)
 339         {
 340           if (ch == '{')
 341             add_syntax_attribute (syntax, ch, code);
 342           else
 343             remove_syntax_attribute (syntax, ch, code);
 344         }
 345       else if (code == M4_SYNTAX_RBRACE)
 346         {
 347           if (ch == '}')
 348             add_syntax_attribute (syntax, ch, code);
 349           else
 350             remove_syntax_attribute (syntax, ch, code);
 351         }
 352       else if (syntax->orig[ch] == code || m4_has_syntax (syntax, ch, code))
 353         add_syntax_attribute (syntax, ch, syntax->orig[ch]);
 354     }
 355 }
 356
 357 /* Reset the syntax table to its default state.  */
 358 void
 359 m4_reset_syntax (m4_syntax_table *syntax)
 360 {
 361   /* Restore the default syntax, which has known quote and comment
 362      properties.  */
 363   memcpy (syntax->table, syntax->orig, sizeof syntax->orig);
 364
 365   free (syntax->quote.str1);
 366   free (syntax->quote.str2);
 367   free (syntax->comm.str1);
 368   free (syntax->comm.str2);
 369
 370   /* The use of xmemdup0 is exploited by input.c.  */
 371   syntax->quote.str1 = xmemdup0 (DEF_LQUOTE, 1);
 372   syntax->quote.len1 = 1;
 373   syntax->quote.str2 = xmemdup0 (DEF_RQUOTE, 1);
 374   syntax->quote.len2 = 1;
 375   syntax->comm.str1 = xmemdup0 (DEF_BCOMM, 1);
 376   syntax->comm.len1 = 1;
 377   syntax->comm.str2 = xmemdup0 (DEF_ECOMM, 1);
 378   syntax->comm.len2 = 1;
 379   syntax->dollar = '$';
 380
 381   add_syntax_attribute (syntax, syntax->quote.str2[0], M4_SYNTAX_RQUOTE);
 382   add_syntax_attribute (syntax, syntax->comm.str2[0], M4_SYNTAX_ECOMM);
 383   add_syntax_attribute (syntax, '$', M4_SYNTAX_DOLLAR);
 384   add_syntax_attribute (syntax, '{', M4_SYNTAX_LBRACE);
 385   add_syntax_attribute (syntax, '}', M4_SYNTAX_RBRACE);
 386
 387   syntax->is_single_quotes = true;
 388   syntax->is_single_comments = true;
 389   syntax->is_single_dollar = true;
 390   syntax->is_macro_escaped = false;
 391   set_quote_age (syntax, true, false);
 392 }
 393
 394 /* Alter the syntax for category KEY, according to ACTION: '+' to add,
 395    '-' to subtract, '=' to set, or '\0' to reset.  The array CHARS of
 396    length LEN describes the characters to modify; it is ignored if
 397    ACTION is '\0'.  Return -1 if KEY is invalid, otherwise return the
 398    syntax category matching KEY.  */
 399 int
 400 m4_set_syntax (m4_syntax_table *syntax, char key, char action,
 401                const char *chars, size_t len)
 402 {
 403   int code;
 404
 405   assert (syntax && chars);
 406   code = m4_syntax_code (key);
 407   if (code < 0)
 408     {
 409       return -1;
 410     }
 411   syntax->suspect = false;
 412   switch (action)
 413     {
 414     case '+':
 415       add_syntax_set (syntax, chars, len, code);
 416       break;
 417     case '-':
 418       subtract_syntax_set (syntax, chars, len, code);
 419       break;
 420     case '=':
 421       set_syntax_set (syntax, chars, len, code);
 422       break;
 423     case '\0':
 424       assert (!len);
 425       reset_syntax_set (syntax, code);
 426       break;
 427     default:
 428       assert (false);
 429     }
 430
 431   /* Check for any cleanup needed.  */
 432   if (syntax->suspect)
 433     {
 434       int ch;
 435       int lquote = -1;
 436       int rquote = -1;
 437       int bcomm = -1;
 438       int ecomm = -1;
 439       bool single_quote_possible = true;
 440       bool single_comm_possible = true;
 441       int dollar = -1;
 442       if (m4_has_syntax (syntax, syntax->quote.str1[0], M4_SYNTAX_LQUOTE))
 443         {
 444           assert (syntax->quote.len1 == 1);
 445           lquote = to_uchar (syntax->quote.str1[0]);
 446         }
 447       if (m4_has_syntax (syntax, syntax->quote.str2[0], M4_SYNTAX_RQUOTE))
 448         {
 449           assert (syntax->quote.len2 == 1);
 450           rquote = to_uchar (syntax->quote.str2[0]);
 451         }
 452       if (m4_has_syntax (syntax, syntax->comm.str1[0], M4_SYNTAX_BCOMM))
 453         {
 454           assert (syntax->comm.len1 == 1);
 455           bcomm = to_uchar (syntax->comm.str1[0]);
 456         }
 457       if (m4_has_syntax (syntax, syntax->comm.str2[0], M4_SYNTAX_ECOMM))
 458         {
 459           assert (syntax->comm.len2 == 1);
 460           ecomm = to_uchar (syntax->comm.str2[0]);
 461         }
 462       syntax->is_single_dollar = false;
 463       syntax->is_macro_escaped = false;
 464       /* Find candidates for each category.  */
 465       for (ch = UCHAR_MAX + 1; --ch >= 0; )
 466         {
 467           if (m4_has_syntax (syntax, ch, M4_SYNTAX_LQUOTE))
 468             {
 469               if (lquote == -1)
 470                 lquote = ch;
 471               else if (lquote != ch)
 472                 single_quote_possible = false;
 473             }
 474           if (m4_has_syntax (syntax, ch, M4_SYNTAX_RQUOTE))
 475             {
 476               if (rquote == -1)
 477                 rquote = ch;
 478               else if (rquote != ch)
 479                 single_quote_possible = false;
 480             }
 481           if (m4_has_syntax (syntax, ch, M4_SYNTAX_BCOMM))
 482             {
 483               if (bcomm == -1)
 484                 bcomm = ch;
 485               else if (bcomm != ch)
 486                 single_comm_possible = false;
 487             }
 488           if (m4_has_syntax (syntax, ch, M4_SYNTAX_ECOMM))
 489             {
 490               if (ecomm == -1)
 491                 ecomm = ch;
 492               else if (ecomm != ch)
 493                 single_comm_possible = false;
 494             }
 495           if (m4_has_syntax (syntax, ch, M4_SYNTAX_DOLLAR))
 496             {
 497               if (dollar == -1)
 498                 {
 499                   syntax->dollar = dollar = ch;
 500                   syntax->is_single_dollar = true;
 501                 }
 502               else
 503                 syntax->is_single_dollar = false;
 504             }
 505           if (m4_has_syntax (syntax, ch, M4_SYNTAX_ESCAPE))
 506             syntax->is_macro_escaped = true;
 507         }
 508       /* Disable multi-character delimiters if we discovered
 509          delimiters.  */
 510       if (!single_quote_possible)
 511         syntax->is_single_quotes = false;
 512       if (!single_comm_possible)
 513         syntax->is_single_comments = false;
 514       if ((1 < syntax->quote.len1 || 1 < syntax->quote.len2)
 515           && (!syntax->is_single_quotes || lquote != -1 || rquote != -1))
 516         {
 517           if (syntax->quote.len1)
 518             {
 519               syntax->quote.len1 = lquote == to_uchar (syntax->quote.str1[0]);
 520               syntax->quote.str1[syntax->quote.len1] = '\0';
 521             }
 522           if (syntax->quote.len2)
 523             {
 524               syntax->quote.len2 = rquote == to_uchar (syntax->quote.str2[0]);
 525               syntax->quote.str2[syntax->quote.len2] = '\0';
 526             }
 527         }
 528       if ((1 < syntax->comm.len1 || 1 < syntax->comm.len2)
 529           && (!syntax->is_single_comments || bcomm != -1 || ecomm != -1))
 530         {
 531           if (syntax->comm.len1)
 532             {
 533               syntax->comm.len1 = bcomm == to_uchar (syntax->comm.str1[0]);
 534               syntax->comm.str1[syntax->comm.len1] = '\0';
 535             }
 536           if (syntax->comm.len2)
 537             {
 538               syntax->comm.len2 = ecomm == to_uchar (syntax->comm.str2[0]);
 539               syntax->comm.str2[syntax->comm.len2] = '\0';
 540             }
 541         }
 542       /* Update the strings.  */
 543       if (lquote != -1)
 544         {
 545           if (single_quote_possible)
 546             syntax->is_single_quotes = true;
 547           if (syntax->quote.len1)
 548             assert (syntax->quote.len1 == 1);
 549           else
 550             {
 551               free (syntax->quote.str1);
 552               syntax->quote.str1 = xcharalloc (2);
 553               syntax->quote.str1[1] = '\0';
 554               syntax->quote.len1 = 1;
 555             }
 556           syntax->quote.str1[0] = lquote;
 557           if (rquote == -1)
 558             {
 559               rquote = '\'';
 560               add_syntax_attribute (syntax, rquote, M4_SYNTAX_RQUOTE);
 561             }
 562           if (!syntax->quote.len2)
 563             {
 564               free (syntax->quote.str2);
 565               syntax->quote.str2 = xcharalloc (2);
 566             }
 567           syntax->quote.str2[0] = rquote;
 568           syntax->quote.str2[1] = '\0';
 569           syntax->quote.len2 = 1;
 570         }
 571       if (bcomm != -1)
 572         {
 573           if (single_comm_possible)
 574             syntax->is_single_comments = true;
 575           if (syntax->comm.len1)
 576             assert (syntax->comm.len1 == 1);
 577           else
 578             {
 579               free (syntax->comm.str1);
 580               syntax->comm.str1 = xcharalloc (2);
 581               syntax->comm.str1[1] = '\0';
 582               syntax->comm.len1 = 1;
 583             }
 584           syntax->comm.str1[0] = bcomm;
 585           if (ecomm == -1)
 586             {
 587               ecomm = '\n';
 588               add_syntax_attribute (syntax, ecomm, M4_SYNTAX_ECOMM);
 589             }
 590           if (!syntax->comm.len2)
 591             {
 592               free (syntax->comm.str2);
 593               syntax->comm.str2 = xcharalloc (2);
 594             }
 595           syntax->comm.str2[0] = ecomm;
 596           syntax->comm.str2[1] = '\0';
 597           syntax->comm.len2 = 1;
 598         }
 599     }
 600   set_quote_age (syntax, false, true);
 601   m4__quote_uncache (syntax);
 602   return code;
 603 }
 604
 605 \f
 606 /* Functions for setting quotes and comment delimiters.  Used by
 607    m4_changecom () and m4_changequote ().  Both functions override the
 608    syntax table to maintain compatibility.  */
 609
 610 /* Set the quote delimiters to LQ and RQ, with respective lengths
 611    LQ_LEN and RQ_LEN.  Pass NULL if the argument was not present, to
 612    distinguish from an explicit empty string.  */
 613 void
 614 m4_set_quotes (m4_syntax_table *syntax, const char *lq, size_t lq_len,
 615                const char *rq, size_t rq_len)
 616 {
 617   int ch;
 618
 619   assert (syntax);
 620
 621   /* POSIX states that with 0 arguments, the default quotes are used.
 622      POSIX XCU ERN 112 states that behavior is implementation-defined
 623      if there was only one argument, or if there is an empty string in
 624      either position when there are two arguments.  We allow an empty
 625      left quote to disable quoting, but a non-empty left quote will
 626      always create a non-empty right quote.  See the texinfo for what
 627      some other implementations do.  */
 628   if (!lq)
 629     {
 630       lq = DEF_LQUOTE;
 631       lq_len = 1;
 632       rq = DEF_RQUOTE;
 633       rq_len = 1;
 634     }
 635   else if (!rq || (lq_len && !rq_len))
 636     {
 637       rq = DEF_RQUOTE;
 638       rq_len = 1;
 639     }
 640
 641   if (syntax->quote.len1 == lq_len && syntax->quote.len2 == rq_len
 642       && memcmp (syntax->quote.str1, lq, lq_len) == 0
 643       && memcmp (syntax->quote.str2, rq, rq_len) == 0)
 644     return;
 645
 646   free (syntax->quote.str1);
 647   free (syntax->quote.str2);
 648   /* The use of xmemdup0 is exploited by input.c.  */
 649   syntax->quote.str1 = xmemdup0 (lq, lq_len);
 650   syntax->quote.len1 = lq_len;
 651   syntax->quote.str2 = xmemdup0 (rq, rq_len);
 652   syntax->quote.len2 = rq_len;
 653
 654   /* changequote overrides syntax_table, but be careful when it is
 655      used to select a start-quote sequence that is effectively
 656      disabled.  */
 657   syntax->is_single_quotes = true;
 658   for (ch = UCHAR_MAX + 1; --ch >= 0; )
 659     {
 660       if (m4_has_syntax (syntax, ch, M4_SYNTAX_LQUOTE))
 661         add_syntax_attribute (syntax, ch,
 662                               (syntax->orig[ch] == M4_SYNTAX_LQUOTE
 663                                ? M4_SYNTAX_OTHER : syntax->orig[ch]));
 664       if (m4_has_syntax (syntax, ch, M4_SYNTAX_RQUOTE))
 665         remove_syntax_attribute (syntax, ch, M4_SYNTAX_RQUOTE);
 666     }
 667
 668   if (!m4_has_syntax (syntax, *syntax->quote.str1,
 669                       (M4_SYNTAX_IGNORE | M4_SYNTAX_ESCAPE | M4_SYNTAX_ALPHA
 670                        | M4_SYNTAX_NUM)))
 671     {
 672       if (syntax->quote.len1 == 1)
 673         add_syntax_attribute (syntax, syntax->quote.str1[0], M4_SYNTAX_LQUOTE);
 674       if (syntax->quote.len2 == 1)
 675         add_syntax_attribute (syntax, syntax->quote.str2[0], M4_SYNTAX_RQUOTE);
 676     }
 677   set_quote_age (syntax, false, false);
 678 }
 679
 680 /* Set the comment delimiters to BC and EC, with respective lengths
 681    BC_LEN and EC_LEN.  Pass NULL if the argument was not present, to
 682    distinguish from an explicit empty string.  */
 683 void
 684 m4_set_comment (m4_syntax_table *syntax, const char *bc, size_t bc_len,
 685                 const char *ec, size_t ec_len)
 686 {
 687   int ch;
 688
 689   assert (syntax);
 690
 691   /* POSIX requires no arguments to disable comments, and that one
 692      argument use newline as the close-comment.  POSIX XCU ERN 131
 693      states that empty arguments invoke implementation-defined
 694      behavior.  We allow an empty begin comment to disable comments,
 695      and a non-empty begin comment will always create a non-empty end
 696      comment.  See the texinfo for what some other implementations
 697      do.  */
 698   if (!bc)
 699     {
 700       bc = ec = "";
 701       bc_len = ec_len = 0;
 702     }
 703   else if (!ec || (bc_len && !ec_len))
 704     {
 705       ec = DEF_ECOMM;
 706       ec_len = 1;
 707     }
 708
 709   if (syntax->comm.len1 == bc_len && syntax->comm.len2 == ec_len
 710       && memcmp (syntax->comm.str1, bc, bc_len) == 0
 711       && memcmp (syntax->comm.str2, ec, ec_len) == 0)
 712     return;
 713
 714   free (syntax->comm.str1);
 715   free (syntax->comm.str2);
 716   /* The use of xmemdup0 is exploited by input.c.  */
 717   syntax->comm.str1 = xmemdup0 (bc, bc_len);
 718   syntax->comm.len1 = bc_len;
 719   syntax->comm.str2 = xmemdup0 (ec, ec_len);
 720   syntax->comm.len2 = ec_len;
 721
 722   /* changecom overrides syntax_table, but be careful when it is used
 723      to select a start-comment sequence that is effectively
 724      disabled.  */
 725   syntax->is_single_comments = true;
 726   for (ch = UCHAR_MAX + 1; --ch >= 0; )
 727     {
 728       if (m4_has_syntax (syntax, ch, M4_SYNTAX_BCOMM))
 729         add_syntax_attribute (syntax, ch,
 730                               (syntax->orig[ch] == M4_SYNTAX_BCOMM
 731                                ? M4_SYNTAX_OTHER : syntax->orig[ch]));
 732       if (m4_has_syntax (syntax, ch, M4_SYNTAX_ECOMM))
 733         remove_syntax_attribute (syntax, ch, M4_SYNTAX_ECOMM);
 734     }
 735   if (!m4_has_syntax (syntax, *syntax->comm.str1,
 736                       (M4_SYNTAX_IGNORE | M4_SYNTAX_ESCAPE | M4_SYNTAX_ALPHA
 737                        | M4_SYNTAX_NUM | M4_SYNTAX_LQUOTE)))
 738     {
 739       if (syntax->comm.len1 == 1)
 740         add_syntax_attribute (syntax, syntax->comm.str1[0], M4_SYNTAX_BCOMM);
 741       if (syntax->comm.len2 == 1)
 742         add_syntax_attribute (syntax, syntax->comm.str2[0], M4_SYNTAX_ECOMM);
 743     }
 744   set_quote_age (syntax, false, false);
 745 }
 746
 747 /* Call this when changing anything that might impact the quote age,
 748    so that m4__quote_age and m4__safe_quotes will reflect the change.
 749    If RESET, changesyntax was reset to its default stage; if CHANGE,
 750    arbitrary syntax has changed; otherwise, just quotes or comment
 751    delimiters have changed.  */
 752 static void
 753 set_quote_age (m4_syntax_table *syntax, bool reset, bool change)
 754 {
 755   /* Multi-character quotes are inherently unsafe, since concatenation
 756      of individual characters can result in a quote delimiter,
 757      consider:
 758
 759      define(echo,``$1'')define(a,A)changequote(<[,]>)echo(<[]]><[>a]>)
 760      => A]> (not ]>a)
 761
 762    Also, unquoted close delimiters are unsafe, consider:
 763
 764      define(echo,``$1'')define(a,A)echo(`a''`a')
 765      => aA' (not a'a)
 766
 767    Duplicated start and end quote delimiters, as well as comment
 768    delimiters that overlap with quote delimiters or active characters,
 769    also present a problem, consider:
 770
 771      define(echo,$*)echo(a,a,a`'define(a,A)changecom(`,',`,'))
 772      => A,a,A (not A,A,A)
 773
 774    The impact of arbitrary changesyntax is difficult to characterize.
 775    So if things are in their default state, we use 0 for the upper 16
 776    bits of quote_age; otherwise we increment syntax_age for each
 777    changesyntax, but saturate it at 0xffff rather than wrapping
 778    around.  Perhaps a cache of other frequently used states is
 779    warranted, if changesyntax becomes more popular.
 780
 781    Perhaps someday we will fix $@ expansion to use the current
 782    settings of the comma category, or even allow multi-character
 783    argument separators via changesyntax.  Until then, we use a literal
 784    `,' in $@ expansion, therefore we must insist that `,' be an
 785    argument separator for quote_age to be non-zero.
 786
 787    Rather than check every token for an unquoted delimiter, we merely
 788    encode current_quote_age to 0 when things are unsafe, and non-zero
 789    when safe (namely, the syntax_age in the upper 16 bits, coupled
 790    with the 16-bit value composed of the single-character start and
 791    end quote delimiters).  There may be other situations which are
 792    safe even when this algorithm sets the quote_age to zero, but at
 793    least a quote_age of zero always produces correct results (although
 794    it may take more time in doing so).  */
 795
 796   unsigned short local_syntax_age;
 797   if (reset)
 798     local_syntax_age = 0;
 799   else if (change && syntax->syntax_age < 0xffff)
 800     local_syntax_age = ++syntax->syntax_age;
 801   else
 802     local_syntax_age = syntax->syntax_age;
 803   if (local_syntax_age < 0xffff && syntax->is_single_quotes
 804       && syntax->quote.len1 == 1 && syntax->quote.len2 == 1
 805       && !m4_has_syntax (syntax, *syntax->quote.str1,
 806                          (M4_SYNTAX_ALPHA | M4_SYNTAX_NUM | M4_SYNTAX_OPEN
 807                           | M4_SYNTAX_COMMA | M4_SYNTAX_CLOSE
 808                           | M4_SYNTAX_SPACE))
 809       && !m4_has_syntax (syntax, *syntax->quote.str2,
 810                          (M4_SYNTAX_ALPHA | M4_SYNTAX_NUM | M4_SYNTAX_OPEN
 811                           | M4_SYNTAX_COMMA | M4_SYNTAX_CLOSE
 812                           | M4_SYNTAX_SPACE))
 813       && *syntax->quote.str1 != *syntax->quote.str2
 814       && (!syntax->comm.len1
 815           || (*syntax->comm.str1 != *syntax->quote.str2
 816               && !m4_has_syntax (syntax, *syntax->comm.str1,
 817                                  (M4_SYNTAX_OPEN | M4_SYNTAX_COMMA
 818                                   | M4_SYNTAX_CLOSE))))
 819       && m4_has_syntax (syntax, ',', M4_SYNTAX_COMMA))
 820     {
 821       syntax->quote_age = ((local_syntax_age << 16)
 822                            | ((*syntax->quote.str1 & 0xff) << 8)
 823                            | (*syntax->quote.str2 & 0xff));
 824     }
 825   else
 826     syntax->quote_age = 0;
 827 }
 828
 829 /* Interface for caching frequently used quote pairs, independently of
 830    the current quote delimiters (for example, consider a text macro
 831    expansion that includes several copies of $@), and using AGE for
 832    optimization.  If QUOTES is NULL, don't use quoting.  If OBS is
 833    non-NULL, AGE should be the current quote age, and QUOTES should be
 834    m4_get_syntax_quotes; the return value will be a cached quote pair,
 835    where the pointer is valid at least as long as OBS is not reset,
 836    but whose contents are only guaranteed until the next changequote
 837    or quote_cache.  Otherwise, OBS is NULL, AGE should be the same as
 838    before, and QUOTES should be a previously returned cache value;
 839    used to refresh the contents of the result.  */
 840 const m4_string_pair *
 841 m4__quote_cache (m4_syntax_table *syntax, m4_obstack *obs, unsigned int age,
 842                  const m4_string_pair *quotes)
 843 {
 844   /* Implementation - if AGE is non-zero, then the implementation of
 845      set_quote_age guarantees that we can recreate the return value on
 846      the fly; so we use static storage, and the contents must be used
 847      immediately.  If AGE is zero, then we must copy QUOTES onto OBS,
 848      but we might as well cache that copy.  */
 849   if (!quotes)
 850     return NULL;
 851   if (age)
 852     {
 853       *syntax->cached_lquote = (age >> 8) & 0xff;
 854       *syntax->cached_rquote = age & 0xff;
 855       return &syntax->cached_simple;
 856     }
 857   if (!obs)
 858     return quotes;
 859   assert (quotes == &syntax->quote);
 860   if (!syntax->cached_quote)
 861     {
 862       assert (obstack_object_size (obs) == 0);
 863       syntax->cached_quote = (m4_string_pair *) obstack_copy (obs, quotes,
 864                                                               sizeof *quotes);
 865       syntax->cached_quote->str1 = (char *) obstack_copy0 (obs, quotes->str1,
 866                                                            quotes->len1);
 867       syntax->cached_quote->str2 = (char *) obstack_copy0 (obs, quotes->str2,
 868                                                            quotes->len2);
 869     }
 870   return syntax->cached_quote;
 871 }
 872
 873 \f
 874 /* Define these functions at the end, so that calls in the file use the
 875    faster macro version from m4module.h.  */
 876 #undef m4_get_syntax_lquote
 877 const char *
 878 m4_get_syntax_lquote (m4_syntax_table *syntax)
 879 {
 880   assert (syntax);
 881   return syntax->quote.str1;
 882 }
 883
 884 #undef m4_get_syntax_rquote
 885 const char *
 886 m4_get_syntax_rquote (m4_syntax_table *syntax)
 887 {
 888   assert (syntax);
 889   return syntax->quote.str2;
 890 }
 891
 892 #undef m4_get_syntax_quotes
 893 const m4_string_pair *
 894 m4_get_syntax_quotes (m4_syntax_table *syntax)
 895 {
 896   assert (syntax);
 897   return &syntax->quote;
 898 }
 899
 900 #undef m4_is_syntax_single_quotes
 901 bool
 902 m4_is_syntax_single_quotes (m4_syntax_table *syntax)
 903 {
 904   assert (syntax);
 905   return syntax->is_single_quotes;
 906 }
 907
 908 #undef m4_get_syntax_bcomm
 909 const char *
 910 m4_get_syntax_bcomm (m4_syntax_table *syntax)
 911 {
 912   assert (syntax);
 913   return syntax->comm.str1;
 914 }
 915
 916 #undef m4_get_syntax_ecomm
 917 const char *
 918 m4_get_syntax_ecomm (m4_syntax_table *syntax)
 919 {
 920   assert (syntax);
 921   return syntax->comm.str2;
 922 }
 923
 924 #undef m4_get_syntax_comments
 925 const m4_string_pair *
 926 m4_get_syntax_comments (m4_syntax_table *syntax)
 927 {
 928   assert (syntax);
 929   return &syntax->comm;
 930 }
 931
 932 #undef m4_is_syntax_single_comments
 933 bool
 934 m4_is_syntax_single_comments (m4_syntax_table *syntax)
 935 {
 936   assert (syntax);
 937   return syntax->is_single_comments;
 938 }
 939
 940 #undef m4_is_syntax_single_dollar
 941 bool
 942 m4_is_syntax_single_dollar (m4_syntax_table *syntax)
 943 {
 944   assert (syntax);
 945   return syntax->is_single_dollar;
 946 }
 947
 948 #undef m4_is_syntax_macro_escaped
 949 bool
 950 m4_is_syntax_macro_escaped (m4_syntax_table *syntax)
 951 {
 952   assert (syntax);
 953   return syntax->is_macro_escaped;
 954 }