src/input.c

   1 /* GNU m4 -- A simple macro processor
   2
   3    Copyright (C) 1989, 1990, 1991, 1992, 1993, 1994, 2004, 2005, 2006, 2007,
   4    2008 Free Software Foundation, Inc.
   5
   6    This file is part of GNU M4.
   7
   8    GNU M4 is free software: you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation, either version 3 of the License, or
  11    (at your option) any later version.
  12
  13    GNU M4 is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  20 */
  21
  22 /* Handling of different input sources, and lexical analysis.  */
  23
  24 #include "m4.h"
  25
  26 /* Unread input can be either files to be read (command line,
  27    "include", "sinclude"), strings which should be rescanned (macro
  28    expansion text), or quoted macro definitions (as returned by the
  29    builtin "defn").  Unread input is organized in a stack, implemented
  30    with an obstack.  Each input source is described by a "struct
  31    input_block".  The obstack is "current_input".  The top of the
  32    input stack is "isp".
  33
  34    The macro "m4wrap" places the text to be saved on another input
  35    stack, on the obstack "wrapup_stack", whose top is "wsp".  When EOF
  36    is seen on normal input (eg, when "current_input" is empty), input is
  37    switched over to "wrapup_stack", and the original "current_input" is
  38    freed.  A new stack is allocated for "wrapup_stack", which will
  39    accept any text produced by calls to "m4wrap" from within the
  40    wrapped text.  This process of shuffling "wrapup_stack" to
  41    "current_input" can continue indefinitely, even generating infinite
  42    loops (e.g. "define(`f',`m4wrap(`f')')f"), without memory leaks.
  43
  44    Pushing new input on the input stack is done by push_file (),
  45    push_string (), push_wrapup_init/push_wrapup_finish () (for wrapup
  46    text), and push_macro () (for macro definitions).  Because macro
  47    expansion needs direct access to the current input obstack (for
  48    optimization), push_string () is split in two functions,
  49    push_string_init (), which returns a pointer to the current input
  50    stack, and push_string_finish (), which returns a pointer to the
  51    final text.  The input_block *next is used to manage the
  52    coordination between the different push routines.
  53
  54    The current file and line number are stored in two global
  55    variables, for use by the error handling functions in m4.c.  Macro
  56    expansion wants to report the line where a macro name was detected,
  57    rather than where it finished collecting arguments.  This also
  58    applies to text resulting from macro expansions.  So each input
  59    block maintains its own notion of the current file and line, and
  60    swapping between input blocks updates the global variables
  61    accordingly.  */
  62
  63 #ifdef ENABLE_CHANGEWORD
  64 # include "regex.h"
  65 #endif /* ENABLE_CHANGEWORD */
  66
  67 /* Number of bytes where it is more efficient to inline the reference
  68    as a string than it is to track reference bookkeeping for those
  69    bytes.  */
  70 #define INPUT_INLINE_THRESHOLD 16
  71
  72 /* Type of an input block.  */
  73 enum input_type
  74 {
  75   INPUT_STRING,         /* String resulting from macro expansion.  */
  76   INPUT_FILE,           /* File from command line or include.  */
  77   INPUT_MACRO,          /* Builtin resulting from defn.  */
  78   INPUT_CHAIN           /* FIFO chain of separate strings and $@ refs.  */
  79 };
  80
  81 typedef enum input_type input_type;
  82
  83 /* A block of input to be scanned.  */
  84 struct input_block
  85 {
  86   input_block *prev;            /* Previous input_block on the input stack.  */
  87   input_type type;              /* See enum values.  */
  88   const char *file;             /* File where this input is from.  */
  89   int line;                     /* Line where this input is from.  */
  90   union
  91     {
  92       struct
  93         {
  94           char *str;            /* Remaining string value.  */
  95           size_t len;           /* Remaining length.  */
  96         }
  97         u_s;    /* INPUT_STRING */
  98       struct
  99         {
 100           FILE *fp;                  /* Input file handle.  */
 101           bool_bitfield end : 1;     /* True if peek has seen EOF.  */
 102           bool_bitfield close : 1;   /* True to close file on pop.  */
 103           bool_bitfield advance : 1; /* Track previous start_of_input_line.  */
 104         }
 105         u_f;    /* INPUT_FILE */
 106       builtin_func *func;       /* INPUT_MACRO */
 107       struct
 108         {
 109           token_chain *chain;   /* Current link in chain.  */
 110           token_chain *end;     /* Last link in chain.  */
 111         }
 112         u_c;    /* INPUT_CHAIN */
 113     }
 114   u;
 115 };
 116
 117 \f
 118 /* Current input file name.  */
 119 const char *current_file;
 120
 121 /* Current input line number.  */
 122 int current_line;
 123
 124 /* Obstack for storing individual tokens.  */
 125 static struct obstack token_stack;
 126
 127 /* Obstack for storing file names.  */
 128 static struct obstack file_names;
 129
 130 /* Wrapup input stack.  */
 131 static struct obstack *wrapup_stack;
 132
 133 /* Current stack, from input or wrapup.  */
 134 static struct obstack *current_input;
 135
 136 /* Bottom of token_stack, for obstack_free.  */
 137 static void *token_bottom;
 138
 139 /* Pointer to top of current_input.  */
 140 static input_block *isp;
 141
 142 /* Pointer to top of wrapup_stack.  */
 143 static input_block *wsp;
 144
 145 /* Aux. for handling split push_string ().  */
 146 static input_block *next;
 147
 148 /* Flag for next_char () to increment current_line.  */
 149 static bool start_of_input_line;
 150
 151 /* Flag for next_char () to recognize change in input block.  */
 152 static bool input_change;
 153
 154 #define CHAR_EOF        (UCHAR_MAX + 1) /* Return on EOF.  */
 155 #define CHAR_MACRO      (UCHAR_MAX + 2) /* Return for MACRO token.  */
 156 #define CHAR_QUOTE      (UCHAR_MAX + 3) /* Return for quoted string.  */
 157 #define CHAR_ARGV       (UCHAR_MAX + 4) /* Return for $@ reference.  */
 158
 159 /* Quote chars.  */
 160 string_pair curr_quote;
 161
 162 /* Comment chars.  */
 163 string_pair curr_comm;
 164
 165 #ifdef ENABLE_CHANGEWORD
 166
 167 # define DEFAULT_WORD_REGEXP "[_a-zA-Z][_a-zA-Z0-9]*"
 168
 169 /* Current regular expression for detecting words.  */
 170 static struct re_pattern_buffer word_regexp;
 171
 172 /* True if changeword is not active.  */
 173 static bool default_word_regexp;
 174
 175 /* Reused memory for detecting matches in word detection.  */
 176 static struct re_registers regs;
 177
 178 #else /* !ENABLE_CHANGEWORD */
 179 # define default_word_regexp true
 180 #endif /* !ENABLE_CHANGEWORD */
 181
 182 /* Track the current quote age, determined by all significant
 183    changequote, changecom, and changeword calls, since any one of
 184    these can alter the rescan of a prior parameter in a quoted
 185    context.  */
 186 static unsigned int current_quote_age;
 187
 188 /* Cache a quote pair.  See quote_cache.  */
 189 static string_pair *cached_quote;
 190
 191 static bool pop_input (bool);
 192 static void set_quote_age (void);
 193
 194 #ifdef DEBUG_INPUT
 195 static const char *token_type_string (token_type);
 196 #endif /* DEBUG_INPUT */
 197 \f
 198
 199 /*-------------------------------------------------------------------.
 200 | Given an obstack OBS, capture any unfinished text as a link in the |
 201 | chain that starts at *START and ends at *END.  START may be NULL   |
 202 | if *END is non-NULL.                                               |
 203 `-------------------------------------------------------------------*/
 204 void
 205 make_text_link (struct obstack *obs, token_chain **start, token_chain **end)
 206 {
 207   token_chain *chain;
 208   size_t len = obstack_object_size (obs);
 209
 210   assert (end && (start || *end));
 211   if (len)
 212     {
 213       char *str = (char *) obstack_finish (obs);
 214       chain = (token_chain *) obstack_alloc (obs, sizeof *chain);
 215       if (*end)
 216         (*end)->next = chain;
 217       else
 218         *start = chain;
 219       *end = chain;
 220       chain->next = NULL;
 221       chain->type = CHAIN_STR;
 222       chain->quote_age = 0;
 223       chain->u.u_s.str = str;
 224       chain->u.u_s.len = len;
 225       chain->u.u_s.level = -1;
 226     }
 227 }
 228
 229 /*-------------------------------------------------------------------.
 230 | push_file () pushes an input file on the input stack, saving the   |
 231 | current file name and line number.  If next is non-NULL, this push |
 232 | invalidates a call to push_string_init (), whose storage is        |
 233 | consequently released.  If CLOSE, then close FP after EOF is       |
 234 | detected.  TITLE is used as the location for text parsed from the  |
 235 | file (not necessarily the file name).                              |
 236 `-------------------------------------------------------------------*/
 237
 238 void
 239 push_file (FILE *fp, const char *title, bool close)
 240 {
 241   input_block *i;
 242
 243   if (next != NULL)
 244     {
 245       obstack_free (current_input, next);
 246       next = NULL;
 247     }
 248
 249   if (debug_level & DEBUG_TRACE_INPUT)
 250     DEBUG_MESSAGE1 ("input read from %s", title);
 251
 252   i = (input_block *) obstack_alloc (current_input, sizeof *i);
 253   i->type = INPUT_FILE;
 254   i->file = (char *) obstack_copy0 (&file_names, title, strlen (title));
 255   i->line = 1;
 256   input_change = true;
 257
 258   i->u.u_f.fp = fp;
 259   i->u.u_f.end = false;
 260   i->u.u_f.close = close;
 261   i->u.u_f.advance = start_of_input_line;
 262   output_current_line = -1;
 263
 264   i->prev = isp;
 265   isp = i;
 266 }
 267
 268 /*-----------------------------------------------------------------.
 269 | push_macro () pushes the builtin macro FUNC on the input stack.  |
 270 | If next is non-NULL, this push invalidates a call to             |
 271 | push_string_init (), whose storage is consequently released.     |
 272 `-----------------------------------------------------------------*/
 273
 274 void
 275 push_macro (builtin_func *func)
 276 {
 277   input_block *i;
 278
 279   if (next != NULL)
 280     {
 281       obstack_free (current_input, next);
 282       next = NULL;
 283     }
 284
 285   assert (func);
 286   i = (input_block *) obstack_alloc (current_input, sizeof *i);
 287   i->type = INPUT_MACRO;
 288   i->file = current_file;
 289   i->line = current_line;
 290   input_change = true;
 291
 292   i->u.func = func;
 293   i->prev = isp;
 294   isp = i;
 295 }
 296
 297 /*--------------------------------------------------------------.
 298 | First half of push_string ().  The return value points to the |
 299 | obstack where expansion text should be placed.                |
 300 `--------------------------------------------------------------*/
 301
 302 struct obstack *
 303 push_string_init (void)
 304 {
 305   /* Free any memory occupied by completely parsed strings.  */
 306   assert (next == NULL);
 307   while (isp && pop_input (false));
 308
 309   /* Reserve the next location on the obstack.  */
 310   next = (input_block *) obstack_alloc (current_input, sizeof *next);
 311   next->type = INPUT_STRING;
 312   next->file = current_file;
 313   next->line = current_line;
 314
 315   return current_input;
 316 }
 317
 318 /*--------------------------------------------------------------------.
 319 | This function allows gathering input from multiple locations,       |
 320 | rather than copying everything consecutively onto the input stack.  |
 321 | Must be called between push_string_init and push_string_finish.     |
 322 |                                                                     |
 323 | If TOKEN contains text, then convert the current input block into   |
 324 | a chain if it is not one already, and add the contents of TOKEN as  |
 325 | a new link in the chain.  LEVEL describes the current expansion     |
 326 | level, or -1 if TOKEN is composite, its contents reside entirely    |
 327 | on the current_input stack, and TOKEN lives in temporary storage.   |
 328 | If TOKEN is a simple string, then it belongs to the current macro   |
 329 | expansion.  If TOKEN is composite, then each text link has a level  |
 330 | of -1 if it belongs to the current macro expansion, otherwise it    |
 331 | is a back-reference where level tracks which stack it came from.    |
 332 | The resulting input block chain contains links with a level of -1   |
 333 | if the text belongs to the input stack, otherwise the level where   |
 334 | the back-reference comes from.                                      |
 335 |                                                                     |
 336 | Return true only if a reference was created to the contents of      |
 337 | TOKEN, in which case, LEVEL was non-negative and the lifetime of    |
 338 | TOKEN and its contents must last as long as the input engine can    |
 339 | parse references to it.  INUSE determines whether composite tokens  |
 340 | should favor creating back-references or copying text.              |
 341 `--------------------------------------------------------------------*/
 342 bool
 343 push_token (token_data *token, int level, bool inuse)
 344 {
 345   token_chain *src_chain = NULL;
 346   token_chain *chain;
 347
 348   assert (next);
 349
 350   /* Speed consideration - for short enough tokens, the speed and
 351      memory overhead of parsing another INPUT_CHAIN link outweighs the
 352      time to inline the token text.  But don't re-copy text if it
 353      already lives on the obstack.  */
 354   if (TOKEN_DATA_TYPE (token) == TOKEN_TEXT)
 355     {
 356       assert (level >= 0);
 357       if (TOKEN_DATA_LEN (token) <= INPUT_INLINE_THRESHOLD)
 358         {
 359           obstack_grow (current_input, TOKEN_DATA_TEXT (token),
 360                         TOKEN_DATA_LEN (token));
 361           return false;
 362         }
 363     }
 364   else
 365     {
 366       /* For composite tokens, if argv is already in use, creating
 367          additional references for long text segments is more
 368          efficient in time.  But if argv is not yet in use, and we
 369          have a composite token, then the token must already contain a
 370          back-reference, and memory usage is more efficient if we can
 371          avoid using the current expand_macro, even if it means larger
 372          copies.  */
 373       assert (TOKEN_DATA_TYPE (token) == TOKEN_COMP);
 374       src_chain = token->u.u_c.chain;
 375       while (level >= 0 && src_chain && src_chain->type == CHAIN_STR
 376              && (src_chain->u.u_s.len <= INPUT_INLINE_THRESHOLD
 377                  || (!inuse && src_chain->u.u_s.level == -1)))
 378         {
 379           obstack_grow (current_input, src_chain->u.u_s.str,
 380                         src_chain->u.u_s.len);
 381           src_chain = src_chain->next;
 382         }
 383       if (!src_chain)
 384         return false;
 385     }
 386
 387   if (next->type == INPUT_STRING)
 388     {
 389       next->type = INPUT_CHAIN;
 390       next->u.u_c.chain = next->u.u_c.end = NULL;
 391     }
 392   make_text_link (current_input, &next->u.u_c.chain, &next->u.u_c.end);
 393   if (TOKEN_DATA_TYPE (token) == TOKEN_TEXT)
 394     {
 395       chain = (token_chain *) obstack_alloc (current_input, sizeof *chain);
 396       if (next->u.u_c.end)
 397         next->u.u_c.end->next = chain;
 398       else
 399         next->u.u_c.chain = chain;
 400       next->u.u_c.end = chain;
 401       chain->next = NULL;
 402       chain->type = CHAIN_STR;
 403       chain->quote_age = TOKEN_DATA_QUOTE_AGE (token);
 404       chain->u.u_s.str = TOKEN_DATA_TEXT (token);
 405       chain->u.u_s.len = TOKEN_DATA_LEN (token);
 406       chain->u.u_s.level = level;
 407       adjust_refcount (level, true);
 408       inuse = true;
 409     }
 410   while (src_chain)
 411     {
 412       if (level == -1)
 413         {
 414           /* Nothing to copy, since link already lives on obstack.  */
 415           assert (src_chain->type != CHAIN_STR
 416                   || src_chain->u.u_s.level == -1);
 417           chain = src_chain;
 418         }
 419       else
 420         {
 421           /* Allow inlining the final link with subsequent text.  */
 422           if (!src_chain->next && src_chain->type == CHAIN_STR
 423               && (src_chain->u.u_s.len <= INPUT_INLINE_THRESHOLD
 424                   || (!inuse && src_chain->u.u_s.level == -1)))
 425             {
 426               obstack_grow (current_input, src_chain->u.u_s.str,
 427                             src_chain->u.u_s.len);
 428               break;
 429             }
 430           /* We must clone each link in the chain, since next_char
 431              destructively modifies the chain it is parsing.  */
 432           chain = (token_chain *) obstack_copy (current_input, src_chain,
 433                                                 sizeof *chain);
 434           if (chain->type == CHAIN_STR && chain->u.u_s.level == -1)
 435             {
 436               if (chain->u.u_s.len <= INPUT_INLINE_THRESHOLD || !inuse)
 437                 chain->u.u_s.str = (char *) obstack_copy (current_input,
 438                                                           chain->u.u_s.str,
 439                                                           chain->u.u_s.len);
 440               else
 441                 {
 442                   chain->u.u_s.level = level;
 443                   inuse = true;
 444                 }
 445             }
 446         }
 447       if (next->u.u_c.end)
 448         next->u.u_c.end->next = chain;
 449       else
 450         next->u.u_c.chain = chain;
 451       next->u.u_c.end = chain;
 452       if (chain->type == CHAIN_ARGV)
 453         {
 454           assert (!chain->u.u_a.comma && !chain->u.u_a.skip_last);
 455           inuse |= arg_adjust_refcount (chain->u.u_a.argv, true);
 456         }
 457       else if (chain->type == CHAIN_STR && chain->u.u_s.level >= 0)
 458         adjust_refcount (chain->u.u_s.level, true);
 459       src_chain = src_chain->next;
 460     }
 461   return inuse;
 462 }
 463
 464 /*-------------------------------------------------------------------.
 465 | Last half of push_string ().  If next is now NULL, a call to       |
 466 | push_file () or push_macro () has invalidated the previous call to |
 467 | push_string_init (), so we just give up.  If the new object is     |
 468 | void, we do not push it.  The function push_string_finish ()       |
 469 | returns an opaque pointer to the finished object, which can then   |
 470 | be printed with input_print when tracing is enabled.  This pointer |
 471 | is only for temporary use, since reading the next token will       |
 472 | invalidate the object.                                             |
 473 `-------------------------------------------------------------------*/
 474
 475 const input_block *
 476 push_string_finish (void)
 477 {
 478   input_block *ret = NULL;
 479   size_t len = obstack_object_size (current_input);
 480
 481   if (next == NULL)
 482     {
 483       assert (!len);
 484       return NULL;
 485     }
 486
 487   if (len || next->type == INPUT_CHAIN)
 488     {
 489       if (next->type == INPUT_STRING)
 490         {
 491           next->u.u_s.str = (char *) obstack_finish (current_input);
 492           next->u.u_s.len = len;
 493         }
 494       else
 495         make_text_link (current_input, &next->u.u_c.chain, &next->u.u_c.end);
 496       next->prev = isp;
 497       isp = next;
 498       input_change = true;
 499       ret = isp;
 500     }
 501   else
 502     obstack_free (current_input, next);
 503   next = NULL;
 504   return ret;
 505 }
 506
 507 /*--------------------------------------------------------------.
 508 | The function push_wrapup_init () returns an obstack ready for |
 509 | direct expansion of wrapup text, and should be followed by    |
 510 | push_wrapup_finish ().                                        |
 511 `--------------------------------------------------------------*/
 512
 513 struct obstack *
 514 push_wrapup_init (void)
 515 {
 516   input_block *i;
 517   i = (input_block *) obstack_alloc (wrapup_stack, sizeof *i);
 518   i->prev = wsp;
 519   i->type = INPUT_STRING;
 520   i->file = current_file;
 521   i->line = current_line;
 522   wsp = i;
 523   return wrapup_stack;
 524 }
 525
 526 /*---------------------------------------------------------------.
 527 | After pushing wrapup text, push_wrapup_finish () completes the |
 528 | bookkeeping.                                                   |
 529 `---------------------------------------------------------------*/
 530 void
 531 push_wrapup_finish (void)
 532 {
 533   input_block *i = wsp;
 534   if (obstack_object_size (wrapup_stack) == 0)
 535     {
 536       wsp = i->prev;
 537       obstack_free (wrapup_stack, i);
 538     }
 539   else
 540     {
 541       i->u.u_s.len = obstack_object_size (wrapup_stack);
 542       i->u.u_s.str = (char *) obstack_finish (wrapup_stack);
 543     }
 544 }
 545 \f
 546
 547 /*-------------------------------------------------------------------.
 548 | The function pop_input () pops one level of input sources.  If     |
 549 | CLEANUP, and the popped input_block is a file, current_file and    |
 550 | current_line are reset to the saved values before the memory for   |
 551 | the input_block is released.  The return value is false if cleanup |
 552 | is still required, or if the current input source is not           |
 553 | exhausted.                                                         |
 554 `-------------------------------------------------------------------*/
 555
 556 static bool
 557 pop_input (bool cleanup)
 558 {
 559   input_block *tmp = isp->prev;
 560   token_chain *chain;
 561
 562   switch (isp->type)
 563     {
 564     case INPUT_STRING:
 565       assert (!cleanup || !isp->u.u_s.len);
 566       if (isp->u.u_s.len)
 567         return false;
 568       break;
 569
 570     case INPUT_MACRO:
 571       if (!cleanup)
 572         return false;
 573       break;
 574
 575     case INPUT_CHAIN:
 576       chain = isp->u.u_c.chain;
 577       assert (!chain || !cleanup);
 578       while (chain)
 579         {
 580           switch (chain->type)
 581             {
 582             case CHAIN_STR:
 583               if (chain->u.u_s.len)
 584                 return false;
 585               if (chain->u.u_s.level >= 0)
 586                 adjust_refcount (chain->u.u_s.level, false);
 587               break;
 588             case CHAIN_ARGV:
 589               if (chain->u.u_a.index < arg_argc (chain->u.u_a.argv))
 590                 return false;
 591               arg_adjust_refcount (chain->u.u_a.argv, false);
 592               break;
 593             default:
 594               assert (!"pop_input");
 595               abort ();
 596             }
 597           isp->u.u_c.chain = chain = chain->next;
 598         }
 599       break;
 600
 601     case INPUT_FILE:
 602       if (!cleanup)
 603         return false;
 604       if (debug_level & DEBUG_TRACE_INPUT)
 605         {
 606           if (tmp)
 607             DEBUG_MESSAGE2 ("input reverted to %s, line %d",
 608                             tmp->file, tmp->line);
 609           else
 610             DEBUG_MESSAGE ("input exhausted");
 611         }
 612
 613       if (ferror (isp->u.u_f.fp))
 614         {
 615           m4_error (0, 0, NULL, _("read error"));
 616           if (isp->u.u_f.close)
 617             fclose (isp->u.u_f.fp);
 618         }
 619       else if (isp->u.u_f.close && fclose (isp->u.u_f.fp) == EOF)
 620         m4_error (0, errno, NULL, _("error reading file"));
 621       start_of_input_line = isp->u.u_f.advance;
 622       output_current_line = -1;
 623       break;
 624
 625     default:
 626       assert (!"pop_input");
 627       abort ();
 628     }
 629   obstack_free (current_input, isp);
 630   cached_quote = NULL;
 631   next = NULL;                  /* might be set in push_string_init () */
 632
 633   isp = tmp;
 634   input_change = true;
 635   return true;
 636 }
 637
 638 /*------------------------------------------------------------------------.
 639 | To switch input over to the wrapup stack, main () calls pop_wrapup ().  |
 640 | Since wrapup text can install new wrapup text, pop_wrapup () returns    |
 641 | false when there is no wrapup text on the stack, and true otherwise.    |
 642 `------------------------------------------------------------------------*/
 643
 644 bool
 645 pop_wrapup (void)
 646 {
 647   next = NULL;
 648   obstack_free (current_input, NULL);
 649   free (current_input);
 650
 651   if (wsp == NULL)
 652     {
 653       /* End of the program.  Free all memory even though we are about
 654          to exit, since it makes leak detection easier.  */
 655       obstack_free (&token_stack, NULL);
 656       obstack_free (&file_names, NULL);
 657       obstack_free (wrapup_stack, NULL);
 658       free (wrapup_stack);
 659 #ifdef ENABLE_CHANGEWORD
 660       regfree (&word_regexp);
 661 #endif /* ENABLE_CHANGEWORD */
 662       return false;
 663     }
 664
 665   current_input = wrapup_stack;
 666   wrapup_stack = (struct obstack *) xmalloc (sizeof *wrapup_stack);
 667   obstack_init (wrapup_stack);
 668
 669   isp = wsp;
 670   wsp = NULL;
 671   input_change = true;
 672
 673   return true;
 674 }
 675
 676 /*--------------------------------------------------------------.
 677 | Dump a representation of INPUT to the obstack OBS, for use in |
 678 | tracing.                                                      |
 679 `--------------------------------------------------------------*/
 680 void
 681 input_print (struct obstack *obs, const input_block *input)
 682 {
 683   int maxlen = max_debug_argument_length;
 684   token_chain *chain;
 685
 686   assert (input);
 687   switch (input->type)
 688     {
 689     case INPUT_STRING:
 690       shipout_string_trunc (obs, input->u.u_s.str, input->u.u_s.len, &maxlen);
 691       break;
 692     case INPUT_FILE:
 693       obstack_grow (obs, "<file: ", strlen ("<file: "));
 694       obstack_grow (obs, input->file, strlen (input->file));
 695       obstack_1grow (obs, '>');
 696       break;
 697     case INPUT_MACRO:
 698       func_print (obs, find_builtin_by_addr (input->u.func), false, NULL);
 699       break;
 700     case INPUT_CHAIN:
 701       chain = input->u.u_c.chain;
 702       while (chain)
 703         {
 704           switch (chain->type)
 705             {
 706             case CHAIN_STR:
 707               if (shipout_string_trunc (obs, chain->u.u_s.str,
 708                                         chain->u.u_s.len, &maxlen))
 709                 return;
 710               break;
 711             case CHAIN_ARGV:
 712               assert (!chain->u.u_a.comma);
 713               if (arg_print (obs, chain->u.u_a.argv, chain->u.u_a.index,
 714                              quote_cache (NULL, chain->quote_age,
 715                                           chain->u.u_a.quotes),
 716                              chain->u.u_a.flatten, NULL, &maxlen, false))
 717                 return;
 718               break;
 719             default:
 720               assert (!"input_print");
 721               abort ();
 722             }
 723           chain = chain->next;
 724         }
 725       break;
 726     default:
 727       assert (!"input_print");
 728       abort ();
 729     }
 730 }
 731 \f
 732
 733 /*------------------------------------------------------------------.
 734 | Low level input is done a character at a time.  The function      |
 735 | peek_input () is used to look at the next character in the input  |
 736 | stream.  At any given time, it reads from the input_block on the  |
 737 | top of the current input stack.  The return value is an unsigned  |
 738 | char, CHAR_EOF if there is no more input, CHAR_MACRO if a builtin |
 739 | token occurs next, or CHAR_ARGV if ALLOW_ARGV and the input is    |
 740 | visiting an argv reference with the correct quoting.              |
 741 `------------------------------------------------------------------*/
 742
 743 static int
 744 peek_input (bool allow_argv)
 745 {
 746   int ch;
 747   input_block *block = isp;
 748   token_chain *chain;
 749
 750   while (1)
 751     {
 752       if (block == NULL)
 753         return CHAR_EOF;
 754
 755       switch (block->type)
 756         {
 757         case INPUT_STRING:
 758           if (!block->u.u_s.len)
 759             break;
 760           return to_uchar (block->u.u_s.str[0]);
 761
 762         case INPUT_FILE:
 763           ch = getc (block->u.u_f.fp);
 764           if (ch != EOF)
 765             {
 766               ungetc (ch, block->u.u_f.fp);
 767               return ch;
 768             }
 769           block->u.u_f.end = true;
 770           break;
 771
 772         case INPUT_MACRO:
 773           return CHAR_MACRO;
 774
 775         case INPUT_CHAIN:
 776           chain = block->u.u_c.chain;
 777           while (chain)
 778             {
 779               unsigned int argc;
 780               switch (chain->type)
 781                 {
 782                 case CHAIN_STR:
 783                   if (chain->u.u_s.len)
 784                     return to_uchar (*chain->u.u_s.str);
 785                   break;
 786                 case CHAIN_ARGV:
 787                   argc = arg_argc (chain->u.u_a.argv);
 788                   if (chain->u.u_a.index == argc)
 789                     break;
 790                   if (chain->u.u_a.comma)
 791                     return ',';
 792                   /* Only return a reference if the quoting is correct
 793                      and the reference has more than one argument
 794                      left.  */
 795                   if (allow_argv && chain->quote_age == current_quote_age
 796                       && chain->u.u_a.quotes && chain->u.u_a.index + 1 < argc)
 797                     return CHAR_ARGV;
 798                   /* Rather than directly parse argv here, we push
 799                      another input block containing the next unparsed
 800                      argument from argv.  */
 801                   push_string_init ();
 802                   push_arg_quote (current_input, chain->u.u_a.argv,
 803                                   chain->u.u_a.index,
 804                                   quote_cache (NULL, chain->quote_age,
 805                                                chain->u.u_a.quotes));
 806                   chain->u.u_a.index++;
 807                   chain->u.u_a.comma = true;
 808                   push_string_finish ();
 809                   return peek_input (allow_argv);
 810                 default:
 811                   assert (!"peek_input");
 812                   abort ();
 813                 }
 814               chain = chain->next;
 815             }
 816           break;
 817
 818         default:
 819           assert (!"peek_input");
 820           abort ();
 821         }
 822       block = block->prev;
 823     }
 824 }
 825
 826 /*-------------------------------------------------------------------.
 827 | The function next_char () is used to read and advance the input to |
 828 | the next character.  It also manages line numbers for error        |
 829 | messages, so they do not get wrong due to lookahead.  The token    |
 830 | consisting of a newline alone is taken as belonging to the line it |
 831 | ends, and the current line number is not incremented until the     |
 832 | next character is read.  99.9% of all calls will read from a       |
 833 | string, so factor that out into a macro for speed.  If             |
 834 | ALLOW_QUOTE, and the current input matches the current quote age,  |
 835 | return CHAR_QUOTE and leave consumption of data for                |
 836 | append_quote_token.                                                |
 837 `-------------------------------------------------------------------*/
 838
     /* Fast path: taken only when the top input block is a string with
        at least one byte left and no input change is pending (a pending
        change must go through next_char_1 so that current_file and
        current_line get refreshed).  The byte is returned as an unsigned
        char value.  Every other situation is delegated to next_char_1;
        note that AQ is evaluated only on that slow path, and that isp is
        evaluated several times.  */
 839 #define next_char(AQ)                                                   \
 840   (isp && isp->type == INPUT_STRING && isp->u.u_s.len && !input_change  \
 841    ? (isp->u.u_s.len--, to_uchar (*isp->u.u_s.str++))                   \
 842    : next_char_1 (AQ))
 843
     /* Slow path of the next_char macro.  Loop over the input stack,
        popping exhausted blocks, until something can be returned:

        - CHAR_EOF when the input stack is empty (current_file and
          current_line are reset first);
        - CHAR_MACRO for an INPUT_MACRO block, which is popped before
          returning;
        - CHAR_QUOTE when ALLOW_QUOTE is set and the link at the head of
          an INPUT_CHAIN block carries the current quote age; the link is
          left in place for the caller to consume;
        - otherwise the next input byte, as an unsigned char value.  */
 844 static int
 845 next_char_1 (bool allow_quote)
 846 {
 847   int ch;
 848   token_chain *chain;
 849
 850   while (1)
 851     {
 852       if (isp == NULL)
 853         {
 854           current_file = "";
 855           current_line = 0;
 856           return CHAR_EOF;
 857         }
 858
           /* A pending input change means the reported location must be
              resynchronized with the block that is now on top.  */
 859       if (input_change)
 860         {
 861           current_file = isp->file;
 862           current_line = isp->line;
 863           input_change = false;
 864         }
 865
 866       switch (isp->type)
 867         {
 868         case INPUT_STRING:
 869           if (!isp->u.u_s.len)
 870             break;
 871           isp->u.u_s.len--;
 872           return to_uchar (*isp->u.u_s.str++);
 873
 874         case INPUT_FILE:
               /* The previous read returned a newline; only now, on the
                  following read, is the line number advanced.  */
 875           if (start_of_input_line)
 876             {
 877               start_of_input_line = false;
 878               current_line = ++isp->line;
 879             }
 880
 881           /* If stdin is a terminal, calling getc after peek_input
 882              already called it would make the user have to hit ^D
 883              twice to quit.  */
 884           ch = isp->u.u_f.end ? EOF : getc (isp->u.u_f.fp);
 885           if (ch != EOF)
 886             {
 887               if (ch == '\n')
 888                 start_of_input_line = true;
 889               return ch;
 890             }
 891           break;
 892
 893         case INPUT_MACRO:
 894           /* An INPUT_MACRO input source holds only one token, so the
                  block is popped as soon as that token is reported.  */
 895           pop_input (true);
 896           return CHAR_MACRO;
 897
 898         case INPUT_CHAIN:
 899           chain = isp->u.u_c.chain;
 900           while (chain)
 901             {
                   /* A link whose quote age is still current can be handed
                      over whole; nothing is consumed here.  */
 902               if (allow_quote && chain->quote_age == current_quote_age)
 903                 return CHAR_QUOTE;
 904               switch (chain->type)
 905                 {
 906                 case CHAIN_STR:
 907                   if (chain->u.u_s.len)
 908                     {
 909                       /* Partial consumption invalidates quote age.  */
 910                       chain->quote_age = 0;
 911                       chain->u.u_s.len--;
 912                       return to_uchar (*chain->u.u_s.str++);
 913                     }
                       /* Text exhausted.  A non-negative level means the link
                          holds a reference that must now be released
                          (presumably a decrement -- confirm against
                          adjust_refcount).  */
 914                   if (chain->u.u_s.level >= 0)
 915                     adjust_refcount (chain->u.u_s.level, false);
 916                   break;
 917                 case CHAIN_ARGV:
                       /* Every argument has been handed out: drop the
                          reference on argv and move on to the next link.  */
 918                   if (chain->u.u_a.index == arg_argc (chain->u.u_a.argv))
 919                     {
 920                       arg_adjust_refcount (chain->u.u_a.argv, false);
 921                       break;
 922                     }
                       /* Emit the separator owed before the next argument;
                          clearing the flag makes the following call push
                          the argument itself.  */
 923                   if (chain->u.u_a.comma)
 924                     {
 925                       chain->u.u_a.comma = false;
 926                       return ',';
 927                     }
 928                   /* Rather than directly parse argv here, we push
 929                      another input block containing the next unparsed
 930                      argument from argv.  */
 931                   push_string_init ();
 932                   push_arg_quote (current_input, chain->u.u_a.argv,
 933                                   chain->u.u_a.index,
 934                                   quote_cache (NULL, chain->quote_age,
 935                                                chain->u.u_a.quotes));
 936                   chain->u.u_a.index++;
 937                   chain->u.u_a.comma = true;
 938                   push_string_finish ();
                       /* Start over so that the read is served from the
                          block that was just pushed.  */
 939                   return next_char_1 (allow_quote);
 940                 default:
 941                   assert (!"next_char_1");
 942                   abort ();
 943                 }
                   /* Step past the exhausted link, keeping the block's head
                      pointer in sync.  */
 944               isp->u.u_c.chain = chain = chain->next;
 945             }
 946           break;
 947
 948         default:
 949           assert (!"next_char_1");
 950           abort ();
 951         }
 952
 953       /* End of input source --- pop one level.  */
 954       pop_input (true);
 955     }
 956 }
 957
 958 /*-------------------------------------------------------------------.
 959 | skip_line () simply discards all immediately following characters, |
 960 | up to the first newline.  It is only used from m4_dnl ().  Report  |
 961 | warnings on behalf of NAME.                                        |
 962 `-------------------------------------------------------------------*/
 963
 964 void
 965 skip_line (const char *name)
 966 {
 967   int ch;
       /* Location on entry, kept both for the warning below and to detect
          whether reading moved current_file/current_line.  */
 968   const char *file = current_file;
 969   int line = current_line;
 970
       /* The newline itself is consumed.  Any other value returned by
          next_char, including non-character codes such as CHAR_MACRO, is
          discarded.  */
 971   while ((ch = next_char (false)) != CHAR_EOF && ch != '\n')
 972     ;
 973   if (ch == CHAR_EOF)
 974     /* current_file changed to "" if we see CHAR_EOF, use the
 975        previous value we stored earlier.  */
 976     m4_warn_at_line (0, file, line, name,
 977                      _("end of file treated as newline"));
 978   /* On the rare occasion that dnl crosses include file boundaries
 979      (either the input file did not end in a newline, or changeword
 980      was used), calling next_char can update current_file and
 981      current_line, and that update will be undone as we return to
 982      expand_macro.  This informs next_char to fix things again.  */
 983   if (file != current_file || line != current_line)
 984     input_change = true;
 985 }
 986
 987 /*-------------------------------------------------------------------.
 988 | When a MACRO token is seen, next_token () uses init_macro_token () |
 989 | to retrieve the value of the function pointer and store it in TD.  |
 990 `-------------------------------------------------------------------*/
 991
 992 static void
 993 init_macro_token (token_data *td)
 994 {
       /* The top of the input stack must be the INPUT_MACRO block itself.
          It is only read here, not popped, so this has to run before the
          character is consumed.  */
 995   assert (isp->type == INPUT_MACRO);
 996   TOKEN_DATA_TYPE (td) = TOKEN_FUNC;
 997   TOKEN_DATA_FUNC (td) = isp->u.func;
 998 }
 999
1000 /*-------------------------------------------------------------------.
1001 | When a QUOTE token is seen, convert TD to a composite (if it is    |
1002 | not one already), consisting of any unfinished text on OBS, as     |
1003 | well as the quoted token from the top of the input stack.  Use OBS |
1004 | for any additional allocations needed to store the token chain.    |
1005 `-------------------------------------------------------------------*/
1006 static void
1007 append_quote_token (struct obstack *obs, token_data *td)
1008 {
       /* Note that the head link is read before the assertion below has
          checked that isp really is an INPUT_CHAIN block.  */
1009   token_chain *src_chain = isp->u.u_c.chain;
1010   token_chain *chain;
1011
1012   assert (isp->type == INPUT_CHAIN && obs && current_quote_age);
       /* Detach the head link from the input; it is either inlined or
          cloned below.  */
1013   isp->u.u_c.chain = src_chain->next;
1014
1015   /* Speed consideration - for short enough tokens, the speed and
1016      memory overhead of parsing another INPUT_CHAIN link outweighs the
1017      time to inline the token text.  */
1018   if (src_chain->type == CHAIN_STR
1019       && src_chain->u.u_s.len <= INPUT_INLINE_THRESHOLD)
1020     {
           /* The text is appended to the object growing on OBS and the
              link's reference is released; TD is left untouched on this
              path.  */
1021       assert (src_chain->u.u_s.level >= 0);
1022       obstack_grow (obs, src_chain->u.u_s.str, src_chain->u.u_s.len);
1023       adjust_refcount (src_chain->u.u_s.level, false);
1024       return;
1025     }
1026
       /* First link for this token: start an empty composite.  */
1027   if (TOKEN_DATA_TYPE (td) == TOKEN_VOID)
1028     {
1029       TOKEN_DATA_TYPE (td) = TOKEN_COMP;
1030       td->u.u_c.chain = td->u.u_c.end = NULL;
1031       td->u.u_c.wrapper = td->u.u_c.has_func = false;
1032     }
1033   assert (TOKEN_DATA_TYPE (td) == TOKEN_COMP);
       /* Presumably turns any text pending on OBS into a link at the end
          of TD's chain, so ordering is preserved -- confirm against
          make_text_link.  */
1034   make_text_link (obs, &td->u.u_c.chain, &td->u.u_c.end);
       /* Clone the input link onto OBS and append the clone to TD's
          chain.  */
1035   chain = (token_chain *) obstack_copy (obs, src_chain, sizeof *chain);
1036   if (td->u.u_c.end)
1037     td->u.u_c.end->next = chain;
1038   else
1039     td->u.u_c.chain = chain;
1040   td->u.u_c.end = chain;
1041   if (chain->type == CHAIN_ARGV && chain->u.u_a.has_func)
1042     td->u.u_c.has_func = true;
       /* The clone is the new tail; it must not keep pointing into the
          input chain.  */
1043   chain->next = NULL;
1044 }
1045
1046
1047 /*-------------------------------------------------------------------.
1048 | When an ARGV token is seen, convert TD to point to it via a        |
1049 | composite token.  Use OBS for any additional allocations needed to |
1050 | store the token chain.                                             |
1051 `-------------------------------------------------------------------*/
1052 static void
1053 init_argv_token (struct obstack *obs, token_data *td)
1054 {
1055   token_chain *src_chain;
1056   token_chain *chain;
       /* With ALLOW_QUOTE set, next_char is expected to report CHAR_QUOTE
          and leave the argv link at the head of the input chain; the
          assertion below checks exactly that.  */
1057   int ch = next_char (true);
1058
1059   assert (ch == CHAR_QUOTE && TOKEN_DATA_TYPE (td) == TOKEN_VOID
1060           && isp->type == INPUT_CHAIN && isp->u.u_c.chain->type == CHAIN_ARGV
1061           && obs && obstack_object_size (obs) == 0);
1062
       /* Detach the argv link from the input and make TD a composite
          token wrapping a clone of it.  */
1063   src_chain = isp->u.u_c.chain;
1064   isp->u.u_c.chain = src_chain->next;
1065   TOKEN_DATA_TYPE (td) = TOKEN_COMP;
1066   /* Clone the link, since the input will be discarded soon.  */
1067   chain = (token_chain *) obstack_copy (obs, src_chain, sizeof *chain);
1068   td->u.u_c.chain = td->u.u_c.end = chain;
1069   td->u.u_c.wrapper = true;
1070   td->u.u_c.has_func = chain->u.u_a.has_func;
1071   chain->next = NULL;
1072
1073   /* If the next character is not ',' or ')', then unlink the last
1074      argument from argv and schedule it for reparsing.  This way,
1075      expand_argument never has to deal with concatenation of argv with
1076      arbitrary text.  Note that the implementation of safe_quotes
1077      ensures peek_input won't return CHAR_ARGV if the user is perverse
1078      enough to mix comment delimiters with argument separators:
1079
1080        define(n,`$#')define(echo,$*)changecom(`,,',`)')n(echo(a,`,b`)'',c))
1081        => 2 (not 3)
1082
1083      Therefore, we do not have to worry about calling MATCH, and thus
1084      do not have to worry about pop_input being called and
1085      invalidating the argv reference.
1086
1087      When the $@ ref is used unchanged, we completely bypass the
1088      decrement of the argv refcount in next_char_1, since the ref is
1089      still live via the current collect_arguments.  However, when the
1090      last element of the $@ ref is reparsed, we must increase the argv
1091      refcount here, to compensate for the fact that it will be
1092      decreased once the final element is parsed.  */
1093   assert (*curr_comm.str1 != ',' && *curr_comm.str1 != ')'
1094           && *curr_comm.str1 != *curr_quote.str1);
1095   ch = peek_input (false);
1096   if (ch != ',' && ch != ')')
1097     {
           /* Put the original link back on the input, positioned on the
              final argument with its leading comma still owed, and mark
              the clone so that it leaves that last argument out.  */
1098       isp->u.u_c.chain = src_chain;
1099       src_chain->u.u_a.index = arg_argc (chain->u.u_a.argv) - 1;
1100       src_chain->u.u_a.comma = true;
1101       chain->u.u_a.skip_last = true;
1102       arg_adjust_refcount (chain->u.u_a.argv, true);
1103     }
1104 }
1105 \f
1106
1107 /*------------------------------------------------------------------.
1108 | This function is for matching a string against a prefix of the    |
1109 | input stream.  If the string S matches the input and CONSUME is   |
1110 | true, the input is discarded; otherwise any characters read are   |
1111 | pushed back again.  The function is used only when multicharacter |
1112 | quotes or comment delimiters are used.                            |
1113 `------------------------------------------------------------------*/
1114
     /* Return true if the input starts with S, false otherwise.  */
1115 static bool
1116 match_input (const char *s, bool consume)
1117 {
1118   int n;                        /* number of characters matched */
1119   int ch;                       /* input character */
1120   const char *t;
1121   bool result = false;
1122
       /* Cheap rejection on the first character, without consuming
          anything.  */
1123   ch = peek_input (false);
1124   if (ch != to_uchar (*s))
1125     return false;                       /* fail */
1126
1127   if (s[1] == '\0')
1128     {
1129       if (consume)
1130         next_char (false);
1131       return true;                      /* short match */
1132     }
1133
       /* Consume the first character, then keep consuming for as long as
          the input agrees with S.  T remembers the start of S so that
          the N consumed characters can be pushed back; S is advanced to
          the character being compared.  */
1134   next_char (false);
1135   for (n = 1, t = s++; (ch = peek_input (false)) == to_uchar (*s++); )
1136     {
1137       next_char (false);
1138       n++;
1139       if (*s == '\0')           /* long match */
1140         {
1141           if (consume)
1142             return true;
1143           result = true;
1144           break;
1145         }
1146     }
1147
1148   /* Failed or shouldn't consume, push back input.  */
       /* Everything consumed so far matched the first N bytes of the
          original S, so pushing those bytes restores the input.  */
1149   push_string_init ();
1150   obstack_grow (current_input, t, n);
1151   push_string_finish ();
1152   return result;
1153 }
1154
1155 /*--------------------------------------------------------------------.
1156 | The macro MATCH() is used to match a string S against the input.    |
1157 | The first character is handled inline, for speed.  Hopefully, this  |
1158 | will not hurt efficiency too much when single character quotes and  |
1159 | comment delimiters are used.  If CONSUME, then CH is the result of  |
1160 | next_char, and a successful match will discard the matched string.  |
1161 | Otherwise, CH is the result of peek_input, and the input stream is  |
1162 | effectively unchanged.                                              |
1163 `--------------------------------------------------------------------*/
1164
     /* Details of the expansion:
        - the test against '\0' means an empty S, whose first byte is
          NUL, never matches, not even against a NUL input byte;
        - a one-character S is decided entirely inline;
        - for a longer S, match_input is started at S + CONSUME: when
          CONSUME is true the first character has already been read by
          the caller, so matching resumes at the second character,
          whereas when it is false the peeked character is still in the
          input and S is matched from its start.
        CH and S are each evaluated more than once.  */
1165 #define MATCH(ch, s, consume)                                           \
1166   (to_uchar ((s)[0]) == (ch)                                            \
1167    && (ch) != '\0'                                                      \
1168    && ((s)[1] == '\0' || (match_input ((s) + (consume), consume))))
1169 \f
1170
1171 /*----------------------------------------------------------.
1172 | Initialize input stacks, and quote/comment characters.   |
1173 `----------------------------------------------------------*/
1174
1175 void
1176 input_init (void)
1177 {
       /* Nothing is being read yet, so there is no location to report.  */
1178   current_file = "";
1179   current_line = 0;
1180
       /* The two input obstacks are heap-allocated and initialized
          here.  */
1181   current_input = (struct obstack *) xmalloc (sizeof *current_input);
1182   obstack_init (current_input);
1183   wrapup_stack = (struct obstack *) xmalloc (sizeof *wrapup_stack);
1184   obstack_init (wrapup_stack);
1185
1186   obstack_init (&file_names);
1187
1188   /* Allocate an object in the current chunk, so that obstack_free
1189      will always work even if the first token parsed spills to a new
1190      chunk.  */
1191   obstack_init (&token_stack);
1192   token_bottom = obstack_finish (&token_stack);
1193
       /* Both input stacks start out empty.  */
1194   isp = NULL;
1195   wsp = NULL;
1196   next = NULL;
1197
1198   start_of_input_line = false;
1199
       /* Install private copies of the default quote and comment
          delimiters, caching each length next to its string.  */
1200   curr_quote.str1 = xstrdup (DEF_LQUOTE);
1201   curr_quote.len1 = strlen (curr_quote.str1);
1202   curr_quote.str2 = xstrdup (DEF_RQUOTE);
1203   curr_quote.len2 = strlen (curr_quote.str2);
1204   curr_comm.str1 = xstrdup (DEF_BCOMM);
1205   curr_comm.len1 = strlen (curr_comm.str1);
1206   curr_comm.str2 = xstrdup (DEF_ECOMM);
1207   curr_comm.len2 = strlen (curr_comm.str2);
1208
1209 #ifdef ENABLE_CHANGEWORD
1210   set_word_regexp (NULL, user_word_regexp);
1211 #endif /* ENABLE_CHANGEWORD */
1212
       /* Derive the quote age from the delimiters installed above.  */
1213   set_quote_age ();
1214 }
1215 \f
1216
1217 /*--------------------------------------------------------------------.
1218 | Set the quote delimiters to LQ and RQ.  Used by m4_changequote ().  |
1219 | Pass NULL if the argument was not present, to distinguish from an   |
1220 | explicit empty string.                                              |
1221 `--------------------------------------------------------------------*/
1222
1223 void
1224 set_quotes (const char *lq, const char *rq)
1225 {
1226   /* POSIX states that with 0 arguments, the default quotes are used.
1227      POSIX XCU ERN 112 states that behavior is implementation-defined
1228      if there was only one argument, or if there is an empty string in
1229      either position when there are two arguments.  We allow an empty
1230      left quote to disable quoting, but a non-empty left quote will
1231      always create a non-empty right quote.  See the texinfo for what
1232      some other implementations do.  */
1233   if (!lq)
1234     {
1235       lq = DEF_LQUOTE;
1236       rq = DEF_RQUOTE;
1237     }
1238   else if (!rq || (*lq && !*rq))
1239     rq = DEF_RQUOTE;
1240
       /* Nothing to do if the effective delimiters are unchanged; in
          particular the quote age is not recomputed.  */
1241   if (strcmp (curr_quote.str1, lq) == 0 && strcmp (curr_quote.str2, rq) == 0)
1242     return;
1243
       /* Replace the stored delimiters with fresh private copies, then
          recompute the quote age.  */
1244   free (curr_quote.str1);
1245   free (curr_quote.str2);
1246   curr_quote.str1 = xstrdup (lq);
1247   curr_quote.len1 = strlen (curr_quote.str1);
1248   curr_quote.str2 = xstrdup (rq);
1249   curr_quote.len2 = strlen (curr_quote.str2);
1250   set_quote_age ();
1251 }
1252
1253 /*--------------------------------------------------------------------.
1254 | Set the comment delimiters to BC and EC.  Used by m4_changecom ().  |
1255 | Pass NULL if the argument was not present, to distinguish from an   |
1256 | explicit empty string.                                              |
1257 `--------------------------------------------------------------------*/
1258
1259 void
1260 set_comment (const char *bc, const char *ec)
1261 {
1262   /* POSIX requires no arguments to disable comments.  It requires
1263      empty arguments to be used as-is, but this is counter to
1264      traditional behavior, because a non-null begin and null end makes
1265      it impossible to end a comment.  An aardvark has been filed:
1266      http://www.opengroup.org/austin/mailarchives/ag-review/msg02168.html
1267      This implementation assumes the aardvark will be approved.  See
1268      the texinfo for what some other implementations do.  */
       /* A missing BC leaves both delimiters empty.  A missing EC, or an
          empty EC paired with a non-empty BC, falls back to DEF_ECOMM.  */
1269   if (!bc)
1270     bc = ec = "";
1271   else if (!ec || (*bc && !*ec))
1272     ec = DEF_ECOMM;
1273
       /* Nothing to do if the effective delimiters are unchanged.  */
1274   if (strcmp (curr_comm.str1, bc) == 0 && strcmp (curr_comm.str2, ec) == 0)
1275     return;
1276
       /* Replace the stored delimiters with fresh private copies.  The
          quote age is recomputed as well, because set_quote_age inspects
          the first character of the begin-comment delimiter.  */
1277   free (curr_comm.str1);
1278   free (curr_comm.str2);
1279   curr_comm.str1 = xstrdup (bc);
1280   curr_comm.len1 = strlen (curr_comm.str1);
1281   curr_comm.str2 = xstrdup (ec);
1282   curr_comm.len2 = strlen (curr_comm.str2);
1283   set_quote_age ();
1284 }
1285
1286 #ifdef ENABLE_CHANGEWORD
1287
1288 /*-------------------------------------------------------------------.
1289 | Set the regular expression for recognizing words to REGEXP, and    |
1290 | report errors on behalf of CALLER.  If REGEXP is NULL, revert back |
1291 | to the default parsing rules.                                      |
1292 `-------------------------------------------------------------------*/
     /* An empty REGEXP, or one equal to DEFAULT_WORD_REGEXP, selects the
        default rules as well.  A REGEXP that fails to compile produces a
        warning and leaves the current word rules untouched.  */
1293
1294 void
1295 set_word_regexp (const char *caller, const char *regexp)
1296 {
1297   const char *msg;
1298   struct re_pattern_buffer new_word_regexp;
1299
       /* NULL is tested first, so that the documented "REGEXP is NULL"
          case is honored instead of being dereferenced.  */
1300   if (!regexp || !*regexp || !strcmp (regexp, DEFAULT_WORD_REGEXP))
1301     {
1302       default_word_regexp = true;
1303       set_quote_age ();
1304       return;
1305     }
1306
1307   /* Dry run to see whether the new expression is compilable.  */
1308   init_pattern_buffer (&new_word_regexp, NULL);
1309   msg = re_compile_pattern (regexp, strlen (regexp), &new_word_regexp);
1310   regfree (&new_word_regexp);
1311
1312   if (msg != NULL)
1313     {
1314       m4_warn (0, caller, _("bad regular expression `%s': %s"), regexp, msg);
1315       return;
1316     }
1317
1318   /* If compilation worked, retry using the word_regexp struct.  We
1319      can't rely on struct assigns working, so redo the compilation.
1320      The fastmap can be reused between compilations, and will be freed
1321      by the final regfree.  */
1322   if (!word_regexp.fastmap)
1323     word_regexp.fastmap = xcharalloc (UCHAR_MAX + 1);
1324   msg = re_compile_pattern (regexp, strlen (regexp), &word_regexp);
       /* The identical pattern compiled during the dry run above.  */
1325   assert (!msg);
1326   re_set_registers (&word_regexp, &regs, regs.num_regs, regs.start, regs.end);
1327   if (re_compile_fastmap (&word_regexp))
1328     assert (false);
1329
       /* Switch to the custom rules; the quote age depends on
          default_word_regexp, so it is recomputed.  */
1330   default_word_regexp = false;
1331   set_quote_age ();
1332 }
1333
1334 #endif /* ENABLE_CHANGEWORD */
1335
1336 /* Call this when changing anything that might impact the quote age,
1337    so that quote_age and safe_quotes will reflect the change.  */
1338 static void
1339 set_quote_age (void)
1340 {
1341   /* Multi-character quotes are inherently unsafe, since concatenation
1342      of individual characters can result in a quote delimiter,
1343      consider:
1344
1345      define(echo,``$1'')define(a,A)changequote(<[,]>)echo(<[]]><[>a]>)
1346      => A]> (not ]>a)
1347
1348    Also, unquoted close delimiters are unsafe, consider:
1349
1350      define(echo,``$1'')define(a,A)echo(`a''`a')
1351      => aA' (not a'a)
1352
1353    Comment delimiters that overlap with quote delimiters or active
1354    characters also present a problem, consider:
1355
1356      define(echo,$*)echo(a,a,a`'define(a,A)changecom(`,',`,'))
1357      => A,a,A (not A,A,A)
1358
1359    And let's not even think about the impact of changeword, since it
1360    will disappear for M4 2.0.
1361
1362    So rather than check every token for an unquoted delimiter, we
1363    merely encode current_quote_age to 0 when things are unsafe, and
1364    non-zero when safe (namely, to the 16-bit value composed of the
1365    single-character start and end quote delimiters).  There may be
1366    other situations which are safe even when this algorithm sets the
1367    quote_age to zero, but at least a quote_age of zero always produces
1368    correct results (although it may take more time in doing so).  */
1369
1370   /* Heuristic of characters that might impact rescan if they appear in
1371      a quote delimiter.  */
1372 #define Letters "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
1373   static const char unsafe[] = Letters "_0123456789(,) \t\n\r\f\v";
1374 #undef Letters
1375
       /* Safe only if all of the following hold: both quote delimiters
          are exactly one character long; neither is listed in UNSAFE;
          the default word rules are in effect; the two delimiters
          differ; and the begin-comment delimiter does not start with
          '(', ',', ')' or the left quote.  The age then holds the left
          quote in the high byte and the right quote in the low byte.  */
1376   if (curr_quote.len1 == 1 && curr_quote.len2 == 1
1377       && strpbrk (curr_quote.str1, unsafe) == NULL
1378       && strpbrk (curr_quote.str2, unsafe) == NULL
1379       && default_word_regexp && *curr_quote.str1 != *curr_quote.str2
1380       && *curr_comm.str1 != '(' && *curr_comm.str1 != ','
1381       && *curr_comm.str1 != ')' && *curr_comm.str1 != *curr_quote.str1)
1382     current_quote_age = (((*curr_quote.str1 & 0xff) << 8)
1383                          | (*curr_quote.str2 & 0xff));
1384   else
1385     current_quote_age = 0;
       /* Any copy remembered by quote_cache is now stale.  */
1386   cached_quote = NULL;
1387 }
1388
1389 /* Return the current quote age.  Each non-trivial changequote alters
1390    this value; the idea is that if quoting hasn't changed, then we can
1391    skip parsing a single argument, quoted or unquoted, within the
1392    context of a quoted string, as well as skip parsing a series of
1393    quoted arguments within the context of argument collection.  A
1394    result of zero means the current delimiters are considered
       unsafe, so no such shortcut applies.  */
1394 unsigned int
1395 quote_age (void)
1396 {
1397   /* This accessor is a function, so that the implementation can
1398      change if needed.  See set_quote_age for the current
1399      implementation.  */
1400   return current_quote_age;
1401 }
1402
1403 /* Return true if the current quote delimiters guarantee that
1404    reparsing the current token in the context of a quoted string will
1405    be safe.  This could always return false and behavior would still
1406    be correct, just slower.  */
1407 bool
1408 safe_quotes (void)
1409 {
       /* set_quote_age stores zero exactly when the delimiters are
          unsafe.  */
1410   return current_quote_age != 0;
1411 }
1412
1413 /* Interface for caching frequently used quote pairs, using AGE for
1414    optimization.  If QUOTES is NULL, don't use quoting.  If OBS is
1415    non-NULL, AGE should be the current quote age, and QUOTES should be
1416    &curr_quote; the return value will be a cached quote pair, where
1417    the pointer is valid at least as long as OBS is not reset, but
1418    whose contents are only guaranteed until the next changequote or
1419    quote_cache.  Otherwise, OBS is NULL, AGE should be the same as
1420    before, and QUOTES should be a previously returned cache value;
1421    used to refresh the contents of the result.  */
1422 const string_pair *
1423 quote_cache (struct obstack *obs, unsigned int age, const string_pair *quotes)
1424 {
       /* One-character delimiter strings; as static storage they start
          zeroed, and only the first byte is ever written, so each stays
          NUL-terminated.  */
1425   static char lquote[2];
1426   static char rquote[2];
1427   static string_pair simple = {lquote, 1, rquote, 1};
1428
1429   /* Implementation - if AGE is non-zero, then the implementation of
1430      set_quote_age guarantees that we can recreate the return value on
1431      the fly; so we use static storage, and the contents must be used
1432      immediately.  If AGE is zero, then we must copy QUOTES onto OBS
1433      (since changequote will invalidate the original), but we might as
1434      well cache that copy (in case the current expansion contains more
1435      than one instance of $@).  */
1436   if (!quotes)
1437     return NULL;
1438   if (age)
1439     {
           /* Unpack the pair from AGE: left quote in the high byte, right
              quote in the low byte.  */
1440       *lquote = (age >> 8) & 0xff;
1441       *rquote = age & 0xff;
1442       return &simple;
1443     }
       /* Refresh request with an unsafe age: QUOTES is handed back
          unchanged.  */
1444   if (!obs)
1445     return quotes;
1446   assert (next && quotes == &curr_quote);
1447   if (!cached_quote)
1448     {
           /* First request since the cache was last cleared: copy the pair
              and both of its strings onto OBS.  */
1449       assert (obs == current_input && obstack_object_size (obs) == 0);
1450       cached_quote = (string_pair *) obstack_copy (obs, quotes,
1451                                                    sizeof *quotes);
1452       cached_quote->str1 = (char *) obstack_copy0 (obs, quotes->str1,
1453                                                    quotes->len1);
1454       cached_quote->str2 = (char *) obstack_copy0 (obs, quotes->str2,
1455                                                    quotes->len2);
1456     }
1457   return cached_quote;
1458 }
1459 \f
1460
1461 /*--------------------------------------------------------------------.
1462 | Parse a single token from the input stream, set TD to its           |
1463 | contents, and return its type.  A token is TOKEN_EOF if the         |
1464 | input_stack is empty; TOKEN_STRING for a quoted string or comment;  |
1465 | TOKEN_WORD for something that is a potential macro name; and        |
1466 | TOKEN_SIMPLE for any single character that is not a part of any of  |
1467 | the previous types.  If LINE is not NULL, set *LINE to the line     |
1468 | where the token starts.  If OBS is not NULL, expand TOKEN_STRING    |
1469 | directly into OBS rather than in token_stack temporary storage      |
1470 | area, and TD could be a TOKEN_COMP instead of the usual             |
1471 | TOKEN_TEXT.  If ALLOW_ARGV, OBS must be non-NULL, and an entire     |
1472 | series of arguments can be returned as TOKEN_ARGV when a $@         |
1473 | reference is encountered.  Report errors (unterminated comments or  |
1474 | strings) on behalf of CALLER, if non-NULL.                          |
1475 |                                                                     |
1476 | Next_token () returns the token type, and passes back a pointer to  |
1477 | the token data through TD.  Non-string token text is collected on   |
1478 | the obstack token_stack, which never contains more than one token   |
1479 | text at a time.  The storage pointed to by the fields in TD is      |
1480 | therefore subject to change the next time next_token () is called.  |
1481 `--------------------------------------------------------------------*/
1482
1483 token_type
1484 next_token (token_data *td, int *line, struct obstack *obs, bool allow_argv,
1485             const char *caller)
1486 {
1487   int ch;
1488   int quote_level;
1489   token_type type;
1490 #ifdef ENABLE_CHANGEWORD
1491   char *orig_text = NULL;
1492 #endif /* ENABLE_CHANGEWORD */
1493   const char *file;
1494   int dummy;
1495   /* The obstack where token data is stored.  Generally token_stack,
1496      for tokens where argument collection might not use the literal
1497      token.  But for comments and strings, we can output directly into
1498      the argument collection obstack obs, if one was provided.  */
1499   struct obstack *obs_td = &token_stack;
1500
1501   obstack_free (&token_stack, token_bottom);
1502   if (!line)
1503     line = &dummy;
1504
1505   /* Can't consume character until after CHAR_MACRO is handled.  */
1506   TOKEN_DATA_TYPE (td) = TOKEN_VOID;
1507   ch = peek_input (allow_argv && current_quote_age);
1508   if (ch == CHAR_EOF)
1509     {
1510 #ifdef DEBUG_INPUT
1511       xfprintf (stderr, "next_token -> EOF\n");
1512 #endif /* DEBUG_INPUT */
1513       next_char (false);
1514       return TOKEN_EOF;
1515     }
1516   if (ch == CHAR_MACRO)
1517     {
1518       init_macro_token (td);
1519       next_char (false);
1520 #ifdef DEBUG_INPUT
1521       xfprintf (stderr, "next_token -> MACDEF (%s)\n",
1522                 find_builtin_by_addr (TOKEN_DATA_FUNC (td))->name);
1523 #endif /* DEBUG_INPUT */
1524       return TOKEN_MACDEF;
1525     }
1526   if (ch == CHAR_ARGV)
1527     {
1528       init_argv_token (obs, td);
1529 #ifdef DEBUG_INPUT
1530       xfprintf (stderr, "next_token -> ARGV (%d args)\n",
1531                 (arg_argc (td->u.u_c.chain->u.u_a.argv)
1532                  - td->u.u_c.chain->u.u_a.index
1533                  - (td->u.u_c.chain->u.u_a.skip_last ? 1 : 0)));
1534 #endif
1535       return TOKEN_ARGV;
1536     }
1537
1538   next_char (false); /* Consume character we already peeked at.  */
1539   file = current_file;
1540   *line = current_line;
1541   if (MATCH (ch, curr_comm.str1, true))
1542     {
1543       if (obs)
1544         obs_td = obs;
1545       obstack_grow (obs_td, curr_comm.str1, curr_comm.len1);
1546       while ((ch = next_char (false)) < CHAR_EOF
1547              && !MATCH (ch, curr_comm.str2, true))
1548         obstack_1grow (obs_td, ch);
1549       if (ch != CHAR_EOF)
1550         {
1551           assert (ch < CHAR_EOF);
1552           obstack_grow (obs_td, curr_comm.str2, curr_comm.len2);
1553         }
1554       else
1555         /* Current_file changed to "" if we see CHAR_EOF, use the
1556            previous value we stored earlier.  */
1557         m4_error_at_line (EXIT_FAILURE, 0, file, *line, caller,
1558                           _("end of file in comment"));
1559
1560       type = TOKEN_STRING;
1561     }
1562   else if (default_word_regexp && (isalpha (ch) || ch == '_'))
1563     {
1564       obstack_1grow (&token_stack, ch);
1565       while ((ch = peek_input (false)) < CHAR_EOF
1566              && (isalnum (ch) || ch == '_'))
1567         {
1568           obstack_1grow (&token_stack, ch);
1569           next_char (false);
1570         }
1571       type = TOKEN_WORD;
1572     }
1573
1574 #ifdef ENABLE_CHANGEWORD
1575
1576   else if (!default_word_regexp && word_regexp.fastmap[ch])
1577     {
1578       obstack_1grow (&token_stack, ch);
1579       while (1)
1580         {
1581           ch = peek_input (false);
1582           if (ch >= CHAR_EOF)
1583             break;
1584           obstack_1grow (&token_stack, ch);
1585           if (re_match (&word_regexp, (char *) obstack_base (&token_stack),
1586                         obstack_object_size (&token_stack), 0, &regs)
1587               != obstack_object_size (&token_stack))
1588             {
1589               obstack_blank (&token_stack, -1);
1590               break;
1591             }
1592           next_char (false);
1593         }
1594
1595       obstack_1grow (&token_stack, '\0');
1596       orig_text = (char *) obstack_finish (&token_stack);
1597
1598       if (regs.start[1] != -1)
1599         obstack_grow (&token_stack, orig_text + regs.start[1],
1600                       regs.end[1] - regs.start[1]);
1601       else
1602         obstack_grow (&token_stack, orig_text, regs.end[0]);
1603
1604       type = TOKEN_WORD;
1605     }
1606
1607 #endif /* ENABLE_CHANGEWORD */
1608
1609   else if (!MATCH (ch, curr_quote.str1, true))
1610     {
1611       switch (ch)
1612         {
1613         case '(':
1614           type = TOKEN_OPEN;
1615           break;
1616         case ',':
1617           type = TOKEN_COMMA;
1618           break;
1619         case ')':
1620           type = TOKEN_CLOSE;
1621           break;
1622         default:
1623           type = TOKEN_SIMPLE;
1624           break;
1625         }
1626       obstack_1grow (&token_stack, ch);
1627     }
1628   else
1629     {
1630       if (obs)
1631         obs_td = obs;
1632       quote_level = 1;
1633       while (1)
1634         {
1635           ch = next_char (obs != NULL && current_quote_age);
1636           if (ch == CHAR_EOF)
1637             /* Current_file changed to "" if we see CHAR_EOF, use
1638                the previous value we stored earlier.  */
1639             m4_error_at_line (EXIT_FAILURE, 0, file, *line, caller,
1640                               _("end of file in string"));
1641
1642           if (ch == CHAR_QUOTE)
1643             append_quote_token (obs, td);
1644           else if (MATCH (ch, curr_quote.str2, true))
1645             {
1646               if (--quote_level == 0)
1647                 break;
1648               obstack_grow (obs_td, curr_quote.str2, curr_quote.len2);
1649             }
1650           else if (MATCH (ch, curr_quote.str1, true))
1651             {
1652               quote_level++;
1653               obstack_grow (obs_td, curr_quote.str1, curr_quote.len1);
1654             }
1655           else
1656             {
1657               assert (ch < CHAR_EOF);
1658               obstack_1grow (obs_td, ch);
1659             }
1660         }
1661       type = TOKEN_STRING;
1662     }
1663
1664   if (TOKEN_DATA_TYPE (td) == TOKEN_VOID)
1665     {
1666       TOKEN_DATA_TYPE (td) = TOKEN_TEXT;
1667       TOKEN_DATA_LEN (td) = obstack_object_size (obs_td);
1668       if (obs_td != obs)
1669         {
1670           obstack_1grow (obs_td, '\0');
1671           TOKEN_DATA_TEXT (td) = (char *) obstack_finish (obs_td);
1672         }
1673       else
1674         TOKEN_DATA_TEXT (td) = NULL;
1675       TOKEN_DATA_QUOTE_AGE (td) = current_quote_age;
1676 #ifdef ENABLE_CHANGEWORD
1677       if (orig_text == NULL)
1678         TOKEN_DATA_ORIG_TEXT (td) = TOKEN_DATA_TEXT (td);
1679       else
1680         {
1681           TOKEN_DATA_ORIG_TEXT (td) = orig_text;
1682           TOKEN_DATA_LEN (td) = strlen (orig_text);
1683         }
1684 #endif /* ENABLE_CHANGEWORD */
1685 #ifdef DEBUG_INPUT
1686       xfprintf (stderr, "next_token -> %s (%s), len %zu\n",
1687                 token_type_string (type), TOKEN_DATA_TEXT (td),
1688                 TOKEN_DATA_LEN (td));
1689 #endif /* DEBUG_INPUT */
1690     }
1691   else
1692     {
1693       assert (TOKEN_DATA_TYPE (td) == TOKEN_COMP && type == TOKEN_STRING);
1694 #ifdef DEBUG_INPUT
1695       {
1696         token_chain *chain;
1697         size_t len = 0;
1698         int links = 0;
1699         chain = td->u.u_c.chain;
1700         xfprintf (stderr, "next_token -> %s <chain> (",
1701                   token_type_string (type));
1702         while (chain)
1703           {
1704             switch (chain->type)
1705               {
1706               case CHAIN_STR:
1707                 xfprintf (stderr, "%s", chain->u.u_s.str);
1708                 len += chain->u.u_s.len;
1709                 break;
1710               case CHAIN_ARGV:
1711                 xfprintf (stderr, "{$@}");
1712                 break;
1713               default:
1714                 assert (!"next_token");
1715                 abort ();
1716               }
1717             links++;
1718             chain = chain->next;
1719           }
1720         xfprintf (stderr, "), %d links, len %zu\n",
1721                   links, len);
1722       }
1723 #endif /* DEBUG_INPUT */
1724     }
1725   return type;
1726 }
1727
1728 /*-----------------------------------------------.
1729 | Peek at the next token from the input stream.  |
1730 `-----------------------------------------------*/
1731
1732 token_type
1733 peek_token (void)
1734 {
1735   token_type result;
1736   int ch = peek_input (false);
1737
1738   if (ch == CHAR_EOF)
1739     {
1740       result = TOKEN_EOF;
1741     }
1742   else if (ch == CHAR_MACRO)
1743     {
1744       result = TOKEN_MACDEF;
1745     }
1746   else if (MATCH (ch, curr_comm.str1, false))
1747     {
1748       result = TOKEN_STRING;
1749     }
1750   else if ((default_word_regexp && (isalpha (ch) || ch == '_'))
1751 #ifdef ENABLE_CHANGEWORD
1752       || (!default_word_regexp && word_regexp.fastmap[ch])
1753 #endif /* ENABLE_CHANGEWORD */
1754       )
1755     {
1756       result = TOKEN_WORD;
1757     }
1758   else if (MATCH (ch, curr_quote.str1, false))
1759     {
1760       result = TOKEN_STRING;
1761     }
1762   else
1763     switch (ch)
1764       {
1765       case '(':
1766         result = TOKEN_OPEN;
1767         break;
1768       case ',':
1769         result = TOKEN_COMMA;
1770         break;
1771       case ')':
1772         result = TOKEN_CLOSE;
1773         break;
1774       default:
1775         result = TOKEN_SIMPLE;
1776       }
1777
1778 #ifdef DEBUG_INPUT
1779   xfprintf (stderr, "peek_token -> %s\n", token_type_string (result));
1780 #endif /* DEBUG_INPUT */
1781   return result;
1782 }
1783 \f
1784
1785 #ifdef DEBUG_INPUT
1786
1787 static const char *
1788 token_type_string (token_type t)
1789 {
1790  switch (t)
1791     {                           /* TOKSW */
1792     case TOKEN_EOF:
1793       return "EOF";
1794     case TOKEN_STRING:
1795       return "STRING";
1796     case TOKEN_WORD:
1797       return "WORD";
1798     case TOKEN_OPEN:
1799       return "OPEN";
1800     case TOKEN_COMMA:
1801       return "COMMA";
1802     case TOKEN_CLOSE:
1803       return "CLOSE";
1804     case TOKEN_SIMPLE:
1805       return "SIMPLE";
1806     case TOKEN_MACDEF:
1807       return "MACDEF";
1808     default:
1809       abort ();
1810     }
1811  }
1812
1813 static void
1814 print_token (const char *s, token_type t, token_data *td)
1815 {
1816   xfprintf (stderr, "%s: ", s);
1817   switch (t)
1818     {                           /* TOKSW */
1819     case TOKEN_OPEN:
1820     case TOKEN_COMMA:
1821     case TOKEN_CLOSE:
1822     case TOKEN_SIMPLE:
1823       xfprintf (stderr, "char:");
1824       break;
1825
1826     case TOKEN_WORD:
1827       xfprintf (stderr, "word:");
1828       break;
1829
1830     case TOKEN_STRING:
1831       xfprintf (stderr, "string:");
1832       break;
1833
1834     case TOKEN_MACDEF:
1835       xfprintf (stderr, "macro: %p\n", TOKEN_DATA_FUNC (td));
1836       break;
1837
1838     case TOKEN_EOF:
1839       xfprintf (stderr, "eof\n");
1840       break;
1841     }
1842   xfprintf (stderr, "\t\"%s\"\n", TOKEN_DATA_TEXT (td));
1843 }
1844
1845 static void M4_GNUC_UNUSED
1846 lex_debug (void)
1847 {
1848   token_type t;
1849   token_data td;
1850
1851   while ((t = next_token (&td, NULL, NULL, false, "<debug>")) != TOKEN_EOF)
1852     print_token ("lex", t, &td);
1853 }
1854 #endif /* DEBUG_INPUT */