s/obstack_print/shipout_string_trunc/.
[m4/ericb.git] / src / input.c
blobe2d332a5a3dc5b47c1c62bf36358ccd71896e181
/* GNU m4 -- A simple macro processor

   Copyright (C) 1989, 1990, 1991, 1992, 1993, 1994, 2004, 2005, 2006, 2007,
   2008 Free Software Foundation, Inc.

   This file is part of GNU M4.

   GNU M4 is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   GNU M4 is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
22 /* Handling of different input sources, and lexical analysis. */
24 #include "m4.h"
/* Unread input can be either files to be read (command line,
   "include", "sinclude"), strings which should be rescanned (macro
   expansion text), or quoted macro definitions (as returned by the
   builtin "defn").  Unread input is organized in a stack, implemented
   with an obstack.  Each input source is described by a "struct
   input_block".  The obstack is "current_input".  The top of the
   input stack is "isp".

   The macro "m4wrap" places the text to be saved on another input
   stack, on the obstack "wrapup_stack", whose top is "wsp".  When EOF
   is seen on normal input (e.g., when "current_input" is empty), input
   is switched over to "wrapup_stack", and the original "current_input"
   is freed.  A new stack is allocated for "wrapup_stack", which will
   accept any text produced by calls to "m4wrap" from within the
   wrapped text.  This process of shuffling "wrapup_stack" to
   "current_input" can continue indefinitely, even generating infinite
   loops (e.g. "define(`f',`m4wrap(`f')')f"), without memory leaks.

   Pushing new input on the input stack is done by push_file (),
   push_string (), push_wrapup_init/push_wrapup_finish () (for wrapup
   text), and push_macro () (for macro definitions).  Because macro
   expansion needs direct access to the current input obstack (for
   optimization), push_string () is split in two functions,
   push_string_init (), which returns a pointer to the current input
   stack, and push_string_finish (), which returns a pointer to the
   final text.  The input_block *next is used to manage the
   coordination between the different push routines.

   The current file and line number are stored in two global
   variables, for use by the error handling functions in m4.c.  Macro
   expansion wants to report the line where a macro name was detected,
   rather than where it finished collecting arguments.  This also
   applies to text resulting from macro expansions.  So each input
   block maintains its own notion of the current file and line, and
   swapping between input blocks updates the global variables
   accordingly.  */
63 #ifdef ENABLE_CHANGEWORD
64 # include "regex.h"
65 #endif /* ENABLE_CHANGEWORD */
/* Length cutoff, in bytes: text no longer than this is copied inline
   as a string, because that is cheaper than tracking reference
   bookkeeping for so few bytes.  */
#define INPUT_INLINE_THRESHOLD 16
/* Discriminator for the kinds of input_block.  */
enum input_type
{
  INPUT_STRING, /* Text produced by a macro expansion.  */
  INPUT_FILE,   /* A file named on the command line or included.  */
  INPUT_MACRO,  /* A builtin, as produced by defn.  */
  INPUT_CHAIN   /* FIFO chain of separate strings and $@ references.  */
};

typedef enum input_type input_type;
83 /* A block of input to be scanned. */
84 struct input_block
86 input_block *prev; /* Previous input_block on the input stack. */
87 input_type type; /* See enum values. */
88 const char *file; /* File where this input is from. */
89 int line; /* Line where this input is from. */
90 union
92 struct
94 char *str; /* Remaining string value. */
95 size_t len; /* Remaining length. */
97 u_s; /* INPUT_STRING */
98 struct
100 FILE *fp; /* Input file handle. */
101 bool_bitfield end : 1; /* True if peek has seen EOF. */
102 bool_bitfield close : 1; /* True to close file on pop. */
103 bool_bitfield advance : 1; /* Track previous start_of_input_line. */
105 u_f; /* INPUT_FILE */
106 builtin_func *func; /* INPUT_MACRO */
107 struct
109 token_chain *chain; /* Current link in chain. */
110 token_chain *end; /* Last link in chain. */
112 u_c; /* INPUT_CHAIN */
118 /* Current input file name. */
119 const char *current_file;
121 /* Current input line number. */
122 int current_line;
124 /* Obstack for storing individual tokens. */
125 static struct obstack token_stack;
127 /* Obstack for storing file names. */
128 static struct obstack file_names;
130 /* Wrapup input stack. */
131 static struct obstack *wrapup_stack;
133 /* Current stack, from input or wrapup. */
134 static struct obstack *current_input;
136 /* Bottom of token_stack, for obstack_free. */
137 static void *token_bottom;
139 /* Pointer to top of current_input. */
140 static input_block *isp;
142 /* Pointer to top of wrapup_stack. */
143 static input_block *wsp;
145 /* Aux. for handling split push_string (). */
146 static input_block *next;
148 /* Flag for next_char () to increment current_line. */
149 static bool start_of_input_line;
151 /* Flag for next_char () to recognize change in input block. */
152 static bool input_change;
154 #define CHAR_EOF (UCHAR_MAX + 1) /* Return on EOF. */
155 #define CHAR_MACRO (UCHAR_MAX + 2) /* Return for MACRO token. */
156 #define CHAR_QUOTE (UCHAR_MAX + 3) /* Return for quoted string. */
157 #define CHAR_ARGV (UCHAR_MAX + 4) /* Return for $@ reference. */
159 /* Quote chars. */
160 string_pair curr_quote;
162 /* Comment chars. */
163 string_pair curr_comm;
165 #ifdef ENABLE_CHANGEWORD
167 # define DEFAULT_WORD_REGEXP "[_a-zA-Z][_a-zA-Z0-9]*"
169 /* Current regular expression for detecting words. */
170 static struct re_pattern_buffer word_regexp;
172 /* True if changeword is not active. */
173 static bool default_word_regexp;
175 /* Reused memory for detecting matches in word detection. */
176 static struct re_registers regs;
178 #else /* !ENABLE_CHANGEWORD */
179 # define default_word_regexp true
180 #endif /* !ENABLE_CHANGEWORD */
182 /* Track the current quote age, determined by all significant
183 changequote, changecom, and changeword calls, since any one of
184 these can alter the rescan of a prior parameter in a quoted
185 context. */
186 static unsigned int current_quote_age;
188 /* Cache a quote pair. See quote_cache. */
189 static string_pair *cached_quote;
191 static bool pop_input (bool);
192 static void set_quote_age (void);
194 #ifdef DEBUG_INPUT
195 static const char *token_type_string (token_type);
196 #endif /* DEBUG_INPUT */
199 /*-------------------------------------------------------------------.
200 | Given an obstack OBS, capture any unfinished text as a link in the |
201 | chain that starts at *START and ends at *END. START may be NULL |
202 | if *END is non-NULL. |
203 `-------------------------------------------------------------------*/
204 void
205 make_text_link (struct obstack *obs, token_chain **start, token_chain **end)
207 token_chain *chain;
208 size_t len = obstack_object_size (obs);
210 assert (end && (start || *end));
211 if (len)
213 char *str = (char *) obstack_finish (obs);
214 chain = (token_chain *) obstack_alloc (obs, sizeof *chain);
215 if (*end)
216 (*end)->next = chain;
217 else
218 *start = chain;
219 *end = chain;
220 chain->next = NULL;
221 chain->type = CHAIN_STR;
222 chain->quote_age = 0;
223 chain->u.u_s.str = str;
224 chain->u.u_s.len = len;
225 chain->u.u_s.level = -1;
229 /*-------------------------------------------------------------------.
230 | push_file () pushes an input file on the input stack, saving the |
231 | current file name and line number. If next is non-NULL, this push |
232 | invalidates a call to push_string_init (), whose storage is |
233 | consequently released. If CLOSE, then close FP after EOF is |
234 | detected. TITLE is used as the location for text parsed from the |
235 | file (not necessarily the file name). |
236 `-------------------------------------------------------------------*/
238 void
239 push_file (FILE *fp, const char *title, bool close)
241 input_block *i;
243 if (next != NULL)
245 obstack_free (current_input, next);
246 next = NULL;
249 if (debug_level & DEBUG_TRACE_INPUT)
250 DEBUG_MESSAGE1 ("input read from %s", title);
252 i = (input_block *) obstack_alloc (current_input, sizeof *i);
253 i->type = INPUT_FILE;
254 i->file = (char *) obstack_copy0 (&file_names, title, strlen (title));
255 i->line = 1;
256 input_change = true;
258 i->u.u_f.fp = fp;
259 i->u.u_f.end = false;
260 i->u.u_f.close = close;
261 i->u.u_f.advance = start_of_input_line;
262 output_current_line = -1;
264 i->prev = isp;
265 isp = i;
268 /*-----------------------------------------------------------------.
269 | push_macro () pushes the builtin macro FUNC on the input stack. |
270 | If next is non-NULL, this push invalidates a call to |
271 | push_string_init (), whose storage is consequently released. |
272 `-----------------------------------------------------------------*/
274 void
275 push_macro (builtin_func *func)
277 input_block *i;
279 if (next != NULL)
281 obstack_free (current_input, next);
282 next = NULL;
285 assert (func);
286 i = (input_block *) obstack_alloc (current_input, sizeof *i);
287 i->type = INPUT_MACRO;
288 i->file = current_file;
289 i->line = current_line;
290 input_change = true;
292 i->u.func = func;
293 i->prev = isp;
294 isp = i;
297 /*--------------------------------------------------------------.
298 | First half of push_string (). The return value points to the |
299 | obstack where expansion text should be placed. |
300 `--------------------------------------------------------------*/
302 struct obstack *
303 push_string_init (void)
305 /* Free any memory occupied by completely parsed strings. */
306 assert (next == NULL);
307 while (isp && pop_input (false));
309 /* Reserve the next location on the obstack. */
310 next = (input_block *) obstack_alloc (current_input, sizeof *next);
311 next->type = INPUT_STRING;
312 next->file = current_file;
313 next->line = current_line;
315 return current_input;
318 /*--------------------------------------------------------------------.
319 | This function allows gathering input from multiple locations, |
320 | rather than copying everything consecutively onto the input stack. |
321 | Must be called between push_string_init and push_string_finish. |
323 | If TOKEN contains text, then convert the current input block into |
324 | a chain if it is not one already, and add the contents of TOKEN as |
325 | a new link in the chain. LEVEL describes the current expansion |
326 | level, or -1 if TOKEN is composite, its contents reside entirely |
327 | on the current_input stack, and TOKEN lives in temporary storage. |
328 | If TOKEN is a simple string, then it belongs to the current macro |
329 | expansion. If TOKEN is composite, then each text link has a level |
330 | of -1 if it belongs to the current macro expansion, otherwise it |
331 | is a back-reference where level tracks which stack it came from. |
332 | The resulting input block chain contains links with a level of -1 |
333 | if the text belongs to the input stack, otherwise the level where |
334 | the back-reference comes from. |
336 | Return true only if a reference was created to the contents of |
337 | TOKEN, in which case, LEVEL was non-negative and the lifetime of |
338 | TOKEN and its contents must last as long as the input engine can |
339 | parse references to it. INUSE determines whether composite tokens |
340 | should favor creating back-references or copying text. |
341 `--------------------------------------------------------------------*/
342 bool
343 push_token (token_data *token, int level, bool inuse)
345 token_chain *src_chain = NULL;
346 token_chain *chain;
348 assert (next);
350 /* Speed consideration - for short enough tokens, the speed and
351 memory overhead of parsing another INPUT_CHAIN link outweighs the
352 time to inline the token text. But don't re-copy text if it
353 already lives on the obstack. */
354 if (TOKEN_DATA_TYPE (token) == TOKEN_TEXT)
356 assert (level >= 0);
357 if (TOKEN_DATA_LEN (token) <= INPUT_INLINE_THRESHOLD)
359 obstack_grow (current_input, TOKEN_DATA_TEXT (token),
360 TOKEN_DATA_LEN (token));
361 return false;
364 else
366 /* For composite tokens, if argv is already in use, creating
367 additional references for long text segments is more
368 efficient in time. But if argv is not yet in use, and we
369 have a composite token, then the token must already contain a
370 back-reference, and memory usage is more efficient if we can
371 avoid using the current expand_macro, even if it means larger
372 copies. */
373 assert (TOKEN_DATA_TYPE (token) == TOKEN_COMP);
374 src_chain = token->u.u_c.chain;
375 while (level >= 0 && src_chain && src_chain->type == CHAIN_STR
376 && (src_chain->u.u_s.len <= INPUT_INLINE_THRESHOLD
377 || (!inuse && src_chain->u.u_s.level == -1)))
379 obstack_grow (current_input, src_chain->u.u_s.str,
380 src_chain->u.u_s.len);
381 src_chain = src_chain->next;
383 if (!src_chain)
384 return false;
387 if (next->type == INPUT_STRING)
389 next->type = INPUT_CHAIN;
390 next->u.u_c.chain = next->u.u_c.end = NULL;
392 make_text_link (current_input, &next->u.u_c.chain, &next->u.u_c.end);
393 if (TOKEN_DATA_TYPE (token) == TOKEN_TEXT)
395 chain = (token_chain *) obstack_alloc (current_input, sizeof *chain);
396 if (next->u.u_c.end)
397 next->u.u_c.end->next = chain;
398 else
399 next->u.u_c.chain = chain;
400 next->u.u_c.end = chain;
401 chain->next = NULL;
402 chain->type = CHAIN_STR;
403 chain->quote_age = TOKEN_DATA_QUOTE_AGE (token);
404 chain->u.u_s.str = TOKEN_DATA_TEXT (token);
405 chain->u.u_s.len = TOKEN_DATA_LEN (token);
406 chain->u.u_s.level = level;
407 adjust_refcount (level, true);
408 inuse = true;
410 while (src_chain)
412 if (level == -1)
414 /* Nothing to copy, since link already lives on obstack. */
415 assert (src_chain->type != CHAIN_STR
416 || src_chain->u.u_s.level == -1);
417 chain = src_chain;
419 else
421 /* Allow inlining the final link with subsequent text. */
422 if (!src_chain->next && src_chain->type == CHAIN_STR
423 && (src_chain->u.u_s.len <= INPUT_INLINE_THRESHOLD
424 || (!inuse && src_chain->u.u_s.level == -1)))
426 obstack_grow (current_input, src_chain->u.u_s.str,
427 src_chain->u.u_s.len);
428 break;
430 /* We must clone each link in the chain, since next_char
431 destructively modifies the chain it is parsing. */
432 chain = (token_chain *) obstack_copy (current_input, src_chain,
433 sizeof *chain);
434 if (chain->type == CHAIN_STR && chain->u.u_s.level == -1)
436 if (chain->u.u_s.len <= INPUT_INLINE_THRESHOLD || !inuse)
437 chain->u.u_s.str = (char *) obstack_copy (current_input,
438 chain->u.u_s.str,
439 chain->u.u_s.len);
440 else
442 chain->u.u_s.level = level;
443 inuse = true;
447 if (next->u.u_c.end)
448 next->u.u_c.end->next = chain;
449 else
450 next->u.u_c.chain = chain;
451 next->u.u_c.end = chain;
452 if (chain->type == CHAIN_ARGV)
454 assert (!chain->u.u_a.comma && !chain->u.u_a.skip_last);
455 inuse |= arg_adjust_refcount (chain->u.u_a.argv, true);
457 else if (chain->type == CHAIN_STR && chain->u.u_s.level >= 0)
458 adjust_refcount (chain->u.u_s.level, true);
459 src_chain = src_chain->next;
461 return inuse;
464 /*-------------------------------------------------------------------.
465 | Last half of push_string (). If next is now NULL, a call to |
466 | push_file () or push_macro () has invalidated the previous call to |
467 | push_string_init (), so we just give up. If the new object is |
468 | void, we do not push it. The function push_string_finish () |
469 | returns an opaque pointer to the finished object, which can then |
470 | be printed with input_print when tracing is enabled. This pointer |
471 | is only for temporary use, since reading the next token will |
472 | invalidate the object. |
473 `-------------------------------------------------------------------*/
475 const input_block *
476 push_string_finish (void)
478 input_block *ret = NULL;
479 size_t len = obstack_object_size (current_input);
481 if (next == NULL)
483 assert (!len);
484 return NULL;
487 if (len || next->type == INPUT_CHAIN)
489 if (next->type == INPUT_STRING)
491 next->u.u_s.str = (char *) obstack_finish (current_input);
492 next->u.u_s.len = len;
494 else
495 make_text_link (current_input, &next->u.u_c.chain, &next->u.u_c.end);
496 next->prev = isp;
497 isp = next;
498 input_change = true;
499 ret = isp;
501 else
502 obstack_free (current_input, next);
503 next = NULL;
504 return ret;
507 /*--------------------------------------------------------------.
508 | The function push_wrapup_init () returns an obstack ready for |
509 | direct expansion of wrapup text, and should be followed by |
510 | push_wrapup_finish (). |
511 `--------------------------------------------------------------*/
513 struct obstack *
514 push_wrapup_init (void)
516 input_block *i;
517 i = (input_block *) obstack_alloc (wrapup_stack, sizeof *i);
518 i->prev = wsp;
519 i->type = INPUT_STRING;
520 i->file = current_file;
521 i->line = current_line;
522 wsp = i;
523 return wrapup_stack;
526 /*---------------------------------------------------------------.
527 | After pushing wrapup text, push_wrapup_finish () completes the |
528 | bookkeeping. |
529 `---------------------------------------------------------------*/
530 void
531 push_wrapup_finish (void)
533 input_block *i = wsp;
534 if (obstack_object_size (wrapup_stack) == 0)
536 wsp = i->prev;
537 obstack_free (wrapup_stack, i);
539 else
541 i->u.u_s.len = obstack_object_size (wrapup_stack);
542 i->u.u_s.str = (char *) obstack_finish (wrapup_stack);
547 /*-------------------------------------------------------------------.
548 | The function pop_input () pops one level of input sources. If |
549 | CLEANUP, and the popped input_block is a file, current_file and |
550 | current_line are reset to the saved values before the memory for |
551 | the input_block is released. The return value is false if cleanup |
552 | is still required, or if the current input source is not |
553 | exhausted. |
554 `-------------------------------------------------------------------*/
556 static bool
557 pop_input (bool cleanup)
559 input_block *tmp = isp->prev;
560 token_chain *chain;
562 switch (isp->type)
564 case INPUT_STRING:
565 assert (!cleanup || !isp->u.u_s.len);
566 if (isp->u.u_s.len)
567 return false;
568 break;
570 case INPUT_MACRO:
571 if (!cleanup)
572 return false;
573 break;
575 case INPUT_CHAIN:
576 chain = isp->u.u_c.chain;
577 assert (!chain || !cleanup);
578 while (chain)
580 switch (chain->type)
582 case CHAIN_STR:
583 if (chain->u.u_s.len)
584 return false;
585 if (chain->u.u_s.level >= 0)
586 adjust_refcount (chain->u.u_s.level, false);
587 break;
588 case CHAIN_ARGV:
589 if (chain->u.u_a.index < arg_argc (chain->u.u_a.argv))
590 return false;
591 arg_adjust_refcount (chain->u.u_a.argv, false);
592 break;
593 default:
594 assert (!"pop_input");
595 abort ();
597 isp->u.u_c.chain = chain = chain->next;
599 break;
601 case INPUT_FILE:
602 if (!cleanup)
603 return false;
604 if (debug_level & DEBUG_TRACE_INPUT)
606 if (tmp)
607 DEBUG_MESSAGE2 ("input reverted to %s, line %d",
608 tmp->file, tmp->line);
609 else
610 DEBUG_MESSAGE ("input exhausted");
613 if (ferror (isp->u.u_f.fp))
615 m4_error (0, 0, NULL, _("read error"));
616 if (isp->u.u_f.close)
617 fclose (isp->u.u_f.fp);
619 else if (isp->u.u_f.close && fclose (isp->u.u_f.fp) == EOF)
620 m4_error (0, errno, NULL, _("error reading file"));
621 start_of_input_line = isp->u.u_f.advance;
622 output_current_line = -1;
623 break;
625 default:
626 assert (!"pop_input");
627 abort ();
629 obstack_free (current_input, isp);
630 cached_quote = NULL;
631 next = NULL; /* might be set in push_string_init () */
633 isp = tmp;
634 input_change = true;
635 return true;
638 /*------------------------------------------------------------------------.
639 | To switch input over to the wrapup stack, main () calls pop_wrapup (). |
640 | Since wrapup text can install new wrapup text, pop_wrapup () returns |
641 | false when there is no wrapup text on the stack, and true otherwise. |
642 `------------------------------------------------------------------------*/
644 bool
645 pop_wrapup (void)
647 next = NULL;
648 obstack_free (current_input, NULL);
649 free (current_input);
651 if (wsp == NULL)
653 /* End of the program. Free all memory even though we are about
654 to exit, since it makes leak detection easier. */
655 obstack_free (&token_stack, NULL);
656 obstack_free (&file_names, NULL);
657 obstack_free (wrapup_stack, NULL);
658 free (wrapup_stack);
659 #ifdef ENABLE_CHANGEWORD
660 regfree (&word_regexp);
661 #endif /* ENABLE_CHANGEWORD */
662 return false;
665 current_input = wrapup_stack;
666 wrapup_stack = (struct obstack *) xmalloc (sizeof *wrapup_stack);
667 obstack_init (wrapup_stack);
669 isp = wsp;
670 wsp = NULL;
671 input_change = true;
673 return true;
676 /*--------------------------------------------------------------.
677 | Dump a representation of INPUT to the obstack OBS, for use in |
678 | tracing. |
679 `--------------------------------------------------------------*/
680 void
681 input_print (struct obstack *obs, const input_block *input)
683 int maxlen = max_debug_argument_length;
684 token_chain *chain;
686 assert (input);
687 switch (input->type)
689 case INPUT_STRING:
690 shipout_string_trunc (obs, input->u.u_s.str, input->u.u_s.len, &maxlen);
691 break;
692 case INPUT_FILE:
693 obstack_grow (obs, "<file: ", strlen ("<file: "));
694 obstack_grow (obs, input->file, strlen (input->file));
695 obstack_1grow (obs, '>');
696 break;
697 case INPUT_MACRO:
698 func_print (obs, find_builtin_by_addr (input->u.func), false, NULL);
699 break;
700 case INPUT_CHAIN:
701 chain = input->u.u_c.chain;
702 while (chain)
704 switch (chain->type)
706 case CHAIN_STR:
707 if (shipout_string_trunc (obs, chain->u.u_s.str,
708 chain->u.u_s.len, &maxlen))
709 return;
710 break;
711 case CHAIN_ARGV:
712 assert (!chain->u.u_a.comma);
713 if (arg_print (obs, chain->u.u_a.argv, chain->u.u_a.index,
714 quote_cache (NULL, chain->quote_age,
715 chain->u.u_a.quotes),
716 chain->u.u_a.flatten, NULL, &maxlen, false))
717 return;
718 break;
719 default:
720 assert (!"input_print");
721 abort ();
723 chain = chain->next;
725 break;
726 default:
727 assert (!"input_print");
728 abort ();
733 /*------------------------------------------------------------------.
734 | Low level input is done a character at a time. The function |
735 | peek_input () is used to look at the next character in the input |
736 | stream. At any given time, it reads from the input_block on the |
737 | top of the current input stack. The return value is an unsigned |
738 | char, CHAR_EOF if there is no more input, CHAR_MACRO if a builtin |
739 | token occurs next, or CHAR_ARGV if ALLOW_ARGV and the input is |
740 | visiting an argv reference with the correct quoting. |
741 `------------------------------------------------------------------*/
743 static int
744 peek_input (bool allow_argv)
746 int ch;
747 input_block *block = isp;
748 token_chain *chain;
750 while (1)
752 if (block == NULL)
753 return CHAR_EOF;
755 switch (block->type)
757 case INPUT_STRING:
758 if (!block->u.u_s.len)
759 break;
760 return to_uchar (block->u.u_s.str[0]);
762 case INPUT_FILE:
763 ch = getc (block->u.u_f.fp);
764 if (ch != EOF)
766 ungetc (ch, block->u.u_f.fp);
767 return ch;
769 block->u.u_f.end = true;
770 break;
772 case INPUT_MACRO:
773 return CHAR_MACRO;
775 case INPUT_CHAIN:
776 chain = block->u.u_c.chain;
777 while (chain)
779 unsigned int argc;
780 switch (chain->type)
782 case CHAIN_STR:
783 if (chain->u.u_s.len)
784 return to_uchar (*chain->u.u_s.str);
785 break;
786 case CHAIN_ARGV:
787 argc = arg_argc (chain->u.u_a.argv);
788 if (chain->u.u_a.index == argc)
789 break;
790 if (chain->u.u_a.comma)
791 return ',';
792 /* Only return a reference if the quoting is correct
793 and the reference has more than one argument
794 left. */
795 if (allow_argv && chain->quote_age == current_quote_age
796 && chain->u.u_a.quotes && chain->u.u_a.index + 1 < argc)
797 return CHAR_ARGV;
798 /* Rather than directly parse argv here, we push
799 another input block containing the next unparsed
800 argument from argv. */
801 push_string_init ();
802 push_arg_quote (current_input, chain->u.u_a.argv,
803 chain->u.u_a.index,
804 quote_cache (NULL, chain->quote_age,
805 chain->u.u_a.quotes));
806 chain->u.u_a.index++;
807 chain->u.u_a.comma = true;
808 push_string_finish ();
809 return peek_input (allow_argv);
810 default:
811 assert (!"peek_input");
812 abort ();
814 chain = chain->next;
816 break;
818 default:
819 assert (!"peek_input");
820 abort ();
822 block = block->prev;
826 /*-------------------------------------------------------------------.
827 | The function next_char () is used to read and advance the input to |
828 | the next character. It also manages line numbers for error |
829 | messages, so they do not get wrong due to lookahead. The token |
830 | consisting of a newline alone is taken as belonging to the line it |
831 | ends, and the current line number is not incremented until the |
832 | next character is read. 99.9% of all calls will read from a |
833 | string, so factor that out into a macro for speed. If |
834 | ALLOW_QUOTE, and the current input matches the current quote age, |
835 | return CHAR_QUOTE and leave consumption of data for |
836 | append_quote_token. |
837 `-------------------------------------------------------------------*/
839 #define next_char(AQ) \
840 (isp && isp->type == INPUT_STRING && isp->u.u_s.len && !input_change \
841 ? (isp->u.u_s.len--, to_uchar (*isp->u.u_s.str++)) \
842 : next_char_1 (AQ))
844 static int
845 next_char_1 (bool allow_quote)
847 int ch;
848 token_chain *chain;
850 while (1)
852 if (isp == NULL)
854 current_file = "";
855 current_line = 0;
856 return CHAR_EOF;
859 if (input_change)
861 current_file = isp->file;
862 current_line = isp->line;
863 input_change = false;
866 switch (isp->type)
868 case INPUT_STRING:
869 if (!isp->u.u_s.len)
870 break;
871 isp->u.u_s.len--;
872 return to_uchar (*isp->u.u_s.str++);
874 case INPUT_FILE:
875 if (start_of_input_line)
877 start_of_input_line = false;
878 current_line = ++isp->line;
881 /* If stdin is a terminal, calling getc after peek_input
882 already called it would make the user have to hit ^D
883 twice to quit. */
884 ch = isp->u.u_f.end ? EOF : getc (isp->u.u_f.fp);
885 if (ch != EOF)
887 if (ch == '\n')
888 start_of_input_line = true;
889 return ch;
891 break;
893 case INPUT_MACRO:
894 /* INPUT_MACRO input sources has only one token */
895 pop_input (true);
896 return CHAR_MACRO;
898 case INPUT_CHAIN:
899 chain = isp->u.u_c.chain;
900 while (chain)
902 if (allow_quote && chain->quote_age == current_quote_age)
903 return CHAR_QUOTE;
904 switch (chain->type)
906 case CHAIN_STR:
907 if (chain->u.u_s.len)
909 /* Partial consumption invalidates quote age. */
910 chain->quote_age = 0;
911 chain->u.u_s.len--;
912 return to_uchar (*chain->u.u_s.str++);
914 if (chain->u.u_s.level >= 0)
915 adjust_refcount (chain->u.u_s.level, false);
916 break;
917 case CHAIN_ARGV:
918 if (chain->u.u_a.index == arg_argc (chain->u.u_a.argv))
920 arg_adjust_refcount (chain->u.u_a.argv, false);
921 break;
923 if (chain->u.u_a.comma)
925 chain->u.u_a.comma = false;
926 return ',';
928 /* Rather than directly parse argv here, we push
929 another input block containing the next unparsed
930 argument from argv. */
931 push_string_init ();
932 push_arg_quote (current_input, chain->u.u_a.argv,
933 chain->u.u_a.index,
934 quote_cache (NULL, chain->quote_age,
935 chain->u.u_a.quotes));
936 chain->u.u_a.index++;
937 chain->u.u_a.comma = true;
938 push_string_finish ();
939 return next_char_1 (allow_quote);
940 default:
941 assert (!"next_char_1");
942 abort ();
944 isp->u.u_c.chain = chain = chain->next;
946 break;
948 default:
949 assert (!"next_char_1");
950 abort ();
953 /* End of input source --- pop one level. */
954 pop_input (true);
958 /*-------------------------------------------------------------------.
959 | skip_line () simply discards all immediately following characters, |
960 | up to the first newline. It is only used from m4_dnl (). Report |
961 | warnings on behalf of NAME. |
962 `-------------------------------------------------------------------*/
964 void
965 skip_line (const char *name)
967 int ch;
968 const char *file = current_file;
969 int line = current_line;
971 while ((ch = next_char (false)) != CHAR_EOF && ch != '\n')
973 if (ch == CHAR_EOF)
974 /* current_file changed to "" if we see CHAR_EOF, use the
975 previous value we stored earlier. */
976 m4_warn_at_line (0, file, line, name,
977 _("end of file treated as newline"));
978 /* On the rare occasion that dnl crosses include file boundaries
979 (either the input file did not end in a newline, or changeword
980 was used), calling next_char can update current_file and
981 current_line, and that update will be undone as we return to
982 expand_macro. This informs next_char to fix things again. */
983 if (file != current_file || line != current_line)
984 input_change = true;
987 /*-------------------------------------------------------------------.
988 | When a MACRO token is seen, next_token () uses init_macro_token () |
989 | to retrieve the value of the function pointer and store it in TD. |
990 `-------------------------------------------------------------------*/
992 static void
993 init_macro_token (token_data *td)
995 assert (isp->type == INPUT_MACRO);
996 TOKEN_DATA_TYPE (td) = TOKEN_FUNC;
997 TOKEN_DATA_FUNC (td) = isp->u.func;
1000 /*-------------------------------------------------------------------.
1001 | When a QUOTE token is seen, convert TD to a composite (if it is |
1002 | not one already), consisting of any unfinished text on OBS, as |
1003 | well as the quoted token from the top of the input stack. Use OBS |
1004 | for any additional allocations needed to store the token chain. |
1005 `-------------------------------------------------------------------*/
1006 static void
1007 append_quote_token (struct obstack *obs, token_data *td)
1009 token_chain *src_chain = isp->u.u_c.chain;
1010 token_chain *chain;
1012 assert (isp->type == INPUT_CHAIN && obs && current_quote_age);
1013 isp->u.u_c.chain = src_chain->next;
1015 /* Speed consideration - for short enough tokens, the speed and
1016 memory overhead of parsing another INPUT_CHAIN link outweighs the
1017 time to inline the token text. */
1018 if (src_chain->type == CHAIN_STR
1019 && src_chain->u.u_s.len <= INPUT_INLINE_THRESHOLD)
1021 assert (src_chain->u.u_s.level >= 0);
1022 obstack_grow (obs, src_chain->u.u_s.str, src_chain->u.u_s.len);
1023 adjust_refcount (src_chain->u.u_s.level, false);
1024 return;
1027 if (TOKEN_DATA_TYPE (td) == TOKEN_VOID)
1029 TOKEN_DATA_TYPE (td) = TOKEN_COMP;
1030 td->u.u_c.chain = td->u.u_c.end = NULL;
1031 td->u.u_c.wrapper = td->u.u_c.has_func = false;
1033 assert (TOKEN_DATA_TYPE (td) == TOKEN_COMP);
1034 make_text_link (obs, &td->u.u_c.chain, &td->u.u_c.end);
1035 chain = (token_chain *) obstack_copy (obs, src_chain, sizeof *chain);
1036 if (td->u.u_c.end)
1037 td->u.u_c.end->next = chain;
1038 else
1039 td->u.u_c.chain = chain;
1040 td->u.u_c.end = chain;
1041 if (chain->type == CHAIN_ARGV && chain->u.u_a.has_func)
1042 td->u.u_c.has_func = true;
1043 chain->next = NULL;
1047 /*-------------------------------------------------------------------.
1048 | When an ARGV token is seen, convert TD to point to it via a |
1049 | composite token. Use OBS for any additional allocations needed to |
1050 | store the token chain. |
1051 `-------------------------------------------------------------------*/
1052 static void
1053 init_argv_token (struct obstack *obs, token_data *td)
1055 token_chain *src_chain;
1056 token_chain *chain;
1057 int ch = next_char (true);
1059 assert (ch == CHAR_QUOTE && TOKEN_DATA_TYPE (td) == TOKEN_VOID
1060 && isp->type == INPUT_CHAIN && isp->u.u_c.chain->type == CHAIN_ARGV
1061 && obs && obstack_object_size (obs) == 0);
1063 src_chain = isp->u.u_c.chain;
1064 isp->u.u_c.chain = src_chain->next;
1065 TOKEN_DATA_TYPE (td) = TOKEN_COMP;
1066 /* Clone the link, since the input will be discarded soon. */
1067 chain = (token_chain *) obstack_copy (obs, src_chain, sizeof *chain);
1068 td->u.u_c.chain = td->u.u_c.end = chain;
1069 td->u.u_c.wrapper = true;
1070 td->u.u_c.has_func = chain->u.u_a.has_func;
1071 chain->next = NULL;
1073 /* If the next character is not ',' or ')', then unlink the last
1074 argument from argv and schedule it for reparsing. This way,
1075 expand_argument never has to deal with concatenation of argv with
1076 arbitrary text. Note that the implementation of safe_quotes
1077 ensures peek_input won't return CHAR_ARGV if the user is perverse
1078 enough to mix comment delimiters with argument separators:
1080 define(n,`$#')define(echo,$*)changecom(`,,',`)')n(echo(a,`,b`)'',c))
1081 => 2 (not 3)
1083 Therefore, we do not have to worry about calling MATCH, and thus
1084 do not have to worry about pop_input being called and
1085 invalidating the argv reference.
1087 When the $@ ref is used unchanged, we completely bypass the
1088 decrement of the argv refcount in next_char_1, since the ref is
1089 still live via the current collect_arguments. However, when the
1090 last element of the $@ ref is reparsed, we must increase the argv
1091 refcount here, to compensate for the fact that it will be
1092 decreased once the final element is parsed. */
1093 assert (*curr_comm.str1 != ',' && *curr_comm.str1 != ')'
1094 && *curr_comm.str1 != *curr_quote.str1);
1095 ch = peek_input (false);
1096 if (ch != ',' && ch != ')')
1098 isp->u.u_c.chain = src_chain;
1099 src_chain->u.u_a.index = arg_argc (chain->u.u_a.argv) - 1;
1100 src_chain->u.u_a.comma = true;
1101 chain->u.u_a.skip_last = true;
1102 arg_adjust_refcount (chain->u.u_a.argv, true);
1107 /*------------------------------------------------------------------.
1108 | This function is for matching a string against a prefix of the |
1109 | input stream. If the string S matches the input and CONSUME is |
1110 | true, the input is discarded; otherwise any characters read are |
1111 | pushed back again. The function is used only when multicharacter |
1112 | quotes or comment delimiters are used. |
1113 `------------------------------------------------------------------*/
1115 static bool
1116 match_input (const char *s, bool consume)
1118 int n; /* number of characters matched */
1119 int ch; /* input character */
1120 const char *t;
1121 bool result = false;
1123 ch = peek_input (false);
1124 if (ch != to_uchar (*s))
1125 return false; /* fail */
1127 if (s[1] == '\0')
1129 if (consume)
1130 next_char (false);
1131 return true; /* short match */
1134 next_char (false);
1135 for (n = 1, t = s++; (ch = peek_input (false)) == to_uchar (*s++); )
1137 next_char (false);
1138 n++;
1139 if (*s == '\0') /* long match */
1141 if (consume)
1142 return true;
1143 result = true;
1144 break;
1148 /* Failed or shouldn't consume, push back input. */
1149 push_string_init ();
1150 obstack_grow (current_input, t, n);
1151 push_string_finish ();
1152 return result;
/* MATCH (CH, S, CONSUME) matches the string S against the input.
   The first character is compared inline, for speed, so that
   match_input is only called for multicharacter delimiters; the
   hope is that this does not hurt efficiency too much when single
   character quotes and comment delimiters are used.  If CONSUME,
   then CH is the result of next_char, and a successful match will
   discard the matched string.  Otherwise, CH is the result of
   peek_input, and the input stream is effectively unchanged.  A CH
   of '\0' never matches, which keeps an empty S from matching.  */

#define MATCH(ch, s, consume)                                           \
  ((ch) == to_uchar ((s)[0])                                            \
   && (ch) != '\0'                                                      \
   && ((s)[1] == '\0'                                                   \
       || match_input ((s) + (consume), (consume))))
1171 /*----------------------------------------------------------.
1172 | Inititialize input stacks, and quote/comment characters. |
1173 `----------------------------------------------------------*/
1175 void
1176 input_init (void)
1178 current_file = "";
1179 current_line = 0;
1181 current_input = (struct obstack *) xmalloc (sizeof *current_input);
1182 obstack_init (current_input);
1183 wrapup_stack = (struct obstack *) xmalloc (sizeof *wrapup_stack);
1184 obstack_init (wrapup_stack);
1186 obstack_init (&file_names);
1188 /* Allocate an object in the current chunk, so that obstack_free
1189 will always work even if the first token parsed spills to a new
1190 chunk. */
1191 obstack_init (&token_stack);
1192 token_bottom = obstack_finish (&token_stack);
1194 isp = NULL;
1195 wsp = NULL;
1196 next = NULL;
1198 start_of_input_line = false;
1200 curr_quote.str1 = xstrdup (DEF_LQUOTE);
1201 curr_quote.len1 = strlen (curr_quote.str1);
1202 curr_quote.str2 = xstrdup (DEF_RQUOTE);
1203 curr_quote.len2 = strlen (curr_quote.str2);
1204 curr_comm.str1 = xstrdup (DEF_BCOMM);
1205 curr_comm.len1 = strlen (curr_comm.str1);
1206 curr_comm.str2 = xstrdup (DEF_ECOMM);
1207 curr_comm.len2 = strlen (curr_comm.str2);
1209 #ifdef ENABLE_CHANGEWORD
1210 set_word_regexp (NULL, user_word_regexp);
1211 #endif /* ENABLE_CHANGEWORD */
1213 set_quote_age ();
1217 /*--------------------------------------------------------------------.
1218 | Set the quote delimiters to LQ and RQ. Used by m4_changequote (). |
1219 | Pass NULL if the argument was not present, to distinguish from an |
1220 | explicit empty string. |
1221 `--------------------------------------------------------------------*/
1223 void
1224 set_quotes (const char *lq, const char *rq)
1226 /* POSIX states that with 0 arguments, the default quotes are used.
1227 POSIX XCU ERN 112 states that behavior is implementation-defined
1228 if there was only one argument, or if there is an empty string in
1229 either position when there are two arguments. We allow an empty
1230 left quote to disable quoting, but a non-empty left quote will
1231 always create a non-empty right quote. See the texinfo for what
1232 some other implementations do. */
1233 if (!lq)
1235 lq = DEF_LQUOTE;
1236 rq = DEF_RQUOTE;
1238 else if (!rq || (*lq && !*rq))
1239 rq = DEF_RQUOTE;
1241 if (strcmp (curr_quote.str1, lq) == 0 && strcmp (curr_quote.str2, rq) == 0)
1242 return;
1244 free (curr_quote.str1);
1245 free (curr_quote.str2);
1246 curr_quote.str1 = xstrdup (lq);
1247 curr_quote.len1 = strlen (curr_quote.str1);
1248 curr_quote.str2 = xstrdup (rq);
1249 curr_quote.len2 = strlen (curr_quote.str2);
1250 set_quote_age ();
1253 /*--------------------------------------------------------------------.
1254 | Set the comment delimiters to BC and EC. Used by m4_changecom (). |
1255 | Pass NULL if the argument was not present, to distinguish from an |
1256 | explicit empty string. |
1257 `--------------------------------------------------------------------*/
1259 void
1260 set_comment (const char *bc, const char *ec)
1262 /* POSIX requires no arguments to disable comments. It requires
1263 empty arguments to be used as-is, but this is counter to
1264 traditional behavior, because a non-null begin and null end makes
1265 it impossible to end a comment. An aardvark has been filed:
1266 http://www.opengroup.org/austin/mailarchives/ag-review/msg02168.html
1267 This implementation assumes the aardvark will be approved. See
1268 the texinfo for what some other implementations do. */
1269 if (!bc)
1270 bc = ec = "";
1271 else if (!ec || (*bc && !*ec))
1272 ec = DEF_ECOMM;
1274 if (strcmp (curr_comm.str1, bc) == 0 && strcmp (curr_comm.str2, ec) == 0)
1275 return;
1277 free (curr_comm.str1);
1278 free (curr_comm.str2);
1279 curr_comm.str1 = xstrdup (bc);
1280 curr_comm.len1 = strlen (curr_comm.str1);
1281 curr_comm.str2 = xstrdup (ec);
1282 curr_comm.len2 = strlen (curr_comm.str2);
1283 set_quote_age ();
1286 #ifdef ENABLE_CHANGEWORD
1288 /*-------------------------------------------------------------------.
1289 | Set the regular expression for recognizing words to REGEXP, and |
1290 | report errors on behalf of CALLER. If REGEXP is NULL, revert back |
1291 | to the default parsing rules. |
1292 `-------------------------------------------------------------------*/
1294 void
1295 set_word_regexp (const char *caller, const char *regexp)
1297 const char *msg;
1298 struct re_pattern_buffer new_word_regexp;
1300 if (!*regexp || !strcmp (regexp, DEFAULT_WORD_REGEXP))
1302 default_word_regexp = true;
1303 set_quote_age ();
1304 return;
1307 /* Dry run to see whether the new expression is compilable. */
1308 init_pattern_buffer (&new_word_regexp, NULL);
1309 msg = re_compile_pattern (regexp, strlen (regexp), &new_word_regexp);
1310 regfree (&new_word_regexp);
1312 if (msg != NULL)
1314 m4_warn (0, caller, _("bad regular expression `%s': %s"), regexp, msg);
1315 return;
1318 /* If compilation worked, retry using the word_regexp struct. We
1319 can't rely on struct assigns working, so redo the compilation.
1320 The fastmap can be reused between compilations, and will be freed
1321 by the final regfree. */
1322 if (!word_regexp.fastmap)
1323 word_regexp.fastmap = xcharalloc (UCHAR_MAX + 1);
1324 msg = re_compile_pattern (regexp, strlen (regexp), &word_regexp);
1325 assert (!msg);
1326 re_set_registers (&word_regexp, &regs, regs.num_regs, regs.start, regs.end);
1327 if (re_compile_fastmap (&word_regexp))
1328 assert (false);
1330 default_word_regexp = false;
1331 set_quote_age ();
1334 #endif /* ENABLE_CHANGEWORD */
1336 /* Call this when changing anything that might impact the quote age,
1337 so that quote_age and safe_quotes will reflect the change. */
1338 static void
1339 set_quote_age (void)
1341 /* Multi-character quotes are inherently unsafe, since concatenation
1342 of individual characters can result in a quote delimiter,
1343 consider:
1345 define(echo,``$1'')define(a,A)changequote(<[,]>)echo(<[]]><[>a]>)
1346 => A]> (not ]>a)
1348 Also, unquoted close delimiters are unsafe, consider:
1350 define(echo,``$1'')define(a,A)echo(`a''`a')
1351 => aA' (not a'a)
1353 Comment delimiters that overlap with quote delimiters or active
1354 characters also present a problem, consider:
1356 define(echo,$*)echo(a,a,a`'define(a,A)changecom(`,',`,'))
1357 => A,a,A (not A,A,A)
1359 And let's not even think about the impact of changeword, since it
1360 will disappear for M4 2.0.
1362 So rather than check every token for an unquoted delimiter, we
1363 merely encode current_quote_age to 0 when things are unsafe, and
1364 non-zero when safe (namely, to the 16-bit value composed of the
1365 single-character start and end quote delimiters). There may be
1366 other situations which are safe even when this algorithm sets the
1367 quote_age to zero, but at least a quote_age of zero always produces
1368 correct results (although it may take more time in doing so). */
1370 /* Hueristic of characters that might impact rescan if they appear in
1371 a quote delimiter. */
1372 #define Letters "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
1373 static const char unsafe[] = Letters "_0123456789(,) \t\n\r\f\v";
1374 #undef Letters
1376 if (curr_quote.len1 == 1 && curr_quote.len2 == 1
1377 && strpbrk (curr_quote.str1, unsafe) == NULL
1378 && strpbrk (curr_quote.str2, unsafe) == NULL
1379 && default_word_regexp && *curr_quote.str1 != *curr_quote.str2
1380 && *curr_comm.str1 != '(' && *curr_comm.str1 != ','
1381 && *curr_comm.str1 != ')' && *curr_comm.str1 != *curr_quote.str1)
1382 current_quote_age = (((*curr_quote.str1 & 0xff) << 8)
1383 | (*curr_quote.str2 & 0xff));
1384 else
1385 current_quote_age = 0;
1386 cached_quote = NULL;
1389 /* Return the current quote age. Each non-trivial changequote alters
1390 this value; the idea is that if quoting hasn't changed, then we can
1391 skip parsing a single argument, quoted or unquoted, within the
1392 context of a quoted string, as well as skip parsing a series of
1393 quoted arguments within the context of argument collection. */
1394 unsigned int
1395 quote_age (void)
1397 /* This accessor is a function, so that the implementation can
1398 change if needed. See set_quote_age for the current
1399 implementation. */
1400 return current_quote_age;
1403 /* Return true if the current quote delimiters guarantee that
1404 reparsing the current token in the context of a quoted string will
1405 be safe. This could always return false and behavior would still
1406 be correct, just slower. */
1407 bool
1408 safe_quotes (void)
1410 return current_quote_age != 0;
1413 /* Interface for caching frequently used quote pairs, using AGE for
1414 optimization. If QUOTES is NULL, don't use quoting. If OBS is
1415 non-NULL, AGE should be the current quote age, and QUOTES should be
1416 &curr_quote; the return value will be a cached quote pair, where
1417 the pointer is valid at least as long as OBS is not reset, but
1418 whose contents are only guaranteed until the next changequote or
1419 quote_cache. Otherwise, OBS is NULL, AGE should be the same as
1420 before, and QUOTES should be a previously returned cache value;
1421 used to refresh the contents of the result. */
1422 const string_pair *
1423 quote_cache (struct obstack *obs, unsigned int age, const string_pair *quotes)
1425 static char lquote[2];
1426 static char rquote[2];
1427 static string_pair simple = {lquote, 1, rquote, 1};
1429 /* Implementation - if AGE is non-zero, then the implementation of
1430 set_quote_age guarantees that we can recreate the return value on
1431 the fly; so we use static storage, and the contents must be used
1432 immediately. If AGE is zero, then we must copy QUOTES onto OBS
1433 (since changequote will invalidate the original), but we might as
1434 well cache that copy (in case the current expansion contains more
1435 than one instance of $@). */
1436 if (!quotes)
1437 return NULL;
1438 if (age)
1440 *lquote = (age >> 8) & 0xff;
1441 *rquote = age & 0xff;
1442 return &simple;
1444 if (!obs)
1445 return quotes;
1446 assert (next && quotes == &curr_quote);
1447 if (!cached_quote)
1449 assert (obs == current_input && obstack_object_size (obs) == 0);
1450 cached_quote = (string_pair *) obstack_copy (obs, quotes,
1451 sizeof *quotes);
1452 cached_quote->str1 = (char *) obstack_copy0 (obs, quotes->str1,
1453 quotes->len1);
1454 cached_quote->str2 = (char *) obstack_copy0 (obs, quotes->str2,
1455 quotes->len2);
1457 return cached_quote;
1461 /*--------------------------------------------------------------------.
1462 | Parse a single token from the input stream, set TD to its |
1463 | contents, and return its type. A token is TOKEN_EOF if the |
1464 | input_stack is empty; TOKEN_STRING for a quoted string or comment; |
1465 | TOKEN_WORD for something that is a potential macro name; and |
1466 | TOKEN_SIMPLE for any single character that is not a part of any of |
1467 | the previous types. If LINE is not NULL, set *LINE to the line |
1468 | where the token starts. If OBS is not NULL, expand TOKEN_STRING |
1469 | directly into OBS rather than in token_stack temporary storage |
1470 | area, and TD could be a TOKEN_COMP instead of the usual |
1471 | TOKEN_TEXT. If ALLOW_ARGV, OBS must be non-NULL, and an entire |
1472 | series of arguments can be returned as TOKEN_ARGV when a $@ |
1473 | reference is encountered. Report errors (unterminated comments or |
1474 | strings) on behalf of CALLER, if non-NULL. |
1476 | Next_token () returns the token type, and passes back a pointer to |
1477 | the token data through TD. Non-string token text is collected on |
1478 | the obstack token_stack, which never contains more than one token |
1479 | text at a time. The storage pointed to by the fields in TD is |
1480 | therefore subject to change the next time next_token () is called. |
1481 `--------------------------------------------------------------------*/
1483 token_type
1484 next_token (token_data *td, int *line, struct obstack *obs, bool allow_argv,
1485 const char *caller)
1487 int ch;
1488 int quote_level;
1489 token_type type;
1490 #ifdef ENABLE_CHANGEWORD
1491 char *orig_text = NULL;
1492 #endif /* ENABLE_CHANGEWORD */
1493 const char *file;
1494 int dummy;
1495 /* The obstack where token data is stored. Generally token_stack,
1496 for tokens where argument collection might not use the literal
1497 token. But for comments and strings, we can output directly into
1498 the argument collection obstack obs, if one was provided. */
1499 struct obstack *obs_td = &token_stack;
1501 obstack_free (&token_stack, token_bottom);
1502 if (!line)
1503 line = &dummy;
1505 /* Can't consume character until after CHAR_MACRO is handled. */
1506 TOKEN_DATA_TYPE (td) = TOKEN_VOID;
1507 ch = peek_input (allow_argv && current_quote_age);
1508 if (ch == CHAR_EOF)
1510 #ifdef DEBUG_INPUT
1511 xfprintf (stderr, "next_token -> EOF\n");
1512 #endif /* DEBUG_INPUT */
1513 next_char (false);
1514 return TOKEN_EOF;
1516 if (ch == CHAR_MACRO)
1518 init_macro_token (td);
1519 next_char (false);
1520 #ifdef DEBUG_INPUT
1521 xfprintf (stderr, "next_token -> MACDEF (%s)\n",
1522 find_builtin_by_addr (TOKEN_DATA_FUNC (td))->name);
1523 #endif /* DEBUG_INPUT */
1524 return TOKEN_MACDEF;
1526 if (ch == CHAR_ARGV)
1528 init_argv_token (obs, td);
1529 #ifdef DEBUG_INPUT
1530 xfprintf (stderr, "next_token -> ARGV (%d args)\n",
1531 (arg_argc (td->u.u_c.chain->u.u_a.argv)
1532 - td->u.u_c.chain->u.u_a.index
1533 - (td->u.u_c.chain->u.u_a.skip_last ? 1 : 0)));
1534 #endif
1535 return TOKEN_ARGV;
1538 next_char (false); /* Consume character we already peeked at. */
1539 file = current_file;
1540 *line = current_line;
1541 if (MATCH (ch, curr_comm.str1, true))
1543 if (obs)
1544 obs_td = obs;
1545 obstack_grow (obs_td, curr_comm.str1, curr_comm.len1);
1546 while ((ch = next_char (false)) < CHAR_EOF
1547 && !MATCH (ch, curr_comm.str2, true))
1548 obstack_1grow (obs_td, ch);
1549 if (ch != CHAR_EOF)
1551 assert (ch < CHAR_EOF);
1552 obstack_grow (obs_td, curr_comm.str2, curr_comm.len2);
1554 else
1555 /* Current_file changed to "" if we see CHAR_EOF, use the
1556 previous value we stored earlier. */
1557 m4_error_at_line (EXIT_FAILURE, 0, file, *line, caller,
1558 _("end of file in comment"));
1560 type = TOKEN_STRING;
1562 else if (default_word_regexp && (isalpha (ch) || ch == '_'))
1564 obstack_1grow (&token_stack, ch);
1565 while ((ch = peek_input (false)) < CHAR_EOF
1566 && (isalnum (ch) || ch == '_'))
1568 obstack_1grow (&token_stack, ch);
1569 next_char (false);
1571 type = TOKEN_WORD;
1574 #ifdef ENABLE_CHANGEWORD
1576 else if (!default_word_regexp && word_regexp.fastmap[ch])
1578 obstack_1grow (&token_stack, ch);
1579 while (1)
1581 ch = peek_input (false);
1582 if (ch >= CHAR_EOF)
1583 break;
1584 obstack_1grow (&token_stack, ch);
1585 if (re_match (&word_regexp, (char *) obstack_base (&token_stack),
1586 obstack_object_size (&token_stack), 0, &regs)
1587 != obstack_object_size (&token_stack))
1589 obstack_blank (&token_stack, -1);
1590 break;
1592 next_char (false);
1595 obstack_1grow (&token_stack, '\0');
1596 orig_text = (char *) obstack_finish (&token_stack);
1598 if (regs.start[1] != -1)
1599 obstack_grow (&token_stack, orig_text + regs.start[1],
1600 regs.end[1] - regs.start[1]);
1601 else
1602 obstack_grow (&token_stack, orig_text, regs.end[0]);
1604 type = TOKEN_WORD;
1607 #endif /* ENABLE_CHANGEWORD */
1609 else if (!MATCH (ch, curr_quote.str1, true))
1611 switch (ch)
1613 case '(':
1614 type = TOKEN_OPEN;
1615 break;
1616 case ',':
1617 type = TOKEN_COMMA;
1618 break;
1619 case ')':
1620 type = TOKEN_CLOSE;
1621 break;
1622 default:
1623 type = TOKEN_SIMPLE;
1624 break;
1626 obstack_1grow (&token_stack, ch);
1628 else
1630 if (obs)
1631 obs_td = obs;
1632 quote_level = 1;
1633 while (1)
1635 ch = next_char (obs != NULL && current_quote_age);
1636 if (ch == CHAR_EOF)
1637 /* Current_file changed to "" if we see CHAR_EOF, use
1638 the previous value we stored earlier. */
1639 m4_error_at_line (EXIT_FAILURE, 0, file, *line, caller,
1640 _("end of file in string"));
1642 if (ch == CHAR_QUOTE)
1643 append_quote_token (obs, td);
1644 else if (MATCH (ch, curr_quote.str2, true))
1646 if (--quote_level == 0)
1647 break;
1648 obstack_grow (obs_td, curr_quote.str2, curr_quote.len2);
1650 else if (MATCH (ch, curr_quote.str1, true))
1652 quote_level++;
1653 obstack_grow (obs_td, curr_quote.str1, curr_quote.len1);
1655 else
1657 assert (ch < CHAR_EOF);
1658 obstack_1grow (obs_td, ch);
1661 type = TOKEN_STRING;
1664 if (TOKEN_DATA_TYPE (td) == TOKEN_VOID)
1666 TOKEN_DATA_TYPE (td) = TOKEN_TEXT;
1667 TOKEN_DATA_LEN (td) = obstack_object_size (obs_td);
1668 if (obs_td != obs)
1670 obstack_1grow (obs_td, '\0');
1671 TOKEN_DATA_TEXT (td) = (char *) obstack_finish (obs_td);
1673 else
1674 TOKEN_DATA_TEXT (td) = NULL;
1675 TOKEN_DATA_QUOTE_AGE (td) = current_quote_age;
1676 #ifdef ENABLE_CHANGEWORD
1677 if (orig_text == NULL)
1678 TOKEN_DATA_ORIG_TEXT (td) = TOKEN_DATA_TEXT (td);
1679 else
1681 TOKEN_DATA_ORIG_TEXT (td) = orig_text;
1682 TOKEN_DATA_LEN (td) = strlen (orig_text);
1684 #endif /* ENABLE_CHANGEWORD */
1685 #ifdef DEBUG_INPUT
1686 xfprintf (stderr, "next_token -> %s (%s), len %zu\n",
1687 token_type_string (type), TOKEN_DATA_TEXT (td),
1688 TOKEN_DATA_LEN (td));
1689 #endif /* DEBUG_INPUT */
1691 else
1693 assert (TOKEN_DATA_TYPE (td) == TOKEN_COMP && type == TOKEN_STRING);
1694 #ifdef DEBUG_INPUT
1696 token_chain *chain;
1697 size_t len = 0;
1698 int links = 0;
1699 chain = td->u.u_c.chain;
1700 xfprintf (stderr, "next_token -> %s <chain> (",
1701 token_type_string (type));
1702 while (chain)
1704 switch (chain->type)
1706 case CHAIN_STR:
1707 xfprintf (stderr, "%s", chain->u.u_s.str);
1708 len += chain->u.u_s.len;
1709 break;
1710 case CHAIN_ARGV:
1711 xfprintf (stderr, "{$@}");
1712 break;
1713 default:
1714 assert (!"next_token");
1715 abort ();
1717 links++;
1718 chain = chain->next;
1720 xfprintf (stderr, "), %d links, len %zu\n",
1721 links, len);
1723 #endif /* DEBUG_INPUT */
1725 return type;
1728 /*-----------------------------------------------.
1729 | Peek at the next token from the input stream. |
1730 `-----------------------------------------------*/
1732 token_type
1733 peek_token (void)
1735 token_type result;
1736 int ch = peek_input (false);
1738 if (ch == CHAR_EOF)
1740 result = TOKEN_EOF;
1742 else if (ch == CHAR_MACRO)
1744 result = TOKEN_MACDEF;
1746 else if (MATCH (ch, curr_comm.str1, false))
1748 result = TOKEN_STRING;
1750 else if ((default_word_regexp && (isalpha (ch) || ch == '_'))
1751 #ifdef ENABLE_CHANGEWORD
1752 || (!default_word_regexp && word_regexp.fastmap[ch])
1753 #endif /* ENABLE_CHANGEWORD */
1756 result = TOKEN_WORD;
1758 else if (MATCH (ch, curr_quote.str1, false))
1760 result = TOKEN_STRING;
1762 else
1763 switch (ch)
1765 case '(':
1766 result = TOKEN_OPEN;
1767 break;
1768 case ',':
1769 result = TOKEN_COMMA;
1770 break;
1771 case ')':
1772 result = TOKEN_CLOSE;
1773 break;
1774 default:
1775 result = TOKEN_SIMPLE;
1778 #ifdef DEBUG_INPUT
1779 xfprintf (stderr, "peek_token -> %s\n", token_type_string (result));
1780 #endif /* DEBUG_INPUT */
1781 return result;
1785 #ifdef DEBUG_INPUT
1787 static const char *
1788 token_type_string (token_type t)
1790 switch (t)
1791 { /* TOKSW */
1792 case TOKEN_EOF:
1793 return "EOF";
1794 case TOKEN_STRING:
1795 return "STRING";
1796 case TOKEN_WORD:
1797 return "WORD";
1798 case TOKEN_OPEN:
1799 return "OPEN";
1800 case TOKEN_COMMA:
1801 return "COMMA";
1802 case TOKEN_CLOSE:
1803 return "CLOSE";
1804 case TOKEN_SIMPLE:
1805 return "SIMPLE";
1806 case TOKEN_MACDEF:
1807 return "MACDEF";
1808 default:
1809 abort ();
1813 static void
1814 print_token (const char *s, token_type t, token_data *td)
1816 xfprintf (stderr, "%s: ", s);
1817 switch (t)
1818 { /* TOKSW */
1819 case TOKEN_OPEN:
1820 case TOKEN_COMMA:
1821 case TOKEN_CLOSE:
1822 case TOKEN_SIMPLE:
1823 xfprintf (stderr, "char:");
1824 break;
1826 case TOKEN_WORD:
1827 xfprintf (stderr, "word:");
1828 break;
1830 case TOKEN_STRING:
1831 xfprintf (stderr, "string:");
1832 break;
1834 case TOKEN_MACDEF:
1835 xfprintf (stderr, "macro: %p\n", TOKEN_DATA_FUNC (td));
1836 break;
1838 case TOKEN_EOF:
1839 xfprintf (stderr, "eof\n");
1840 break;
1842 xfprintf (stderr, "\t\"%s\"\n", TOKEN_DATA_TEXT (td));
1845 static void M4_GNUC_UNUSED
1846 lex_debug (void)
1848 token_type t;
1849 token_data td;
1851 while ((t = next_token (&td, NULL, NULL, false, "<debug>")) != TOKEN_EOF)
1852 print_token ("lex", t, &td);
1854 #endif /* DEBUG_INPUT */