maint: bump copyright year
[m4.git] / m4 / syntax.c
blob4bde123484090f3a5fd395bfb60c0f0141c2113d
1 /* GNU m4 -- A simple macro processor
2 Copyright (C) 1989-1994, 2002, 2004, 2006-2010, 2013-2014, 2017 Free
3 Software Foundation, Inc.
5 This file is part of GNU M4.
7 GNU M4 is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 GNU M4 is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 #include <config.h>
23 #include "m4private.h"
25 /* Define this to see runtime debug info. Implied by DEBUG. */
26 /*#define DEBUG_SYNTAX */
28 /* THE SYNTAX TABLE
30 The input is read character by character and grouped together
31 according to a syntax table. The character groups are (definitions
32 are all in m4module.h, those marked with a * are not yet in use):
34 Basic (all characters fall in one of these mutually exclusive bins)
35 M4_SYNTAX_IGNORE *Character to be deleted from input as if not present
36 M4_SYNTAX_OTHER Any character with no special meaning to m4
37 M4_SYNTAX_SPACE Whitespace (ignored when leading macro arguments)
38 M4_SYNTAX_OPEN Open list of macro arguments
39 M4_SYNTAX_CLOSE Close list of macro arguments
40 M4_SYNTAX_COMMA Separates macro arguments
41 M4_SYNTAX_ACTIVE This character is a macro name by itself
42 M4_SYNTAX_ESCAPE Use this character to prefix all macro names
44 M4_SYNTAX_ALPHA Alphabetic characters (can start macro names)
45 M4_SYNTAX_NUM Numeric characters (can form macro names)
47 M4_SYNTAX_LQUOTE A single character left quote
48 M4_SYNTAX_BCOMM A single character begin comment delimiter
50 Attribute (these are context sensitive, and exist in addition to basic)
51 M4_SYNTAX_RQUOTE A single character right quote
52 M4_SYNTAX_ECOMM A single character end comment delimiter
53 M4_SYNTAX_DOLLAR Indicates macro argument in user macros
54 M4_SYNTAX_LBRACE *Indicates start of extended macro argument
55 M4_SYNTAX_RBRACE *Indicates end of extended macro argument
57 Besides adding new facilities, the use of a syntax table will reduce
58 the number of calls to next_token (). Now groups of OTHER, NUM and
59 SPACE characters can be returned as a single token, since next_token
60 () knows they have no special syntactical meaning to m4. This is,
61 however, only possible if only single character quotes and
62 comments are used, because otherwise the quote and comment characters
63 will not show up in the syntax-table.
65 Having a syntax table allows new facilities. The new builtin
66 "changesyntax" allows the user to change the category of any
67 character.
69 By default, '\n' is both ECOMM and SPACE, depending on the context.
70 Hence we have basic categories (mutually exclusive, can introduce a
71 context, and can be empty sets), and attribute categories
72 (additive, only recognized in context, and will never be empty).
74 The precedence as implemented by next_token () is:
76 M4_SYNTAX_IGNORE *Filtered out below next_token ()
77 M4_SYNTAX_ESCAPE Reads macro name iff set, else next character
78 M4_SYNTAX_ALPHA Reads M4_SYNTAX_ALPHA and M4_SYNTAX_NUM as macro name
79 M4_SYNTAX_LQUOTE Reads all until balanced M4_SYNTAX_RQUOTE
80 M4_SYNTAX_BCOMM Reads all until M4_SYNTAX_ECOMM
82 M4_SYNTAX_OTHER } Reads all M4_SYNTAX_OTHER, M4_SYNTAX_NUM
83 M4_SYNTAX_NUM }
85 M4_SYNTAX_SPACE Reads all M4_SYNTAX_SPACE, depending on buffering
86 M4_SYNTAX_ACTIVE Returns a single char as a macro name
88 M4_SYNTAX_OPEN } Returned as a single char
89 M4_SYNTAX_CLOSE }
90 M4_SYNTAX_COMMA }
92 M4_SYNTAX_RQUOTE and M4_SYNTAX_ECOMM are context-sensitive, and
93 close out M4_SYNTAX_LQUOTE and M4_SYNTAX_BCOMM, respectively.
94 Also, M4_SYNTAX_DOLLAR, M4_SYNTAX_LBRACE, and M4_SYNTAX_RBRACE are
95 context-sensitive, only mattering when expanding macro definitions.
97 There are several optimizations that can be performed depending on
98 known states of the syntax table. For example, when searching for
99 quotes, if there is only a single start quote and end quote
100 delimiter, we can use memchr2 and search a word at a time, instead
101 of performing a table lookup a byte at a time. The is_single_*
102 flags track whether quotes and comments have a single delimiter
103 (always the case if changequote/changecom were used, and
104 potentially the case after changesyntax). Since we frequently need
105 to access quotes, we store the oldest valid quote outside the
106 lookup table; the suspect flag tracks whether a cleanup pass is
107 needed to restore our invariants. On the other hand, coalescing
108 multiple M4_SYNTAX_OTHER bytes could form a delimiter, so many
109 optimizations must be disabled if a multi-byte delimiter exists;
110 this is handled by m4__safe_quotes. Meanwhile, quotes and comments
111 can be disabled if the leading delimiter is length 0. */
113 static int add_syntax_attribute (m4_syntax_table *, char, int);
114 static int remove_syntax_attribute (m4_syntax_table *, char, int);
115 static void set_quote_age (m4_syntax_table *, bool, bool);
117 m4_syntax_table *
118 m4_syntax_create (void)
120 m4_syntax_table *syntax = (m4_syntax_table *) xzalloc (sizeof *syntax);
121 int ch;
123 /* Set up default table. This table never changes during operation,
124 and contains no context attributes. */
125 for (ch = UCHAR_MAX + 1; --ch >= 0; )
126 switch (ch)
128 case '(':
129 syntax->orig[ch] = M4_SYNTAX_OPEN;
130 break;
131 case ')':
132 syntax->orig[ch] = M4_SYNTAX_CLOSE;
133 break;
134 case ',':
135 syntax->orig[ch] = M4_SYNTAX_COMMA;
136 break;
137 case '`':
138 syntax->orig[ch] = M4_SYNTAX_LQUOTE;
139 break;
140 case '#':
141 syntax->orig[ch] = M4_SYNTAX_BCOMM;
142 break;
143 default:
144 if (isspace (ch))
145 syntax->orig[ch] = M4_SYNTAX_SPACE;
146 else if (isalpha (ch) || ch == '_')
147 syntax->orig[ch] = M4_SYNTAX_ALPHA;
148 else if (isdigit (ch))
149 syntax->orig[ch] = M4_SYNTAX_NUM;
150 else
151 syntax->orig[ch] = M4_SYNTAX_OTHER;
154 /* Set up current table to match default. */
155 m4_reset_syntax (syntax);
156 syntax->cached_simple.str1 = syntax->cached_lquote;
157 syntax->cached_simple.len1 = 1;
158 syntax->cached_simple.str2 = syntax->cached_rquote;
159 syntax->cached_simple.len2 = 1;
160 return syntax;
163 void
164 m4_syntax_delete (m4_syntax_table *syntax)
166 assert (syntax);
168 free (syntax->quote.str1);
169 free (syntax->quote.str2);
170 free (syntax->comm.str1);
171 free (syntax->comm.str2);
172 free (syntax);
176 m4_syntax_code (char ch)
178 int code;
180 switch (ch)
182 /* Sorted according to the order of M4_SYNTAX_* in m4module.h. */
183 /* FIXME - revisit the ignore syntax attribute. */
184 case 'I': case 'i': code = M4_SYNTAX_IGNORE; break;
185 /* Basic categories. */
186 case '@': code = M4_SYNTAX_ESCAPE; break;
187 case 'W': case 'w': code = M4_SYNTAX_ALPHA; break;
188 case 'L': case 'l': code = M4_SYNTAX_LQUOTE; break;
189 case 'B': case 'b': code = M4_SYNTAX_BCOMM; break;
190 case 'A': case 'a': code = M4_SYNTAX_ACTIVE; break;
191 case 'D': case 'd': code = M4_SYNTAX_NUM; break;
192 case 'S': case 's': code = M4_SYNTAX_SPACE; break;
193 case '(': code = M4_SYNTAX_OPEN; break;
194 case ')': code = M4_SYNTAX_CLOSE; break;
195 case ',': code = M4_SYNTAX_COMMA; break;
196 case 'O': case 'o': code = M4_SYNTAX_OTHER; break;
197 /* Context categories. */
198 case '$': code = M4_SYNTAX_DOLLAR; break;
199 case '{': code = M4_SYNTAX_LBRACE; break;
200 case '}': code = M4_SYNTAX_RBRACE; break;
201 case 'R': case 'r': code = M4_SYNTAX_RQUOTE; break;
202 case 'E': case 'e': code = M4_SYNTAX_ECOMM; break;
204 default: code = -1; break;
207 return code;
212 /* Functions to manipulate the syntax table. */
213 static int
214 add_syntax_attribute (m4_syntax_table *syntax, char ch, int code)
216 int c = to_uchar (ch);
217 if (code & M4_SYNTAX_MASKS)
219 syntax->table[c] |= code;
220 syntax->suspect = true;
222 else
224 if ((code & (M4_SYNTAX_SUSPECT)) != 0
225 || m4_has_syntax (syntax, c, M4_SYNTAX_SUSPECT))
226 syntax->suspect = true;
227 syntax->table[c] = ((syntax->table[c] & M4_SYNTAX_MASKS) | code);
230 #ifdef DEBUG_SYNTAX
231 xfprintf(stderr, "Set syntax %o %c = %04X\n", c, isprint(c) ? c : '-',
232 syntax->table[c]);
233 #endif
235 return syntax->table[c];
238 static int
239 remove_syntax_attribute (m4_syntax_table *syntax, char ch, int code)
241 int c = to_uchar (ch);
242 assert (code & M4_SYNTAX_MASKS);
243 syntax->table[c] &= ~code;
244 syntax->suspect = true;
246 #ifdef DEBUG_SYNTAX
247 xfprintf(stderr, "Unset syntax %o %c = %04X\n", c, isprint(c) ? c : '-',
248 syntax->table[c]);
249 #endif
251 return syntax->table[c];
254 /* Add the set CHARS of length LEN to syntax category CODE, removing
255 them from whatever category they used to be in. */
256 static void
257 add_syntax_set (m4_syntax_table *syntax, const char *chars, size_t len,
258 int code)
260 while (len--)
261 add_syntax_attribute (syntax, *chars++, code);
264 /* Remove the set CHARS of length LEN from syntax category CODE,
265 adding them to category M4_SYNTAX_OTHER instead. */
266 static void
267 subtract_syntax_set (m4_syntax_table *syntax, const char *chars, size_t len,
268 int code)
270 while (len--)
272 char ch = *chars++;
273 if ((code & M4_SYNTAX_MASKS) != 0)
274 remove_syntax_attribute (syntax, ch, code);
275 else if (m4_has_syntax (syntax, ch, code))
276 add_syntax_attribute (syntax, ch, M4_SYNTAX_OTHER);
280 /* Make the set CHARS of length LEN become syntax category CODE,
281 removing CHARS from any other categories, and sending all bytes in
282 the category but not in CHARS to category M4_SYNTAX_OTHER
283 instead. */
284 static void
285 set_syntax_set (m4_syntax_table *syntax, const char *chars, size_t len,
286 int code)
288 int ch;
289 /* Explicit set of characters to install with this category; all
290 other characters that used to have the category get reset to
291 OTHER. */
292 for (ch = UCHAR_MAX + 1; --ch >= 0; )
294 if ((code & M4_SYNTAX_MASKS) != 0)
295 remove_syntax_attribute (syntax, ch, code);
296 else if (m4_has_syntax (syntax, ch, code))
297 add_syntax_attribute (syntax, ch, M4_SYNTAX_OTHER);
299 while (len--)
301 ch = *chars++;
302 add_syntax_attribute (syntax, ch, code);
306 /* Reset syntax category CODE to its default state, sending all other
307 characters in the category back to their default state. */
308 static void
309 reset_syntax_set (m4_syntax_table *syntax, int code)
311 int ch;
312 for (ch = UCHAR_MAX + 1; --ch >= 0; )
314 /* Reset the category back to its default state. All other
315 characters that used to have this category get reset to
316 their default state as well. */
317 if (code == M4_SYNTAX_RQUOTE)
319 if (ch == '\'')
320 add_syntax_attribute (syntax, ch, code);
321 else
322 remove_syntax_attribute (syntax, ch, code);
324 else if (code == M4_SYNTAX_ECOMM)
326 if (ch == '\n')
327 add_syntax_attribute (syntax, ch, code);
328 else
329 remove_syntax_attribute (syntax, ch, code);
331 else if (code == M4_SYNTAX_DOLLAR)
333 if (ch == '$')
334 add_syntax_attribute (syntax, ch, code);
335 else
336 remove_syntax_attribute (syntax, ch, code);
338 else if (code == M4_SYNTAX_LBRACE)
340 if (ch == '{')
341 add_syntax_attribute (syntax, ch, code);
342 else
343 remove_syntax_attribute (syntax, ch, code);
345 else if (code == M4_SYNTAX_RBRACE)
347 if (ch == '}')
348 add_syntax_attribute (syntax, ch, code);
349 else
350 remove_syntax_attribute (syntax, ch, code);
352 else if (syntax->orig[ch] == code || m4_has_syntax (syntax, ch, code))
353 add_syntax_attribute (syntax, ch, syntax->orig[ch]);
357 /* Reset the syntax table to its default state. */
358 void
359 m4_reset_syntax (m4_syntax_table *syntax)
361 /* Restore the default syntax, which has known quote and comment
362 properties. */
363 memcpy (syntax->table, syntax->orig, sizeof syntax->orig);
365 free (syntax->quote.str1);
366 free (syntax->quote.str2);
367 free (syntax->comm.str1);
368 free (syntax->comm.str2);
370 /* The use of xmemdup0 is exploited by input.c. */
371 syntax->quote.str1 = xmemdup0 (DEF_LQUOTE, 1);
372 syntax->quote.len1 = 1;
373 syntax->quote.str2 = xmemdup0 (DEF_RQUOTE, 1);
374 syntax->quote.len2 = 1;
375 syntax->comm.str1 = xmemdup0 (DEF_BCOMM, 1);
376 syntax->comm.len1 = 1;
377 syntax->comm.str2 = xmemdup0 (DEF_ECOMM, 1);
378 syntax->comm.len2 = 1;
379 syntax->dollar = '$';
381 add_syntax_attribute (syntax, syntax->quote.str2[0], M4_SYNTAX_RQUOTE);
382 add_syntax_attribute (syntax, syntax->comm.str2[0], M4_SYNTAX_ECOMM);
383 add_syntax_attribute (syntax, '$', M4_SYNTAX_DOLLAR);
384 add_syntax_attribute (syntax, '{', M4_SYNTAX_LBRACE);
385 add_syntax_attribute (syntax, '}', M4_SYNTAX_RBRACE);
387 syntax->is_single_quotes = true;
388 syntax->is_single_comments = true;
389 syntax->is_single_dollar = true;
390 syntax->is_macro_escaped = false;
391 set_quote_age (syntax, true, false);
394 /* Alter the syntax for category KEY, according to ACTION: '+' to add,
395 '-' to subtract, '=' to set, or '\0' to reset. The array CHARS of
396 length LEN describes the characters to modify; it is ignored if
397 ACTION is '\0'. Return -1 if KEY is invalid, otherwise return the
398 syntax category matching KEY. */
400 m4_set_syntax (m4_syntax_table *syntax, char key, char action,
401 const char *chars, size_t len)
403 int code;
405 assert (syntax && chars);
406 code = m4_syntax_code (key);
407 if (code < 0)
409 return -1;
411 syntax->suspect = false;
412 switch (action)
414 case '+':
415 add_syntax_set (syntax, chars, len, code);
416 break;
417 case '-':
418 subtract_syntax_set (syntax, chars, len, code);
419 break;
420 case '=':
421 set_syntax_set (syntax, chars, len, code);
422 break;
423 case '\0':
424 assert (!len);
425 reset_syntax_set (syntax, code);
426 break;
427 default:
428 assert (false);
431 /* Check for any cleanup needed. */
432 if (syntax->suspect)
434 int ch;
435 int lquote = -1;
436 int rquote = -1;
437 int bcomm = -1;
438 int ecomm = -1;
439 bool single_quote_possible = true;
440 bool single_comm_possible = true;
441 int dollar = -1;
442 if (m4_has_syntax (syntax, syntax->quote.str1[0], M4_SYNTAX_LQUOTE))
444 assert (syntax->quote.len1 == 1);
445 lquote = to_uchar (syntax->quote.str1[0]);
447 if (m4_has_syntax (syntax, syntax->quote.str2[0], M4_SYNTAX_RQUOTE))
449 assert (syntax->quote.len2 == 1);
450 rquote = to_uchar (syntax->quote.str2[0]);
452 if (m4_has_syntax (syntax, syntax->comm.str1[0], M4_SYNTAX_BCOMM))
454 assert (syntax->comm.len1 == 1);
455 bcomm = to_uchar (syntax->comm.str1[0]);
457 if (m4_has_syntax (syntax, syntax->comm.str2[0], M4_SYNTAX_ECOMM))
459 assert (syntax->comm.len2 == 1);
460 ecomm = to_uchar (syntax->comm.str2[0]);
462 syntax->is_single_dollar = false;
463 syntax->is_macro_escaped = false;
464 /* Find candidates for each category. */
465 for (ch = UCHAR_MAX + 1; --ch >= 0; )
467 if (m4_has_syntax (syntax, ch, M4_SYNTAX_LQUOTE))
469 if (lquote == -1)
470 lquote = ch;
471 else if (lquote != ch)
472 single_quote_possible = false;
474 if (m4_has_syntax (syntax, ch, M4_SYNTAX_RQUOTE))
476 if (rquote == -1)
477 rquote = ch;
478 else if (rquote != ch)
479 single_quote_possible = false;
481 if (m4_has_syntax (syntax, ch, M4_SYNTAX_BCOMM))
483 if (bcomm == -1)
484 bcomm = ch;
485 else if (bcomm != ch)
486 single_comm_possible = false;
488 if (m4_has_syntax (syntax, ch, M4_SYNTAX_ECOMM))
490 if (ecomm == -1)
491 ecomm = ch;
492 else if (ecomm != ch)
493 single_comm_possible = false;
495 if (m4_has_syntax (syntax, ch, M4_SYNTAX_DOLLAR))
497 if (dollar == -1)
499 syntax->dollar = dollar = ch;
500 syntax->is_single_dollar = true;
502 else
503 syntax->is_single_dollar = false;
505 if (m4_has_syntax (syntax, ch, M4_SYNTAX_ESCAPE))
506 syntax->is_macro_escaped = true;
508 /* Disable multi-character delimiters if we discovered
509 delimiters. */
510 if (!single_quote_possible)
511 syntax->is_single_quotes = false;
512 if (!single_comm_possible)
513 syntax->is_single_comments = false;
514 if ((1 < syntax->quote.len1 || 1 < syntax->quote.len2)
515 && (!syntax->is_single_quotes || lquote != -1 || rquote != -1))
517 if (syntax->quote.len1)
519 syntax->quote.len1 = lquote == to_uchar (syntax->quote.str1[0]);
520 syntax->quote.str1[syntax->quote.len1] = '\0';
522 if (syntax->quote.len2)
524 syntax->quote.len2 = rquote == to_uchar (syntax->quote.str2[0]);
525 syntax->quote.str2[syntax->quote.len2] = '\0';
528 if ((1 < syntax->comm.len1 || 1 < syntax->comm.len2)
529 && (!syntax->is_single_comments || bcomm != -1 || ecomm != -1))
531 if (syntax->comm.len1)
533 syntax->comm.len1 = bcomm == to_uchar (syntax->comm.str1[0]);
534 syntax->comm.str1[syntax->comm.len1] = '\0';
536 if (syntax->comm.len2)
538 syntax->comm.len2 = ecomm == to_uchar (syntax->comm.str2[0]);
539 syntax->comm.str2[syntax->comm.len2] = '\0';
542 /* Update the strings. */
543 if (lquote != -1)
545 if (single_quote_possible)
546 syntax->is_single_quotes = true;
547 if (syntax->quote.len1)
548 assert (syntax->quote.len1 == 1);
549 else
551 free (syntax->quote.str1);
552 syntax->quote.str1 = xcharalloc (2);
553 syntax->quote.str1[1] = '\0';
554 syntax->quote.len1 = 1;
556 syntax->quote.str1[0] = lquote;
557 if (rquote == -1)
559 rquote = '\'';
560 add_syntax_attribute (syntax, rquote, M4_SYNTAX_RQUOTE);
562 if (!syntax->quote.len2)
564 free (syntax->quote.str2);
565 syntax->quote.str2 = xcharalloc (2);
567 syntax->quote.str2[0] = rquote;
568 syntax->quote.str2[1] = '\0';
569 syntax->quote.len2 = 1;
571 if (bcomm != -1)
573 if (single_comm_possible)
574 syntax->is_single_comments = true;
575 if (syntax->comm.len1)
576 assert (syntax->comm.len1 == 1);
577 else
579 free (syntax->comm.str1);
580 syntax->comm.str1 = xcharalloc (2);
581 syntax->comm.str1[1] = '\0';
582 syntax->comm.len1 = 1;
584 syntax->comm.str1[0] = bcomm;
585 if (ecomm == -1)
587 ecomm = '\n';
588 add_syntax_attribute (syntax, ecomm, M4_SYNTAX_ECOMM);
590 if (!syntax->comm.len2)
592 free (syntax->comm.str2);
593 syntax->comm.str2 = xcharalloc (2);
595 syntax->comm.str2[0] = ecomm;
596 syntax->comm.str2[1] = '\0';
597 syntax->comm.len2 = 1;
600 set_quote_age (syntax, false, true);
601 m4__quote_uncache (syntax);
602 return code;
606 /* Functions for setting quotes and comment delimiters. Used by
607 m4_changecom () and m4_changequote (). Both functions override the
608 syntax table to maintain compatibility. */
610 /* Set the quote delimiters to LQ and RQ, with respective lengths
611 LQ_LEN and RQ_LEN. Pass NULL if the argument was not present, to
612 distinguish from an explicit empty string. */
613 void
614 m4_set_quotes (m4_syntax_table *syntax, const char *lq, size_t lq_len,
615 const char *rq, size_t rq_len)
617 int ch;
619 assert (syntax);
621 /* POSIX states that with 0 arguments, the default quotes are used.
622 POSIX XCU ERN 112 states that behavior is implementation-defined
623 if there was only one argument, or if there is an empty string in
624 either position when there are two arguments. We allow an empty
625 left quote to disable quoting, but a non-empty left quote will
626 always create a non-empty right quote. See the texinfo for what
627 some other implementations do. */
628 if (!lq)
630 lq = DEF_LQUOTE;
631 lq_len = 1;
632 rq = DEF_RQUOTE;
633 rq_len = 1;
635 else if (!rq || (lq_len && !rq_len))
637 rq = DEF_RQUOTE;
638 rq_len = 1;
641 if (syntax->quote.len1 == lq_len && syntax->quote.len2 == rq_len
642 && memcmp (syntax->quote.str1, lq, lq_len) == 0
643 && memcmp (syntax->quote.str2, rq, rq_len) == 0)
644 return;
646 free (syntax->quote.str1);
647 free (syntax->quote.str2);
648 /* The use of xmemdup0 is exploited by input.c. */
649 syntax->quote.str1 = xmemdup0 (lq, lq_len);
650 syntax->quote.len1 = lq_len;
651 syntax->quote.str2 = xmemdup0 (rq, rq_len);
652 syntax->quote.len2 = rq_len;
654 /* changequote overrides syntax_table, but be careful when it is
655 used to select a start-quote sequence that is effectively
656 disabled. */
657 syntax->is_single_quotes = true;
658 for (ch = UCHAR_MAX + 1; --ch >= 0; )
660 if (m4_has_syntax (syntax, ch, M4_SYNTAX_LQUOTE))
661 add_syntax_attribute (syntax, ch,
662 (syntax->orig[ch] == M4_SYNTAX_LQUOTE
663 ? M4_SYNTAX_OTHER : syntax->orig[ch]));
664 if (m4_has_syntax (syntax, ch, M4_SYNTAX_RQUOTE))
665 remove_syntax_attribute (syntax, ch, M4_SYNTAX_RQUOTE);
668 if (!m4_has_syntax (syntax, *syntax->quote.str1,
669 (M4_SYNTAX_IGNORE | M4_SYNTAX_ESCAPE | M4_SYNTAX_ALPHA
670 | M4_SYNTAX_NUM)))
672 if (syntax->quote.len1 == 1)
673 add_syntax_attribute (syntax, syntax->quote.str1[0], M4_SYNTAX_LQUOTE);
674 if (syntax->quote.len2 == 1)
675 add_syntax_attribute (syntax, syntax->quote.str2[0], M4_SYNTAX_RQUOTE);
677 set_quote_age (syntax, false, false);
680 /* Set the comment delimiters to BC and EC, with respective lengths
681 BC_LEN and EC_LEN. Pass NULL if the argument was not present, to
682 distinguish from an explicit empty string. */
683 void
684 m4_set_comment (m4_syntax_table *syntax, const char *bc, size_t bc_len,
685 const char *ec, size_t ec_len)
687 int ch;
689 assert (syntax);
691 /* POSIX requires no arguments to disable comments, and that one
692 argument use newline as the close-comment. POSIX XCU ERN 131
693 states that empty arguments invoke implementation-defined
694 behavior. We allow an empty begin comment to disable comments,
695 and a non-empty begin comment will always create a non-empty end
696 comment. See the texinfo for what some other implementations
697 do. */
698 if (!bc)
700 bc = ec = "";
701 bc_len = ec_len = 0;
703 else if (!ec || (bc_len && !ec_len))
705 ec = DEF_ECOMM;
706 ec_len = 1;
709 if (syntax->comm.len1 == bc_len && syntax->comm.len2 == ec_len
710 && memcmp (syntax->comm.str1, bc, bc_len) == 0
711 && memcmp (syntax->comm.str2, ec, ec_len) == 0)
712 return;
714 free (syntax->comm.str1);
715 free (syntax->comm.str2);
716 /* The use of xmemdup0 is exploited by input.c. */
717 syntax->comm.str1 = xmemdup0 (bc, bc_len);
718 syntax->comm.len1 = bc_len;
719 syntax->comm.str2 = xmemdup0 (ec, ec_len);
720 syntax->comm.len2 = ec_len;
722 /* changecom overrides syntax_table, but be careful when it is used
723 to select a start-comment sequence that is effectively
724 disabled. */
725 syntax->is_single_comments = true;
726 for (ch = UCHAR_MAX + 1; --ch >= 0; )
728 if (m4_has_syntax (syntax, ch, M4_SYNTAX_BCOMM))
729 add_syntax_attribute (syntax, ch,
730 (syntax->orig[ch] == M4_SYNTAX_BCOMM
731 ? M4_SYNTAX_OTHER : syntax->orig[ch]));
732 if (m4_has_syntax (syntax, ch, M4_SYNTAX_ECOMM))
733 remove_syntax_attribute (syntax, ch, M4_SYNTAX_ECOMM);
735 if (!m4_has_syntax (syntax, *syntax->comm.str1,
736 (M4_SYNTAX_IGNORE | M4_SYNTAX_ESCAPE | M4_SYNTAX_ALPHA
737 | M4_SYNTAX_NUM | M4_SYNTAX_LQUOTE)))
739 if (syntax->comm.len1 == 1)
740 add_syntax_attribute (syntax, syntax->comm.str1[0], M4_SYNTAX_BCOMM);
741 if (syntax->comm.len2 == 1)
742 add_syntax_attribute (syntax, syntax->comm.str2[0], M4_SYNTAX_ECOMM);
744 set_quote_age (syntax, false, false);
747 /* Call this when changing anything that might impact the quote age,
748 so that m4__quote_age and m4__safe_quotes will reflect the change.
749 If RESET, changesyntax was reset to its default stage; if CHANGE,
750 arbitrary syntax has changed; otherwise, just quotes or comment
751 delimiters have changed. */
752 static void
753 set_quote_age (m4_syntax_table *syntax, bool reset, bool change)
755 /* Multi-character quotes are inherently unsafe, since concatenation
756 of individual characters can result in a quote delimiter,
757 consider:
759 define(echo,``$1'')define(a,A)changequote(<[,]>)echo(<[]]><[>a]>)
760 => A]> (not ]>a)
762 Also, unquoted close delimiters are unsafe, consider:
764 define(echo,``$1'')define(a,A)echo(`a''`a')
765 => aA' (not a'a)
767 Duplicated start and end quote delimiters, as well as comment
768 delimiters that overlap with quote delimiters or active characters,
769 also present a problem, consider:
771 define(echo,$*)echo(a,a,a`'define(a,A)changecom(`,',`,'))
772 => A,a,A (not A,A,A)
774 The impact of arbitrary changesyntax is difficult to characterize.
775 So if things are in their default state, we use 0 for the upper 16
776 bits of quote_age; otherwise we increment syntax_age for each
777 changesyntax, but saturate it at 0xffff rather than wrapping
778 around. Perhaps a cache of other frequently used states is
779 warranted, if changesyntax becomes more popular.
781 Perhaps someday we will fix $@ expansion to use the current
782 settings of the comma category, or even allow multi-character
783 argument separators via changesyntax. Until then, we use a literal
784 `,' in $@ expansion, therefore we must insist that `,' be an
785 argument separator for quote_age to be non-zero.
787 Rather than check every token for an unquoted delimiter, we merely
788 encode current_quote_age to 0 when things are unsafe, and non-zero
789 when safe (namely, the syntax_age in the upper 16 bits, coupled
790 with the 16-bit value composed of the single-character start and
791 end quote delimiters). There may be other situations which are
792 safe even when this algorithm sets the quote_age to zero, but at
793 least a quote_age of zero always produces correct results (although
794 it may take more time in doing so). */
796 unsigned short local_syntax_age;
797 if (reset)
798 local_syntax_age = 0;
799 else if (change && syntax->syntax_age < 0xffff)
800 local_syntax_age = ++syntax->syntax_age;
801 else
802 local_syntax_age = syntax->syntax_age;
803 if (local_syntax_age < 0xffff && syntax->is_single_quotes
804 && syntax->quote.len1 == 1 && syntax->quote.len2 == 1
805 && !m4_has_syntax (syntax, *syntax->quote.str1,
806 (M4_SYNTAX_ALPHA | M4_SYNTAX_NUM | M4_SYNTAX_OPEN
807 | M4_SYNTAX_COMMA | M4_SYNTAX_CLOSE
808 | M4_SYNTAX_SPACE))
809 && !m4_has_syntax (syntax, *syntax->quote.str2,
810 (M4_SYNTAX_ALPHA | M4_SYNTAX_NUM | M4_SYNTAX_OPEN
811 | M4_SYNTAX_COMMA | M4_SYNTAX_CLOSE
812 | M4_SYNTAX_SPACE))
813 && *syntax->quote.str1 != *syntax->quote.str2
814 && (!syntax->comm.len1
815 || (*syntax->comm.str1 != *syntax->quote.str2
816 && !m4_has_syntax (syntax, *syntax->comm.str1,
817 (M4_SYNTAX_OPEN | M4_SYNTAX_COMMA
818 | M4_SYNTAX_CLOSE))))
819 && m4_has_syntax (syntax, ',', M4_SYNTAX_COMMA))
821 syntax->quote_age = ((local_syntax_age << 16)
822 | ((*syntax->quote.str1 & 0xff) << 8)
823 | (*syntax->quote.str2 & 0xff));
825 else
826 syntax->quote_age = 0;
829 /* Interface for caching frequently used quote pairs, independently of
830 the current quote delimiters (for example, consider a text macro
831 expansion that includes several copies of $@), and using AGE for
832 optimization. If QUOTES is NULL, don't use quoting. If OBS is
833 non-NULL, AGE should be the current quote age, and QUOTES should be
834 m4_get_syntax_quotes; the return value will be a cached quote pair,
835 where the pointer is valid at least as long as OBS is not reset,
836 but whose contents are only guaranteed until the next changequote
837 or quote_cache. Otherwise, OBS is NULL, AGE should be the same as
838 before, and QUOTES should be a previously returned cache value;
839 used to refresh the contents of the result. */
840 const m4_string_pair *
841 m4__quote_cache (m4_syntax_table *syntax, m4_obstack *obs, unsigned int age,
842 const m4_string_pair *quotes)
844 /* Implementation - if AGE is non-zero, then the implementation of
845 set_quote_age guarantees that we can recreate the return value on
846 the fly; so we use static storage, and the contents must be used
847 immediately. If AGE is zero, then we must copy QUOTES onto OBS,
848 but we might as well cache that copy. */
849 if (!quotes)
850 return NULL;
851 if (age)
853 *syntax->cached_lquote = (age >> 8) & 0xff;
854 *syntax->cached_rquote = age & 0xff;
855 return &syntax->cached_simple;
857 if (!obs)
858 return quotes;
859 assert (quotes == &syntax->quote);
860 if (!syntax->cached_quote)
862 assert (obstack_object_size (obs) == 0);
863 syntax->cached_quote = (m4_string_pair *) obstack_copy (obs, quotes,
864 sizeof *quotes);
865 syntax->cached_quote->str1 = (char *) obstack_copy0 (obs, quotes->str1,
866 quotes->len1);
867 syntax->cached_quote->str2 = (char *) obstack_copy0 (obs, quotes->str2,
868 quotes->len2);
870 return syntax->cached_quote;
874 /* Define these functions at the end, so that calls in the file use the
875 faster macro version from m4module.h. */
876 #undef m4_get_syntax_lquote
877 const char *
878 m4_get_syntax_lquote (m4_syntax_table *syntax)
880 assert (syntax);
881 return syntax->quote.str1;
884 #undef m4_get_syntax_rquote
885 const char *
886 m4_get_syntax_rquote (m4_syntax_table *syntax)
888 assert (syntax);
889 return syntax->quote.str2;
892 #undef m4_get_syntax_quotes
893 const m4_string_pair *
894 m4_get_syntax_quotes (m4_syntax_table *syntax)
896 assert (syntax);
897 return &syntax->quote;
900 #undef m4_is_syntax_single_quotes
901 bool
902 m4_is_syntax_single_quotes (m4_syntax_table *syntax)
904 assert (syntax);
905 return syntax->is_single_quotes;
908 #undef m4_get_syntax_bcomm
909 const char *
910 m4_get_syntax_bcomm (m4_syntax_table *syntax)
912 assert (syntax);
913 return syntax->comm.str1;
916 #undef m4_get_syntax_ecomm
917 const char *
918 m4_get_syntax_ecomm (m4_syntax_table *syntax)
920 assert (syntax);
921 return syntax->comm.str2;
924 #undef m4_get_syntax_comments
925 const m4_string_pair *
926 m4_get_syntax_comments (m4_syntax_table *syntax)
928 assert (syntax);
929 return &syntax->comm;
932 #undef m4_is_syntax_single_comments
933 bool
934 m4_is_syntax_single_comments (m4_syntax_table *syntax)
936 assert (syntax);
937 return syntax->is_single_comments;
940 #undef m4_is_syntax_single_dollar
941 bool
942 m4_is_syntax_single_dollar (m4_syntax_table *syntax)
944 assert (syntax);
945 return syntax->is_single_dollar;
948 #undef m4_is_syntax_macro_escaped
949 bool
950 m4_is_syntax_macro_escaped (m4_syntax_table *syntax)
952 assert (syntax);
953 return syntax->is_macro_escaped;