Fix gas's 'macro count' test for various targets
[binutils-gdb.git] / gas / app.c
blob041941a19266164ebf1e61755fb8a3db0719ddd7
1 /* This is the Assembler Pre-Processor
2 Copyright (C) 1987-2024 Free Software Foundation, Inc.
4 This file is part of GAS, the GNU Assembler.
6 GAS is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GAS is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
14 License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GAS; see the file COPYING. If not, write to the Free
18 Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
19 02110-1301, USA. */
21 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90. */
22 /* App, the assembler pre-processor. This pre-processor strips out
23 excess spaces, turns single-quoted characters into a decimal
24 constant, and turns the # in # <number> <filename> <garbage> into a
25 .linefile. This needs better error-handling. */
27 #include "as.h"
/* When the compiler does not report strict ISO C conformance
   (__STDC__ != 1), make `const' expand to nothing unless it is
   already defined as a macro.  */
#if __STDC__ != 1
# if !defined (const)
#  define const /* empty */
# endif
#endif

#if defined (H_TICK_HEX)
/* When non-zero, do_scrub_begin classifies 'h' and 'H' as LEX_IS_H so
   that the scrubber can rewrite H'<hex digits> as 0x<hex digits>.  */
int enable_h_tick_hex = 0;
#endif

#if !defined (TC_M68K)
/* Without TC_M68K there is no MRI scrubbing mode; the flag is the
   constant 0.  */
# define scrub_m68k_mri 0
#else
/* Whether we are scrubbing in m68k MRI mode.  This is kept separate
   from flag_m68k_mri because the .mri pseudo-op affects the two flags
   at different times.  */
static int scrub_m68k_mri;

/* Template for the pseudo-op which switches MRI mode on and off.
   do_scrub_chars matches input against it, also accepting '1' where
   the template has '0'.  */
static const char mri_pseudo[] = ".mri 0";
#endif

#if defined (TC_ARM) && defined (OBJ_ELF)
/* The pseudo-op on whose lines `@' must not be treated as a comment
   character.  See the comment in do_scrub_chars.  */
static const char symver_pseudo[] = ".symver";

/* Progress through symver_pseudo on the current line, or NULL when no
   match is under way.  */
static const char *symver_state;
#endif
/* A character remembered across do_scrub_chars calls: it is consulted
   in place of the previous output character when the output buffer is
   still empty (used for the backslash-before-comment-char check), and
   it is saved and restored by app_push / app_pop.  */
static char last_char;

/* Lexical class of every possible input byte; filled in by
   do_scrub_begin.  A class of 0 means "no special meaning".  */
static char lex[256];

/* Characters that are always symbol components.  */
static const char symbol_chars[] =
  "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";

/* Lexical class codes stored in lex[].  The value 7 is not used.  */
#define LEX_IS_SYMBOL_COMPONENT      1
#define LEX_IS_WHITESPACE            2
#define LEX_IS_LINE_SEPARATOR        3
#define LEX_IS_COMMENT_START         4
#define LEX_IS_LINE_COMMENT_START    5
#define LEX_IS_TWOCHAR_COMMENT_1ST   6
#define LEX_IS_STRINGQUOTE           8
#define LEX_IS_COLON                 9
#define LEX_IS_NEWLINE               10
#define LEX_IS_ONECHAR_QUOTE         11
#if defined (TC_V850)
# define LEX_IS_DOUBLEDASH_1ST       12
#endif
#if defined (TC_M32R)
# define DOUBLEBAR_PARALLEL
#endif
#if defined (DOUBLEBAR_PARALLEL)
# define LEX_IS_DOUBLEBAR_1ST        13
#endif
#define LEX_IS_PARALLEL_SEPARATOR    14
#if defined (H_TICK_HEX)
# define LEX_IS_H                    15
#endif

/* Class predicates.  C is used directly as an index into lex[], so it
   must be in the range 0..255 (in particular, never EOF).  */
#define IS_NEWLINE(c)            (lex[c] == LEX_IS_NEWLINE)
#define IS_WHITESPACE(c)         (lex[c] == LEX_IS_WHITESPACE)
#define IS_SYMBOL_COMPONENT(c)   (lex[c] == LEX_IS_SYMBOL_COMPONENT)
#define IS_LINE_SEPARATOR(c)     (lex[c] == LEX_IS_LINE_SEPARATOR)
#define IS_PARALLEL_SEPARATOR(c) (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
#define IS_COMMENT(c)            (lex[c] == LEX_IS_COMMENT_START)
#define IS_LINE_COMMENT(c)       (lex[c] == LEX_IS_LINE_COMMENT_START)
96 static int process_escape (int);
98 /* FIXME-soon: The entire lexer/parser thingy should be
99 built statically at compile time rather than dynamically
100 each and every time the assembler is run. xoxorich. */
102 void
103 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
105 const char *p;
106 int c;
108 lex[' '] = LEX_IS_WHITESPACE;
109 lex['\t'] = LEX_IS_WHITESPACE;
110 lex['\r'] = LEX_IS_WHITESPACE;
111 lex['\n'] = LEX_IS_NEWLINE;
112 lex[':'] = LEX_IS_COLON;
114 #ifdef TC_M68K
115 scrub_m68k_mri = m68k_mri;
117 if (! m68k_mri)
118 #endif
120 lex['"'] = LEX_IS_STRINGQUOTE;
122 #if ! defined (TC_HPPA)
123 lex['\''] = LEX_IS_ONECHAR_QUOTE;
124 #endif
126 #ifdef SINGLE_QUOTE_STRINGS
127 lex['\''] = LEX_IS_STRINGQUOTE;
128 #endif
131 /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
132 in state 5 of do_scrub_chars must be changed. */
134 /* Note that these override the previous defaults, e.g. if ';' is a
135 comment char, then it isn't a line separator. */
136 for (p = symbol_chars; *p; ++p)
137 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
139 for (c = 128; c < 256; ++c)
140 lex[c] = LEX_IS_SYMBOL_COMPONENT;
142 #ifdef tc_symbol_chars
143 /* This macro permits the processor to specify all characters which
144 may appears in an operand. This will prevent the scrubber from
145 discarding meaningful whitespace in certain cases. The i386
146 backend uses this to support prefixes, which can confuse the
147 scrubber as to whether it is parsing operands or opcodes. */
148 for (p = tc_symbol_chars; *p; ++p)
149 lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
150 #endif
152 /* The m68k backend wants to be able to change comment_chars. */
153 #ifndef tc_comment_chars
154 #define tc_comment_chars comment_chars
155 #endif
156 for (p = tc_comment_chars; *p; p++)
157 lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
159 for (p = line_comment_chars; *p; p++)
160 lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
162 #ifndef tc_line_separator_chars
163 #define tc_line_separator_chars line_separator_chars
164 #endif
165 for (p = tc_line_separator_chars; *p; p++)
166 lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
168 #ifdef tc_parallel_separator_chars
169 /* This macro permits the processor to specify all characters which
170 separate parallel insns on the same line. */
171 for (p = tc_parallel_separator_chars; *p; p++)
172 lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
173 #endif
175 /* Only allow slash-star comments if slash is not in use.
176 FIXME: This isn't right. We should always permit them. */
177 if (lex['/'] == 0)
178 lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
180 #ifdef TC_M68K
181 if (m68k_mri)
183 lex['\''] = LEX_IS_STRINGQUOTE;
184 lex[';'] = LEX_IS_COMMENT_START;
185 lex['*'] = LEX_IS_LINE_COMMENT_START;
186 /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
187 then it can't be used in an expression. */
188 lex['!'] = LEX_IS_LINE_COMMENT_START;
190 #endif
192 #ifdef TC_V850
193 lex['-'] = LEX_IS_DOUBLEDASH_1ST;
194 #endif
195 #ifdef DOUBLEBAR_PARALLEL
196 lex['|'] = LEX_IS_DOUBLEBAR_1ST;
197 #endif
198 #ifdef TC_D30V
199 /* Must do this is we want VLIW instruction with "->" or "<-". */
200 lex['-'] = LEX_IS_SYMBOL_COMPONENT;
201 #endif
203 #ifdef H_TICK_HEX
204 if (enable_h_tick_hex)
206 lex['h'] = LEX_IS_H;
207 lex['H'] = LEX_IS_H;
209 #endif
/* Saved state of the scrubber.  All of it persists between calls to
   do_scrub_chars.  */

/* Current state of the do_scrub_chars state machine, and the state to
   return to once a string, comment or queued output has been dealt
   with.  */
static int state;
static int old_state;

/* Text still to be emitted while in state -1, and scratch space that
   out_string may point into (it holds the decimal value of a
   one-character quote).  */
static const char *out_string;
static char out_buf[20];

/* Count of newlines swallowed inside comments or continued strings
   that still have to be written out.  */
static int add_newlines;

/* Unconsumed input left over from the previous do_scrub_chars call
   (NULL if none) and its length.  */
static char *saved_input;
static size_t saved_input_len;

/* Buffer handed to the GET callback to be filled with raw input.  */
static char input_buffer[32 * 1024];

/* Progress through the .mri pseudo-op template (NULL when no match is
   under way) and the last character matched against it.  */
static const char *mri_state;
static char mri_last_ch;

/* Data structure for saving the state of app across #include's.  Note
   that app is called asynchronously to the parsing of the .include's,
   so our state at the time .include is interpreted is completely
   unrelated.  That's why we have to save it all.  */

struct app_save
{
  int state;
  int old_state;
  const char *out_string;
  char out_buf[sizeof out_buf];
  int add_newlines;
  char *saved_input;		/* Heap copy owned by this record.  */
  size_t saved_input_len;
#ifdef TC_M68K
  int scrub_m68k_mri;
#endif
  const char *mri_state;
  char mri_last_ch;
#if defined TC_ARM && defined OBJ_ELF
  const char *symver_state;
#endif
  char last_char;
};
249 char *
250 app_push (void)
252 struct app_save *saved;
254 saved = XNEW (struct app_save);
255 saved->state = state;
256 saved->old_state = old_state;
257 saved->out_string = out_string;
258 memcpy (saved->out_buf, out_buf, sizeof (out_buf));
259 saved->add_newlines = add_newlines;
260 if (saved_input == NULL)
261 saved->saved_input = NULL;
262 else
264 saved->saved_input = XNEWVEC (char, saved_input_len);
265 memcpy (saved->saved_input, saved_input, saved_input_len);
266 saved->saved_input_len = saved_input_len;
268 #ifdef TC_M68K
269 saved->scrub_m68k_mri = scrub_m68k_mri;
270 #endif
271 saved->mri_state = mri_state;
272 saved->mri_last_ch = mri_last_ch;
273 #if defined TC_ARM && defined OBJ_ELF
274 saved->symver_state = symver_state;
275 #endif
276 saved->last_char = last_char;
278 /* do_scrub_begin() is not useful, just wastes time. */
280 state = 0;
281 saved_input = NULL;
282 add_newlines = 0;
284 return (char *) saved;
287 void
288 app_pop (char *arg)
290 struct app_save *saved = (struct app_save *) arg;
292 /* There is no do_scrub_end (). */
293 state = saved->state;
294 old_state = saved->old_state;
295 out_string = saved->out_string;
296 memcpy (out_buf, saved->out_buf, sizeof (out_buf));
297 add_newlines = saved->add_newlines;
298 if (saved->saved_input == NULL)
299 saved_input = NULL;
300 else
302 gas_assert (saved->saved_input_len <= sizeof (input_buffer));
303 memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
304 saved_input = input_buffer;
305 saved_input_len = saved->saved_input_len;
306 free (saved->saved_input);
308 #ifdef TC_M68K
309 scrub_m68k_mri = saved->scrub_m68k_mri;
310 #endif
311 mri_state = saved->mri_state;
312 mri_last_ch = saved->mri_last_ch;
313 #if defined TC_ARM && defined OBJ_ELF
314 symver_state = saved->symver_state;
315 #endif
316 last_char = saved->last_char;
318 free (arg);
/* Translate CH, the character that followed a backslash, into the
   character the escape sequence stands for.  Characters which are not
   a recognised escape letter are returned unchanged.

   @@ This assumes that \n &c are the same on host and target.  This is
   not necessarily true.  */

static int
process_escape (int ch)
{
  static const struct
  {
    int letter;
    int value;
  }
  escapes[] =
  {
    { 'b', '\b' },
    { 'f', '\f' },
    { 'n', '\n' },
    { 'r', '\r' },
    { 't', '\t' },
    { '\'', '\'' },
    { '"', '\"' }
  };
  unsigned int i;

  for (i = 0; i < sizeof (escapes) / sizeof (escapes[0]); i++)
    if (escapes[i].letter == ch)
      return escapes[i].value;

  return ch;
}
/* Maximum number of multibyte-character warnings issued over the whole
   run, and how many have been issued so far.  */
#define MULTIBYTE_WARN_COUNT_LIMIT 10
static unsigned int multibyte_warn_count = 0;

/* Scan the bytes in [START, END) for any byte with the top bit set
   (value above 0x7f), which is what this function treats as part of a
   multibyte character.

   If WARN is false, return true as soon as one such byte is found and
   false if there is none; nothing is reported.

   If WARN is true, issue a warning for each such byte, giving the
   location from as_where when it is known, and return true if at
   least one was found.  After MULTIBYTE_WARN_COUNT_LIMIT warnings in
   total, a final "suppressed" notice is issued and every later
   warn-mode call returns false without scanning.

   Returns false for an empty or inverted range.  */

bool
scan_for_multibyte_characters (const unsigned char *start,
                               const unsigned char *end,
                               bool warn)
{
  if (end <= start)
    return false;

  /* Use >= here.  The counter is left equal to the limit when the
     suppression notice is issued, so testing with > would let the next
     call through, and since the loop below would then step the counter
     past the limit, that call would warn about every remaining
     multibyte byte in its buffer.  */
  if (warn && multibyte_warn_count >= MULTIBYTE_WARN_COUNT_LIMIT)
    return false;

  bool found = false;

  while (start < end)
    {
      unsigned char c = *start++;

      if (c <= 0x7f)
        continue;

      if (!warn)
        return true;

      found = true;

      const char *filename;
      unsigned int lineno;

      filename = as_where (&lineno);
      if (filename == NULL)
        as_warn (_("multibyte character (%#x) encountered in input"), c);
      else if (lineno == 0)
        as_warn (_("multibyte character (%#x) encountered in %s"), c, filename);
      else
        as_warn (_("multibyte character (%#x) encountered in %s at or near line %u"), c, filename, lineno);

      /* >= rather than == so that the cut-off still triggers even if
         the counter were ever to get past the limit.  */
      if (++multibyte_warn_count >= MULTIBYTE_WARN_COUNT_LIMIT)
        {
          as_warn (_("further multibyte character warnings suppressed"));
          break;
        }
    }

  return found;
}
397 /* This function is called to process input characters. The GET
398 parameter is used to retrieve more input characters. GET should
399 set its parameter to point to a buffer, and return the length of
400 the buffer; it should return 0 at end of file. The scrubbed output
401 characters are put into the buffer starting at TOSTART; the TOSTART
402 buffer is TOLEN bytes in length. The function returns the number
403 of scrubbed characters put into TOSTART. This will be TOLEN unless
404 end of file was seen. This function is arranged as a state
405 machine, and saves its state so that it may return at any point.
406 This is the way the old code used to work. */
408 size_t
409 do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
411 char *to = tostart;
412 char *toend = tostart + tolen;
413 char *from;
414 char *fromend;
415 size_t fromlen;
416 int ch, ch2 = 0;
417 /* Character that started the string we're working on. */
418 static char quotechar;
420 /*State 0: beginning of normal line
421 1: After first whitespace on line (flush more white)
422 2: After first non-white (opcode) on line (keep 1white)
423 3: after second white on line (into operands) (flush white)
424 4: after putting out a .linefile, put out digits
425 5: parsing a string, then go to old-state
426 6: putting out \ escape in a "d string.
427 7: no longer used
428 8: no longer used
429 9: After seeing symbol char in state 3 (keep 1white after symchar)
430 10: After seeing whitespace in state 9 (keep white before symchar)
431 11: After seeing a symbol character in state 0 (eg a label definition)
432 -1: output string in out_string and go to the state in old_state
433 -2: flush text until a '*' '/' is seen, then go to state old_state
434 #ifdef TC_V850
435 12: After seeing a dash, looking for a second dash as a start
436 of comment.
437 #endif
438 #ifdef DOUBLEBAR_PARALLEL
439 13: After seeing a vertical bar, looking for a second
440 vertical bar as a parallel expression separator.
441 #endif
442 #ifdef TC_PREDICATE_START_CHAR
443 14: After seeing a predicate start character at state 0, looking
444 for a predicate end character as predicate.
445 15: After seeing a predicate start character at state 1, looking
446 for a predicate end character as predicate.
447 #endif
448 #ifdef TC_Z80
449 16: After seeing an 'a' or an 'A' at the start of a symbol
450 17: After seeing an 'f' or an 'F' in state 16
451 #endif
454 /* I added states 9 and 10 because the MIPS ECOFF assembler uses
455 constructs like ``.loc 1 20''. This was turning into ``.loc
456 120''. States 9 and 10 ensure that a space is never dropped in
457 between characters which could appear in an identifier. Ian
458 Taylor, ian@cygnus.com.
460 I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
461 correctly on the PA (and any other target where colons are optional).
462 Jeff Law, law@cs.utah.edu.
464 I added state 13 so that something like "cmp r1, r2 || trap #1" does not
465 get squashed into "cmp r1,r2||trap#1", with the all important space
466 between the 'trap' and the '#1' being eliminated. nickc@cygnus.com */
468 /* This macro gets the next input character. */
470 #define GET() \
471 (from < fromend \
472 ? * (unsigned char *) (from++) \
473 : (saved_input = NULL, \
474 fromlen = (*get) (input_buffer, sizeof input_buffer), \
475 from = input_buffer, \
476 fromend = from + fromlen, \
477 (fromlen == 0 \
478 ? EOF \
479 : * (unsigned char *) (from++))))
481 /* This macro pushes a character back on the input stream. */
483 #define UNGET(uch) (*--from = (uch))
485 /* This macro puts a character into the output buffer. If this
486 character fills the output buffer, this macro jumps to the label
487 TOFULL. We use this rather ugly approach because we need to
488 handle two different termination conditions: EOF on the input
489 stream, and a full output buffer. It would be simpler if we
490 always read in the entire input stream before processing it, but
491 I don't want to make such a significant change to the assembler's
492 memory usage. */
494 #define PUT(pch) \
495 do \
497 *to++ = (pch); \
498 if (to >= toend) \
499 goto tofull; \
501 while (0)
503 if (saved_input != NULL)
505 from = saved_input;
506 fromend = from + saved_input_len;
508 else
510 fromlen = (*get) (input_buffer, sizeof input_buffer);
511 if (fromlen == 0)
512 return 0;
513 from = input_buffer;
514 fromend = from + fromlen;
516 if (multibyte_handling == multibyte_warn)
517 (void) scan_for_multibyte_characters ((const unsigned char *) from,
518 (const unsigned char* ) fromend,
519 true /* Generate warnings. */);
522 while (1)
524 /* The cases in this switch end with continue, in order to
525 branch back to the top of this while loop and generate the
526 next output character in the appropriate state. */
527 switch (state)
529 case -1:
530 ch = *out_string++;
531 if (*out_string == '\0')
533 state = old_state;
534 old_state = 3;
536 PUT (ch);
537 continue;
539 case -2:
540 for (;;)
544 ch = GET ();
546 if (ch == EOF)
548 as_warn (_("end of file in comment"));
549 goto fromeof;
552 if (ch == '\n')
553 PUT ('\n');
555 while (ch != '*');
557 while ((ch = GET ()) == '*')
560 if (ch == EOF)
562 as_warn (_("end of file in comment"));
563 goto fromeof;
566 if (ch == '/')
567 break;
569 UNGET (ch);
572 state = old_state;
573 UNGET (' ');
574 continue;
576 case 4:
577 ch = GET ();
578 if (ch == EOF)
579 goto fromeof;
580 else if (ch >= '0' && ch <= '9')
581 PUT (ch);
582 else
584 while (ch != EOF && IS_WHITESPACE (ch))
585 ch = GET ();
586 if (ch == '"')
588 quotechar = ch;
589 state = 5;
590 old_state = 3;
591 PUT (ch);
593 else
595 while (ch != EOF && ch != '\n')
596 ch = GET ();
597 state = 0;
598 PUT (ch);
601 continue;
603 case 5:
604 /* We are going to copy everything up to a quote character,
605 with special handling for a backslash. We try to
606 optimize the copying in the simple case without using the
607 GET and PUT macros. */
609 char *s;
610 ptrdiff_t len;
612 for (s = from; s < fromend; s++)
614 ch = *s;
615 if (ch == '\\'
616 || ch == quotechar
617 || ch == '\n')
618 break;
620 len = s - from;
621 if (len > toend - to)
622 len = toend - to;
623 if (len > 0)
625 memcpy (to, from, len);
626 to += len;
627 from += len;
628 if (to >= toend)
629 goto tofull;
633 ch = GET ();
634 if (ch == EOF)
636 /* This buffer is here specifically so
637 that the UNGET below will work. */
638 static char one_char_buf[1];
640 as_warn (_("end of file in string; '%c' inserted"), quotechar);
641 state = old_state;
642 from = fromend = one_char_buf + 1;
643 fromlen = 1;
644 UNGET ('\n');
645 PUT (quotechar);
647 else if (ch == quotechar)
649 state = old_state;
650 PUT (ch);
652 else if (TC_STRING_ESCAPES && ch == '\\')
654 state = 6;
655 PUT (ch);
657 else if (scrub_m68k_mri && ch == '\n')
659 /* Just quietly terminate the string. This permits lines like
660 bne label loop if we haven't reached the end yet. */
661 state = old_state;
662 UNGET (ch);
663 PUT ('\'');
665 else
667 PUT (ch);
669 continue;
671 case 6:
672 state = 5;
673 ch = GET ();
674 switch (ch)
676 /* Handle strings broken across lines, by turning '\n' into
677 '\\' and 'n'. */
678 case '\n':
679 UNGET ('n');
680 add_newlines++;
681 PUT ('\\');
682 continue;
684 case EOF:
685 as_warn (_("end of file in string; '%c' inserted"), quotechar);
686 PUT (quotechar);
687 continue;
689 /* These two are used inside macros. */
690 case '@':
691 case '+':
692 break;
694 case '"':
695 case '\\':
696 case 'b':
697 case 'f':
698 case 'n':
699 case 'r':
700 case 't':
701 case 'v':
702 case 'x':
703 case 'X':
704 case '0':
705 case '1':
706 case '2':
707 case '3':
708 case '4':
709 case '5':
710 case '6':
711 case '7':
712 break;
714 default:
715 #ifdef ONLY_STANDARD_ESCAPES
716 as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
717 #endif
718 break;
720 PUT (ch);
721 continue;
723 #ifdef DOUBLEBAR_PARALLEL
724 case 13:
725 ch = GET ();
726 if (ch != '|')
727 abort ();
729 /* Reset back to state 1 and pretend that we are parsing a
730 line from just after the first white space. */
731 state = 1;
732 PUT ('|');
733 #ifdef TC_TIC6X
734 /* "||^" is used for SPMASKed instructions. */
735 ch = GET ();
736 if (ch == EOF)
737 goto fromeof;
738 else if (ch == '^')
739 PUT ('^');
740 else
741 UNGET (ch);
742 #endif
743 continue;
744 #endif
745 #ifdef TC_Z80
746 case 16:
747 /* We have seen an 'a' at the start of a symbol, look for an 'f'. */
748 ch = GET ();
749 if (ch == 'f' || ch == 'F')
751 state = 17;
752 PUT (ch);
754 else
756 if (ch != EOF)
757 UNGET (ch);
758 state = 9;
759 break;
761 /* Fall through. */
762 case 17:
763 /* We have seen "af" at the start of a symbol,
764 a ' here is a part of that symbol. */
765 ch = GET ();
766 state = 9;
767 if (ch == '\'')
768 /* Change to avoid warning about unclosed string. */
769 PUT ('`');
770 else if (ch != EOF)
771 UNGET (ch);
772 break;
773 #endif
776 /* OK, we are somewhere in states 0 through 4 or 9 through 11. */
778 /* flushchar: */
779 ch = GET ();
781 #ifdef TC_PREDICATE_START_CHAR
782 if (ch == TC_PREDICATE_START_CHAR && (state == 0 || state == 1))
784 state += 14;
785 PUT (ch);
786 continue;
788 else if (state == 14 || state == 15)
790 if (ch == TC_PREDICATE_END_CHAR)
792 state -= 14;
793 PUT (ch);
794 ch = GET ();
796 else
798 PUT (ch);
799 continue;
802 #endif
804 recycle:
806 #if defined TC_ARM && defined OBJ_ELF
807 /* We need to watch out for .symver directives. See the comment later
808 in this function. */
809 if (symver_state == NULL)
811 if ((state == 0 || state == 1) && ch == symver_pseudo[0])
812 symver_state = symver_pseudo + 1;
814 else
816 /* We advance to the next state if we find the right
817 character. */
818 if (ch != '\0' && (*symver_state == ch))
819 ++symver_state;
820 else if (*symver_state != '\0')
821 /* We did not get the expected character, or we didn't
822 get a valid terminating character after seeing the
823 entire pseudo-op, so we must go back to the beginning. */
824 symver_state = NULL;
825 else
827 /* We've read the entire pseudo-op. If this is the end
828 of the line, go back to the beginning. */
829 if (IS_NEWLINE (ch))
830 symver_state = NULL;
833 #endif /* TC_ARM && OBJ_ELF */
835 #ifdef TC_M68K
836 /* We want to have pseudo-ops which control whether we are in
837 MRI mode or not. Unfortunately, since m68k MRI mode affects
838 the scrubber, that means that we need a special purpose
839 recognizer here. */
840 if (mri_state == NULL)
842 if ((state == 0 || state == 1)
843 && ch == mri_pseudo[0])
844 mri_state = mri_pseudo + 1;
846 else
848 /* We advance to the next state if we find the right
849 character, or if we need a space character and we get any
850 whitespace character, or if we need a '0' and we get a
851 '1' (this is so that we only need one state to handle
852 ``.mri 0'' and ``.mri 1''). */
853 if (ch != '\0'
854 && (*mri_state == ch
855 || (*mri_state == ' '
856 && lex[ch] == LEX_IS_WHITESPACE)
857 || (*mri_state == '0'
858 && ch == '1')))
860 mri_last_ch = ch;
861 ++mri_state;
863 else if (*mri_state != '\0'
864 || (lex[ch] != LEX_IS_WHITESPACE
865 && lex[ch] != LEX_IS_NEWLINE))
867 /* We did not get the expected character, or we didn't
868 get a valid terminating character after seeing the
869 entire pseudo-op, so we must go back to the
870 beginning. */
871 mri_state = NULL;
873 else
875 /* We've read the entire pseudo-op. mri_last_ch is
876 either '0' or '1' indicating whether to enter or
877 leave MRI mode. */
878 do_scrub_begin (mri_last_ch == '1');
879 mri_state = NULL;
881 /* We continue handling the character as usual. The
882 main gas reader must also handle the .mri pseudo-op
883 to control expression parsing and the like. */
886 #endif
888 if (ch == EOF)
890 if (state != 0)
892 as_warn (_("end of file not at end of a line; newline inserted"));
893 state = 0;
894 PUT ('\n');
896 goto fromeof;
899 switch (lex[ch])
901 case LEX_IS_WHITESPACE:
904 ch = GET ();
906 while (ch != EOF && IS_WHITESPACE (ch));
907 if (ch == EOF)
908 goto fromeof;
910 if (state == 0)
912 /* Preserve a single whitespace character at the
913 beginning of a line. */
914 state = 1;
915 UNGET (ch);
916 PUT (' ');
917 break;
920 #ifdef KEEP_WHITE_AROUND_COLON
921 if (lex[ch] == LEX_IS_COLON)
923 /* Only keep this white if there's no white *after* the
924 colon. */
925 ch2 = GET ();
926 if (ch2 != EOF)
927 UNGET (ch2);
928 if (!IS_WHITESPACE (ch2))
930 state = 9;
931 UNGET (ch);
932 PUT (' ');
933 break;
936 #endif
937 if (IS_COMMENT (ch)
938 || IS_LINE_SEPARATOR (ch)
939 || IS_PARALLEL_SEPARATOR (ch))
941 if (scrub_m68k_mri)
943 /* In MRI mode, we keep these spaces. */
944 UNGET (ch);
945 PUT (' ');
946 break;
948 goto recycle;
951 /* If we're in state 2 or 11, we've seen a non-white
952 character followed by whitespace. If the next character
953 is ':', this is whitespace after a label name which we
954 normally must ignore. In MRI mode, though, spaces are
955 not permitted between the label and the colon. */
956 if ((state == 2 || state == 11)
957 && lex[ch] == LEX_IS_COLON
958 && ! scrub_m68k_mri)
960 state = 1;
961 PUT (ch);
962 break;
965 switch (state)
967 case 1:
968 /* We can arrive here if we leave a leading whitespace
969 character at the beginning of a line. */
970 goto recycle;
971 case 2:
972 state = 3;
973 if (to + 1 < toend)
975 /* Optimize common case by skipping UNGET/GET. */
976 PUT (' '); /* Sp after opco */
977 goto recycle;
979 UNGET (ch);
980 PUT (' ');
981 break;
982 case 3:
983 #ifndef TC_KEEP_OPERAND_SPACES
984 /* For TI C6X, we keep these spaces as they may separate
985 functional unit specifiers from operands. */
986 if (scrub_m68k_mri)
987 #endif
989 /* In MRI mode, we keep these spaces. */
990 UNGET (ch);
991 PUT (' ');
992 break;
994 goto recycle; /* Sp in operands */
995 case 9:
996 case 10:
997 #ifndef TC_KEEP_OPERAND_SPACES
998 if (scrub_m68k_mri)
999 #endif
1001 /* In MRI mode, we keep these spaces. */
1002 state = 3;
1003 UNGET (ch);
1004 PUT (' ');
1005 break;
1007 state = 10; /* Sp after symbol char */
1008 goto recycle;
1009 case 11:
1010 if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
1011 state = 1;
1012 else
1014 /* We know that ch is not ':', since we tested that
1015 case above. Therefore this is not a label, so it
1016 must be the opcode, and we've just seen the
1017 whitespace after it. */
1018 state = 3;
1020 UNGET (ch);
1021 PUT (' '); /* Sp after label definition. */
1022 break;
1023 default:
1024 BAD_CASE (state);
1026 break;
1028 case LEX_IS_TWOCHAR_COMMENT_1ST:
1029 ch2 = GET ();
1030 if (ch2 == '*')
1032 for (;;)
1036 ch2 = GET ();
1037 if (ch2 != EOF && IS_NEWLINE (ch2))
1038 add_newlines++;
1040 while (ch2 != EOF && ch2 != '*');
1042 while (ch2 == '*')
1043 ch2 = GET ();
1045 if (ch2 == EOF || ch2 == '/')
1046 break;
1048 /* This UNGET will ensure that we count newlines
1049 correctly. */
1050 UNGET (ch2);
1053 if (ch2 == EOF)
1054 as_warn (_("end of file in multiline comment"));
1056 ch = ' ';
1057 goto recycle;
1059 #ifdef DOUBLESLASH_LINE_COMMENTS
1060 else if (ch2 == '/')
1064 ch = GET ();
1066 while (ch != EOF && !IS_NEWLINE (ch));
1067 if (ch == EOF)
1068 as_warn ("end of file in comment; newline inserted");
1069 state = 0;
1070 PUT ('\n');
1071 break;
1073 #endif
1074 else
1076 if (ch2 != EOF)
1077 UNGET (ch2);
1078 if (state == 9 || state == 10)
1079 state = 3;
1080 PUT (ch);
1082 break;
1084 case LEX_IS_STRINGQUOTE:
1085 quotechar = ch;
1086 if (state == 10)
1088 /* Preserve the whitespace in foo "bar". */
1089 UNGET (ch);
1090 state = 3;
1091 PUT (' ');
1093 /* PUT didn't jump out. We could just break, but we
1094 know what will happen, so optimize a bit. */
1095 ch = GET ();
1096 old_state = 9;
1098 else if (state == 3)
1099 old_state = 9;
1100 else
1101 old_state = state;
1102 state = 5;
1103 PUT (ch);
1104 break;
1106 case LEX_IS_ONECHAR_QUOTE:
1107 #ifdef H_TICK_HEX
1108 if (state == 9 && enable_h_tick_hex)
1110 char c;
1112 c = GET ();
1113 as_warn ("'%c found after symbol", c);
1114 UNGET (c);
1116 #endif
1117 if (state == 10)
1119 /* Preserve the whitespace in foo 'b'. */
1120 UNGET (ch);
1121 state = 3;
1122 PUT (' ');
1123 break;
1125 ch = GET ();
1126 if (ch == EOF)
1128 as_warn (_("end of file after a one-character quote; \\0 inserted"));
1129 ch = 0;
1131 if (ch == '\\')
1133 ch = GET ();
1134 if (ch == EOF)
1136 as_warn (_("end of file in escape character"));
1137 ch = '\\';
1139 else
1140 ch = process_escape (ch);
1142 sprintf (out_buf, "%d", (int) (unsigned char) ch);
1144 /* None of these 'x constants for us. We want 'x'. */
1145 if ((ch = GET ()) != '\'')
1147 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1148 as_warn (_("missing close quote; (assumed)"));
1149 #else
1150 if (ch != EOF)
1151 UNGET (ch);
1152 #endif
1154 if (strlen (out_buf) == 1)
1156 PUT (out_buf[0]);
1157 break;
1159 if (state == 9)
1160 old_state = 3;
1161 else
1162 old_state = state;
1163 state = -1;
1164 out_string = out_buf;
1165 PUT (*out_string++);
1166 break;
1168 case LEX_IS_COLON:
1169 #ifdef KEEP_WHITE_AROUND_COLON
1170 state = 9;
1171 #else
1172 if (state == 9 || state == 10)
1173 state = 3;
1174 else if (state != 3)
1175 state = 1;
1176 #endif
1177 PUT (ch);
1178 break;
1180 case LEX_IS_NEWLINE:
1181 /* Roll out a bunch of newlines from inside comments, etc. */
1182 if (add_newlines)
1184 --add_newlines;
1185 UNGET (ch);
1187 /* Fall through. */
1189 case LEX_IS_LINE_SEPARATOR:
1190 state = 0;
1191 PUT (ch);
1192 break;
1194 case LEX_IS_PARALLEL_SEPARATOR:
1195 state = 1;
1196 PUT (ch);
1197 break;
1199 #ifdef TC_V850
1200 case LEX_IS_DOUBLEDASH_1ST:
1201 ch2 = GET ();
1202 if (ch2 != '-')
1204 if (ch2 != EOF)
1205 UNGET (ch2);
1206 goto de_fault;
1208 /* Read and skip to end of line. */
1211 ch = GET ();
1213 while (ch != EOF && ch != '\n');
1215 if (ch == EOF)
1216 as_warn (_("end of file in comment; newline inserted"));
1218 state = 0;
1219 PUT ('\n');
1220 break;
1221 #endif
1222 #ifdef DOUBLEBAR_PARALLEL
1223 case LEX_IS_DOUBLEBAR_1ST:
1224 ch2 = GET ();
1225 if (ch2 != EOF)
1226 UNGET (ch2);
1227 if (ch2 != '|')
1228 goto de_fault;
1230 /* Handle '||' in two states as invoking PUT twice might
1231 result in the first one jumping out of this loop. We'd
1232 then lose track of the state and one '|' char. */
1233 state = 13;
1234 PUT ('|');
1235 break;
1236 #endif
1237 case LEX_IS_LINE_COMMENT_START:
1238 /* FIXME-someday: The two character comment stuff was badly
1239 thought out. On i386, we want '/' as line comment start
1240 AND we want C style comments. hence this hack. The
1241 whole lexical process should be reworked. xoxorich. */
1242 if (ch == '/')
1244 ch2 = GET ();
1245 if (ch2 == '*')
1247 old_state = 3;
1248 state = -2;
1249 break;
1251 else if (ch2 != EOF)
1253 UNGET (ch2);
1257 if (state == 0 || state == 1) /* Only comment at start of line. */
1259 int startch;
1261 startch = ch;
1265 ch = GET ();
1267 while (ch != EOF && IS_WHITESPACE (ch));
1269 if (ch == EOF)
1271 as_warn (_("end of file in comment; newline inserted"));
1272 PUT ('\n');
1273 break;
1276 if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1278 /* Not a cpp line. */
1279 while (ch != EOF && !IS_NEWLINE (ch))
1280 ch = GET ();
1281 if (ch == EOF)
1283 as_warn (_("end of file in comment; newline inserted"));
1284 PUT ('\n');
1286 else /* IS_NEWLINE (ch) */
1288 /* To process non-zero add_newlines. */
1289 UNGET (ch);
1291 state = 0;
1292 break;
1294 /* Looks like `# 123 "filename"' from cpp. */
1295 UNGET (ch);
1296 old_state = 4;
1297 state = -1;
1298 if (scrub_m68k_mri)
1299 out_string = "\tlinefile ";
1300 else
1301 out_string = "\t.linefile ";
1302 PUT (*out_string++);
1303 break;
1306 #ifdef TC_D10V
1307 /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1308 Trap is the only short insn that has a first operand that is
1309 neither register nor label.
1310 We must prevent exef0f ||trap #1 to degenerate to exef0f ||trap#1 .
1311 We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1312 already LEX_IS_LINE_COMMENT_START. However, it is the
1313 only character in line_comment_chars for d10v, hence we
1314 can recognize it as such. */
1315 /* An alternative approach would be to reset the state to 1 when
1316 we see '||', '<'- or '->', but that seems to be overkill. */
1317 if (state == 10)
1318 PUT (' ');
1319 #endif
1320 /* We have a line comment character which is not at the
1321 start of a line. If this is also a normal comment
1322 character, fall through. Otherwise treat it as a default
1323 character. */
1324 if (strchr (tc_comment_chars, ch) == NULL
1325 && (! scrub_m68k_mri
1326 || (ch != '!' && ch != '*')))
1327 goto de_fault;
1328 if (scrub_m68k_mri
1329 && (ch == '!' || ch == '*' || ch == '#')
1330 && state != 1
1331 && state != 10)
1332 goto de_fault;
1333 /* Fall through. */
1334 case LEX_IS_COMMENT_START:
1335 #if defined TC_ARM && defined OBJ_ELF
1336 /* On the ARM, `@' is the comment character.
1337 Unfortunately this is also a special character in ELF .symver
1338 directives (and .type, though we deal with those another way).
1339 So we check if this line is such a directive, and treat
1340 the character as default if so. This is a hack. */
1341 if ((symver_state != NULL) && (*symver_state == 0))
1342 goto de_fault;
1343 #endif
1345 /* Care is needed not to damage occurrences of \<comment-char>
1346 by stripping the <comment-char> onwards. Yuck. */
1347 if ((to > tostart ? to[-1] : last_char) == '\\')
1348 /* Do not treat the <comment-char> as a start-of-comment. */
1349 goto de_fault;
1351 #ifdef WARN_COMMENTS
1352 if (!found_comment)
1353 found_comment_file = as_where (&found_comment);
1354 #endif
1357 ch = GET ();
1359 while (ch != EOF && !IS_NEWLINE (ch));
1360 if (ch == EOF)
1361 as_warn (_("end of file in comment; newline inserted"));
1362 state = 0;
1363 PUT ('\n');
1364 break;
1366 #ifdef H_TICK_HEX
1367 case LEX_IS_H:
1368 /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1369 the H' with 0x to make them gas-style hex characters. */
1370 if (enable_h_tick_hex)
1372 char quot;
1374 quot = GET ();
1375 if (quot == '\'')
1377 UNGET ('x');
1378 ch = '0';
1380 else
1381 UNGET (quot);
1383 #endif
1384 /* Fall through. */
1386 case LEX_IS_SYMBOL_COMPONENT:
1387 if (state == 10)
1389 /* This is a symbol character following another symbol
1390 character, with whitespace in between. We skipped
1391 the whitespace earlier, so output it now. */
1392 UNGET (ch);
1393 state = 3;
1394 PUT (' ');
1395 break;
1398 #ifdef TC_Z80
1399 /* "af'" is a symbol containing '\''. */
1400 if (state == 3 && (ch == 'a' || ch == 'A'))
1402 state = 16;
1403 PUT (ch);
1404 ch = GET ();
1405 if (ch == 'f' || ch == 'F')
1407 state = 17;
1408 PUT (ch);
1409 break;
1411 else
1413 state = 9;
1414 if (ch == EOF || !IS_SYMBOL_COMPONENT (ch))
1416 if (ch != EOF)
1417 UNGET (ch);
1418 break;
1422 #endif
1423 if (state == 3)
1424 state = 9;
1426 /* This is a common case. Quickly copy CH and all the
1427 following symbol component or normal characters. */
1428 if (to + 1 < toend
1429 && mri_state == NULL
1430 #if defined TC_ARM && defined OBJ_ELF
1431 && symver_state == NULL
1432 #endif
1435 char *s;
1436 ptrdiff_t len;
1438 for (s = from; s < fromend; s++)
1440 int type;
1442 ch2 = *(unsigned char *) s;
1443 type = lex[ch2];
1444 if (type != 0
1445 && type != LEX_IS_SYMBOL_COMPONENT)
1446 break;
1449 if (s > from)
1450 /* Handle the last character normally, for
1451 simplicity. */
1452 --s;
1454 len = s - from;
1456 if (len > (toend - to) - 1)
1457 len = (toend - to) - 1;
1459 if (len > 0)
1461 PUT (ch);
1462 memcpy (to, from, len);
1463 to += len;
1464 from += len;
1465 if (to >= toend)
1466 goto tofull;
1467 ch = GET ();
1471 /* Fall through. */
1472 default:
1473 de_fault:
1474 /* Some relatively `normal' character. */
1475 if (state == 0)
1477 state = 11; /* Now seeing label definition. */
1479 else if (state == 1)
1481 state = 2; /* Ditto. */
1483 else if (state == 9)
1485 if (!IS_SYMBOL_COMPONENT (ch))
1486 state = 3;
1488 else if (state == 10)
1490 if (ch == '\\')
1492 /* Special handling for backslash: a backslash may
1493 be the beginning of a formal parameter (of a
1494 macro) following another symbol character, with
1495 whitespace in between. If that is the case, we
1496 output a space before the parameter. Strictly
1497 speaking, correct handling depends upon what the
1498 macro parameter expands into; if the parameter
1499 expands into something which does not start with
1500 an operand character, then we don't want to keep
1501 the space. We don't have enough information to
1502 make the right choice, so here we are making the
1503 choice which is more likely to be correct. */
1504 if (to + 1 >= toend)
1506 /* If we're near the end of the buffer, save the
1507 character for the next time round. Otherwise
1508 we'll lose our state. */
1509 UNGET (ch);
1510 goto tofull;
1512 *to++ = ' ';
1515 state = 3;
1517 PUT (ch);
1518 break;
1522 /*NOTREACHED*/
1524 fromeof:
1525 /* We have reached the end of the input. */
1526 if (to > tostart)
1527 last_char = to[-1];
1528 return to - tostart;
1530 tofull:
1531 /* The output buffer is full. Save any input we have not yet
1532 processed. */
1533 if (fromend > from)
1535 saved_input = from;
1536 saved_input_len = fromend - from;
1538 else
1539 saved_input = NULL;
1541 if (to > tostart)
1542 last_char = to[-1];
1543 return to - tostart;
1546 /* Return amount of pending input. */
1548 size_t
1549 do_scrub_pending (void)
1551 size_t len = 0;
1552 if (saved_input)
1553 len += saved_input_len;
1554 if (state == -1)
1555 len += strlen (out_string);
1556 return len;