gas/app.c

   1 /* This is the Assembler Pre-Processor
   2    Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
   3    1999, 2000, 2002
   4    Free Software Foundation, Inc.
   5
   6    This file is part of GAS, the GNU Assembler.
   7
   8    GAS is free software; you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation; either version 2, or (at your option)
  11    any later version.
  12
  13    GAS is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with GAS; see the file COPYING.  If not, write to the Free
  20    Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  21    02111-1307, USA.  */
  22
  23 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90 */
  24 /* App, the assembler pre-processor.  This pre-processor strips out excess
  25    spaces, turns single-quoted characters into a decimal constant, and turns
  26    # <number> <filename> <garbage> into a .line <number>\n.file <filename>
  27    pair.  This needs better error-handling.  */
  28
  29 #include <stdio.h>
  30 #include "as.h"                 /* For BAD_CASE() only */
  31
  32 #if (__STDC__ != 1)
  33 #ifndef const
  34 #define const  /* empty */
  35 #endif
  36 #endif
  37
  38 #ifdef TC_M68K
  39 /* Whether we are scrubbing in m68k MRI mode.  This is different from
  40    flag_m68k_mri, because the two flags will be affected by the .mri
  41    pseudo-op at different times.  */
  42 static int scrub_m68k_mri;
  43
  44 /* The pseudo-op which switches in and out of MRI mode.  See the
  45    comment in do_scrub_chars.  */
  46 static const char mri_pseudo[] = ".mri 0";
  47 #else
  48 #define scrub_m68k_mri 0
  49 #endif
  50
  51 #if defined TC_ARM && defined OBJ_ELF
  52 /* The pseudo-op for which we need to special-case `@' characters.
  53    See the comment in do_scrub_chars.  */
  54 static const char   symver_pseudo[] = ".symver";
  55 static const char * symver_state;
  56 #endif
  57
  58 static char lex[256];
  59 static const char symbol_chars[] =
  60 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
  61
  62 #define LEX_IS_SYMBOL_COMPONENT         1
  63 #define LEX_IS_WHITESPACE               2
  64 #define LEX_IS_LINE_SEPARATOR           3
  65 #define LEX_IS_COMMENT_START            4
  66 #define LEX_IS_LINE_COMMENT_START       5
  67 #define LEX_IS_TWOCHAR_COMMENT_1ST      6
  68 #define LEX_IS_STRINGQUOTE              8
  69 #define LEX_IS_COLON                    9
  70 #define LEX_IS_NEWLINE                  10
  71 #define LEX_IS_ONECHAR_QUOTE            11
  72 #ifdef TC_V850
  73 #define LEX_IS_DOUBLEDASH_1ST           12
  74 #endif
  75 #ifdef TC_M32R
  76 #define DOUBLEBAR_PARALLEL
  77 #endif
  78 #ifdef DOUBLEBAR_PARALLEL
  79 #define LEX_IS_DOUBLEBAR_1ST            13
  80 #endif
  81 #define LEX_IS_PARALLEL_SEPARATOR       14
  82 #define IS_SYMBOL_COMPONENT(c)          (lex[c] == LEX_IS_SYMBOL_COMPONENT)
  83 #define IS_WHITESPACE(c)                (lex[c] == LEX_IS_WHITESPACE)
  84 #define IS_LINE_SEPARATOR(c)            (lex[c] == LEX_IS_LINE_SEPARATOR)
  85 #define IS_PARALLEL_SEPARATOR(c)        (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
  86 #define IS_COMMENT(c)                   (lex[c] == LEX_IS_COMMENT_START)
  87 #define IS_LINE_COMMENT(c)              (lex[c] == LEX_IS_LINE_COMMENT_START)
  88 #define IS_NEWLINE(c)                   (lex[c] == LEX_IS_NEWLINE)
  89
  90 static int process_escape PARAMS ((int));
  91
  92 /* FIXME-soon: The entire lexer/parser thingy should be
  93    built statically at compile time rather than dynamically
  94    each and every time the assembler is run.  xoxorich.  */
  95
  96 void
  97 do_scrub_begin (m68k_mri)
  98      int m68k_mri ATTRIBUTE_UNUSED;
  99 {
 100   const char *p;
 101   int c;
 102
 103   lex[' '] = LEX_IS_WHITESPACE;
 104   lex['\t'] = LEX_IS_WHITESPACE;
 105   lex['\r'] = LEX_IS_WHITESPACE;
 106   lex['\n'] = LEX_IS_NEWLINE;
 107   lex[':'] = LEX_IS_COLON;
 108
 109 #ifdef TC_M68K
 110   scrub_m68k_mri = m68k_mri;
 111
 112   if (! m68k_mri)
 113 #endif
 114     {
 115       lex['"'] = LEX_IS_STRINGQUOTE;
 116
 117 #if ! defined (TC_HPPA) && ! defined (TC_I370)
 118       /* I370 uses single-quotes to delimit integer, float constants */
 119       lex['\''] = LEX_IS_ONECHAR_QUOTE;
 120 #endif
 121
 122 #ifdef SINGLE_QUOTE_STRINGS
 123       lex['\''] = LEX_IS_STRINGQUOTE;
 124 #endif
 125     }
 126
 127   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
 128      in state 5 of do_scrub_chars must be changed.  */
 129
 130   /* Note that these override the previous defaults, e.g. if ';' is a
 131      comment char, then it isn't a line separator.  */
 132   for (p = symbol_chars; *p; ++p)
 133     {
 134       lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 135     }                           /* declare symbol characters */
 136
 137   for (c = 128; c < 256; ++c)
 138     lex[c] = LEX_IS_SYMBOL_COMPONENT;
 139
 140 #ifdef tc_symbol_chars
 141   /* This macro permits the processor to specify all characters which
 142      may appears in an operand.  This will prevent the scrubber from
 143      discarding meaningful whitespace in certain cases.  The i386
 144      backend uses this to support prefixes, which can confuse the
 145      scrubber as to whether it is parsing operands or opcodes.  */
 146   for (p = tc_symbol_chars; *p; ++p)
 147     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 148 #endif
 149
 150   /* The m68k backend wants to be able to change comment_chars.  */
 151 #ifndef tc_comment_chars
 152 #define tc_comment_chars comment_chars
 153 #endif
 154   for (p = tc_comment_chars; *p; p++)
 155     {
 156       lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
 157     }                           /* declare comment chars */
 158
 159   for (p = line_comment_chars; *p; p++)
 160     {
 161       lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
 162     }                           /* declare line comment chars */
 163
 164   for (p = line_separator_chars; *p; p++)
 165     {
 166       lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
 167     }                           /* declare line separators */
 168
 169 #ifdef tc_parallel_separator_chars
 170   /* This macro permits the processor to specify all characters which
 171      separate parallel insns on the same line.  */
 172   for (p = tc_parallel_separator_chars; *p; p++)
 173     {
 174       lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
 175     }                           /* declare parallel separators */
 176 #endif
 177
 178   /* Only allow slash-star comments if slash is not in use.
 179      FIXME: This isn't right.  We should always permit them.  */
 180   if (lex['/'] == 0)
 181     {
 182       lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
 183     }
 184
 185 #ifdef TC_M68K
 186   if (m68k_mri)
 187     {
 188       lex['\''] = LEX_IS_STRINGQUOTE;
 189       lex[';'] = LEX_IS_COMMENT_START;
 190       lex['*'] = LEX_IS_LINE_COMMENT_START;
 191       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
 192          then it can't be used in an expression.  */
 193       lex['!'] = LEX_IS_LINE_COMMENT_START;
 194     }
 195 #endif
 196
 197 #ifdef TC_V850
 198   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
 199 #endif
 200 #ifdef DOUBLEBAR_PARALLEL
 201   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
 202 #endif
 203 #ifdef TC_D30V
 204   /* must do this is we want VLIW instruction with "->" or "<-" */
 205   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
 206 #endif
 207 }                               /* do_scrub_begin() */
 208
 209 /* Saved state of the scrubber */
 210 static int state;
 211 static int old_state;
 212 static char *out_string;
 213 static char out_buf[20];
 214 static int add_newlines;
 215 static char *saved_input;
 216 static int saved_input_len;
 217 static char input_buffer[32 * 1024];
 218 static const char *mri_state;
 219 static char mri_last_ch;
 220
 221 /* Data structure for saving the state of app across #include's.  Note that
 222    app is called asynchronously to the parsing of the .include's, so our
 223    state at the time .include is interpreted is completely unrelated.
 224    That's why we have to save it all.  */
 225
 226 struct app_save {
 227   int          state;
 228   int          old_state;
 229   char *       out_string;
 230   char         out_buf[sizeof (out_buf)];
 231   int          add_newlines;
 232   char *       saved_input;
 233   int          saved_input_len;
 234 #ifdef TC_M68K
 235   int          scrub_m68k_mri;
 236 #endif
 237   const char * mri_state;
 238   char         mri_last_ch;
 239 #if defined TC_ARM && defined OBJ_ELF
 240   const char * symver_state;
 241 #endif
 242 };
 243
 244 char *
 245 app_push ()
 246 {
 247   register struct app_save *saved;
 248
 249   saved = (struct app_save *) xmalloc (sizeof (*saved));
 250   saved->state = state;
 251   saved->old_state = old_state;
 252   saved->out_string = out_string;
 253   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
 254   saved->add_newlines = add_newlines;
 255   if (saved_input == NULL)
 256     saved->saved_input = NULL;
 257   else
 258     {
 259       saved->saved_input = xmalloc (saved_input_len);
 260       memcpy (saved->saved_input, saved_input, saved_input_len);
 261       saved->saved_input_len = saved_input_len;
 262     }
 263 #ifdef TC_M68K
 264   saved->scrub_m68k_mri = scrub_m68k_mri;
 265 #endif
 266   saved->mri_state = mri_state;
 267   saved->mri_last_ch = mri_last_ch;
 268 #if defined TC_ARM && defined OBJ_ELF
 269   saved->symver_state = symver_state;
 270 #endif
 271
 272   /* do_scrub_begin() is not useful, just wastes time.  */
 273
 274   state = 0;
 275   saved_input = NULL;
 276
 277   return (char *) saved;
 278 }
 279
 280 void
 281 app_pop (arg)
 282      char *arg;
 283 {
 284   register struct app_save *saved = (struct app_save *) arg;
 285
 286   /* There is no do_scrub_end ().  */
 287   state = saved->state;
 288   old_state = saved->old_state;
 289   out_string = saved->out_string;
 290   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
 291   add_newlines = saved->add_newlines;
 292   if (saved->saved_input == NULL)
 293     saved_input = NULL;
 294   else
 295     {
 296       assert (saved->saved_input_len <= (int) (sizeof input_buffer));
 297       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
 298       saved_input = input_buffer;
 299       saved_input_len = saved->saved_input_len;
 300       free (saved->saved_input);
 301     }
 302 #ifdef TC_M68K
 303   scrub_m68k_mri = saved->scrub_m68k_mri;
 304 #endif
 305   mri_state = saved->mri_state;
 306   mri_last_ch = saved->mri_last_ch;
 307 #if defined TC_ARM && defined OBJ_ELF
 308   symver_state = saved->symver_state;
 309 #endif
 310
 311   free (arg);
 312 }                               /* app_pop() */
 313
 314 /* @@ This assumes that \n &c are the same on host and target.  This is not
 315    necessarily true.  */
 316 static int
 317 process_escape (ch)
 318      int ch;
 319 {
 320   switch (ch)
 321     {
 322     case 'b':
 323       return '\b';
 324     case 'f':
 325       return '\f';
 326     case 'n':
 327       return '\n';
 328     case 'r':
 329       return '\r';
 330     case 't':
 331       return '\t';
 332     case '\'':
 333       return '\'';
 334     case '"':
 335       return '\"';
 336     default:
 337       return ch;
 338     }
 339 }
 340
 341 /* This function is called to process input characters.  The GET
 342    parameter is used to retrieve more input characters.  GET should
 343    set its parameter to point to a buffer, and return the length of
 344    the buffer; it should return 0 at end of file.  The scrubbed output
 345    characters are put into the buffer starting at TOSTART; the TOSTART
 346    buffer is TOLEN bytes in length.  The function returns the number
 347    of scrubbed characters put into TOSTART.  This will be TOLEN unless
 348    end of file was seen.  This function is arranged as a state
 349    machine, and saves its state so that it may return at any point.
 350    This is the way the old code used to work.  */
 351
 352 int
 353 do_scrub_chars (get, tostart, tolen)
 354      int (*get) PARAMS ((char *, int));
 355      char *tostart;
 356      int tolen;
 357 {
 358   char *to = tostart;
 359   char *toend = tostart + tolen;
 360   char *from;
 361   char *fromend;
 362   int fromlen;
 363   register int ch, ch2 = 0;
 364
 365   /*State 0: beginning of normal line
 366           1: After first whitespace on line (flush more white)
 367           2: After first non-white (opcode) on line (keep 1white)
 368           3: after second white on line (into operands) (flush white)
 369           4: after putting out a .line, put out digits
 370           5: parsing a string, then go to old-state
 371           6: putting out \ escape in a "d string.
 372           7: After putting out a .appfile, put out string.
 373           8: After putting out a .appfile string, flush until newline.
 374           9: After seeing symbol char in state 3 (keep 1white after symchar)
 375          10: After seeing whitespace in state 9 (keep white before symchar)
 376          11: After seeing a symbol character in state 0 (eg a label definition)
 377          -1: output string in out_string and go to the state in old_state
 378          -2: flush text until a '*' '/' is seen, then go to state old_state
 379 #ifdef TC_V850
 380          12: After seeing a dash, looking for a second dash as a start of comment.
 381 #endif
 382 #ifdef DOUBLEBAR_PARALLEL
  383          13: After seeing a vertical bar, looking for a second vertical bar as a parallel expression separator.
 384 #endif
 385           */
 386
 387   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
 388      constructs like ``.loc 1 20''.  This was turning into ``.loc
 389      120''.  States 9 and 10 ensure that a space is never dropped in
 390      between characters which could appear in an identifier.  Ian
 391      Taylor, ian@cygnus.com.
 392
 393      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
 394      correctly on the PA (and any other target where colons are optional).
 395      Jeff Law, law@cs.utah.edu.
 396
 397      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
 398      get squashed into "cmp r1,r2||trap#1", with the all important space
 399      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
 400
 401   /* This macro gets the next input character.  */
 402
 403 #define GET()                                                   \
 404   (from < fromend                                               \
 405    ? * (unsigned char *) (from++)                               \
 406    : (saved_input = NULL,                                       \
 407       fromlen = (*get) (input_buffer, sizeof input_buffer),     \
 408       from = input_buffer,                                      \
 409       fromend = from + fromlen,                                 \
 410       (fromlen == 0                                             \
 411        ? EOF                                                    \
 412        : * (unsigned char *) (from++))))
 413
 414   /* This macro pushes a character back on the input stream.  */
 415
 416 #define UNGET(uch) (*--from = (uch))
 417
 418   /* This macro puts a character into the output buffer.  If this
 419      character fills the output buffer, this macro jumps to the label
 420      TOFULL.  We use this rather ugly approach because we need to
 421      handle two different termination conditions: EOF on the input
 422      stream, and a full output buffer.  It would be simpler if we
 423      always read in the entire input stream before processing it, but
 424      I don't want to make such a significant change to the assembler's
 425      memory usage.  */
 426
 427 #define PUT(pch)                                \
 428   do                                            \
 429     {                                           \
 430       *to++ = (pch);                            \
 431       if (to >= toend)                          \
 432         goto tofull;                            \
 433     }                                           \
 434   while (0)
 435
 436   if (saved_input != NULL)
 437     {
 438       from = saved_input;
 439       fromend = from + saved_input_len;
 440     }
 441   else
 442     {
 443       fromlen = (*get) (input_buffer, sizeof input_buffer);
 444       if (fromlen == 0)
 445         return 0;
 446       from = input_buffer;
 447       fromend = from + fromlen;
 448     }
 449
 450   while (1)
 451     {
 452       /* The cases in this switch end with continue, in order to
 453          branch back to the top of this while loop and generate the
 454          next output character in the appropriate state.  */
 455       switch (state)
 456         {
 457         case -1:
 458           ch = *out_string++;
 459           if (*out_string == '\0')
 460             {
 461               state = old_state;
 462               old_state = 3;
 463             }
 464           PUT (ch);
 465           continue;
 466
 467         case -2:
 468           for (;;)
 469             {
 470               do
 471                 {
 472                   ch = GET ();
 473
 474                   if (ch == EOF)
 475                     {
 476                       as_warn (_("end of file in comment"));
 477                       goto fromeof;
 478                     }
 479
 480                   if (ch == '\n')
 481                     PUT ('\n');
 482                 }
 483               while (ch != '*');
 484
 485               while ((ch = GET ()) == '*')
 486                 ;
 487
 488               if (ch == EOF)
 489                 {
 490                   as_warn (_("end of file in comment"));
 491                   goto fromeof;
 492                 }
 493
 494               if (ch == '/')
 495                 break;
 496
 497               UNGET (ch);
 498             }
 499
 500           state = old_state;
 501           UNGET (' ');
 502           continue;
 503
 504         case 4:
 505           ch = GET ();
 506           if (ch == EOF)
 507             goto fromeof;
 508           else if (ch >= '0' && ch <= '9')
 509             PUT (ch);
 510           else
 511             {
 512               while (ch != EOF && IS_WHITESPACE (ch))
 513                 ch = GET ();
 514               if (ch == '"')
 515                 {
 516                   UNGET (ch);
 517                   if (scrub_m68k_mri)
 518                     out_string = "\n\tappfile ";
 519                   else
 520                     out_string = "\n\t.appfile ";
 521                   old_state = 7;
 522                   state = -1;
 523                   PUT (*out_string++);
 524                 }
 525               else
 526                 {
 527                   while (ch != EOF && ch != '\n')
 528                     ch = GET ();
 529                   state = 0;
 530                   PUT (ch);
 531                 }
 532             }
 533           continue;
 534
 535         case 5:
 536           /* We are going to copy everything up to a quote character,
 537              with special handling for a backslash.  We try to
 538              optimize the copying in the simple case without using the
 539              GET and PUT macros.  */
 540           {
 541             char *s;
 542             int len;
 543
 544             for (s = from; s < fromend; s++)
 545               {
 546                 ch = *s;
 547                 /* This condition must be changed if the type of any
 548                    other character can be LEX_IS_STRINGQUOTE.  */
 549                 if (ch == '\\'
 550                     || ch == '"'
 551                     || ch == '\''
 552                     || ch == '\n')
 553                   break;
 554               }
 555             len = s - from;
 556             if (len > toend - to)
 557               len = toend - to;
 558             if (len > 0)
 559               {
 560                 memcpy (to, from, len);
 561                 to += len;
 562                 from += len;
 563               }
 564           }
 565
 566           ch = GET ();
 567           if (ch == EOF)
 568             {
 569               as_warn (_("end of file in string; inserted '\"'"));
 570               state = old_state;
 571               UNGET ('\n');
 572               PUT ('"');
 573             }
 574           else if (lex[ch] == LEX_IS_STRINGQUOTE)
 575             {
 576               state = old_state;
 577               PUT (ch);
 578             }
 579 #ifndef NO_STRING_ESCAPES
 580           else if (ch == '\\')
 581             {
 582               state = 6;
 583               PUT (ch);
 584             }
 585 #endif
 586           else if (scrub_m68k_mri && ch == '\n')
 587             {
 588               /* Just quietly terminate the string.  This permits lines like
 589                    bne  label   loop if we haven't reach end yet
 590                  */
 591               state = old_state;
 592               UNGET (ch);
 593               PUT ('\'');
 594             }
 595           else
 596             {
 597               PUT (ch);
 598             }
 599           continue;
 600
 601         case 6:
 602           state = 5;
 603           ch = GET ();
 604           switch (ch)
 605             {
 606               /* Handle strings broken across lines, by turning '\n' into
 607                  '\\' and 'n'.  */
 608             case '\n':
 609               UNGET ('n');
 610               add_newlines++;
 611               PUT ('\\');
 612               continue;
 613
 614             case EOF:
 615               as_warn (_("end of file in string; '\"' inserted"));
 616               PUT ('"');
 617               continue;
 618
 619             case '"':
 620             case '\\':
 621             case 'b':
 622             case 'f':
 623             case 'n':
 624             case 'r':
 625             case 't':
 626             case 'v':
 627             case 'x':
 628             case 'X':
 629             case '0':
 630             case '1':
 631             case '2':
 632             case '3':
 633             case '4':
 634             case '5':
 635             case '6':
 636             case '7':
 637               break;
 638
 639             default:
 640 #ifdef ONLY_STANDARD_ESCAPES
 641               as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
 642 #endif
 643               break;
 644             }
 645           PUT (ch);
 646           continue;
 647
 648         case 7:
 649           ch = GET ();
 650           state = 5;
 651           old_state = 8;
 652           if (ch == EOF)
 653             goto fromeof;
 654           PUT (ch);
 655           continue;
 656
 657         case 8:
 658           do
 659             ch = GET ();
 660           while (ch != '\n' && ch != EOF);
 661           if (ch == EOF)
 662             goto fromeof;
 663           state = 0;
 664           PUT (ch);
 665           continue;
 666         }
 667
 668       /* OK, we are somewhere in states 0 through 4 or 9 through 11 */
 669
 670       /* flushchar: */
 671       ch = GET ();
 672
 673     recycle:
 674
 675 #if defined TC_ARM && defined OBJ_ELF
 676       /* We need to watch out for .symver directives.  See the comment later
 677          in this function.  */
 678       if (symver_state == NULL)
 679         {
 680           if ((state == 0 || state == 1) && ch == symver_pseudo[0])
 681             symver_state = symver_pseudo + 1;
 682         }
 683       else
 684         {
 685           /* We advance to the next state if we find the right
 686              character.  */
 687           if (ch != '\0' && (*symver_state == ch))
 688             ++symver_state;
 689           else if (*symver_state != '\0')
 690             /* We did not get the expected character, or we didn't
 691                get a valid terminating character after seeing the
 692                entire pseudo-op, so we must go back to the beginning.  */
 693             symver_state = NULL;
 694           else
 695             {
 696               /* We've read the entire pseudo-op.  If this is the end
 697                  of the line, go back to the beginning.  */
 698               if (IS_NEWLINE (ch))
 699                 symver_state = NULL;
 700             }
 701         }
 702 #endif /* TC_ARM && OBJ_ELF */
 703
 704 #ifdef TC_M68K
 705       /* We want to have pseudo-ops which control whether we are in
 706          MRI mode or not.  Unfortunately, since m68k MRI mode affects
 707          the scrubber, that means that we need a special purpose
 708          recognizer here.  */
 709       if (mri_state == NULL)
 710         {
 711           if ((state == 0 || state == 1)
 712               && ch == mri_pseudo[0])
 713             mri_state = mri_pseudo + 1;
 714         }
 715       else
 716         {
 717           /* We advance to the next state if we find the right
 718              character, or if we need a space character and we get any
 719              whitespace character, or if we need a '0' and we get a
 720              '1' (this is so that we only need one state to handle
 721              ``.mri 0'' and ``.mri 1'').  */
 722           if (ch != '\0'
 723               && (*mri_state == ch
 724                   || (*mri_state == ' '
 725                       && lex[ch] == LEX_IS_WHITESPACE)
 726                   || (*mri_state == '0'
 727                       && ch == '1')))
 728             {
 729               mri_last_ch = ch;
 730               ++mri_state;
 731             }
 732           else if (*mri_state != '\0'
 733                    || (lex[ch] != LEX_IS_WHITESPACE
 734                        && lex[ch] != LEX_IS_NEWLINE))
 735             {
 736               /* We did not get the expected character, or we didn't
 737                  get a valid terminating character after seeing the
 738                  entire pseudo-op, so we must go back to the
 739                  beginning.  */
 740               mri_state = NULL;
 741             }
 742           else
 743             {
  744               /* We've read the entire pseudo-op.  mri_last_ch is
  745                  either '1' or '0', indicating whether to enter or
  746                  leave MRI mode.  */
 747               do_scrub_begin (mri_last_ch == '1');
 748               mri_state = NULL;
 749
 750               /* We continue handling the character as usual.  The
 751                  main gas reader must also handle the .mri pseudo-op
 752                  to control expression parsing and the like.  */
 753             }
 754         }
 755 #endif
 756
 757       if (ch == EOF)
 758         {
 759           if (state != 0)
 760             {
 761               as_warn (_("end of file not at end of a line; newline inserted"));
 762               state = 0;
 763               PUT ('\n');
 764             }
 765           goto fromeof;
 766         }
 767
 768       switch (lex[ch])
 769         {
 770         case LEX_IS_WHITESPACE:
 771           do
 772             {
 773               ch = GET ();
 774             }
 775           while (ch != EOF && IS_WHITESPACE (ch));
 776           if (ch == EOF)
 777             goto fromeof;
 778
 779           if (state == 0)
 780             {
 781               /* Preserve a single whitespace character at the
 782                  beginning of a line.  */
 783               state = 1;
 784               UNGET (ch);
 785               PUT (' ');
 786               break;
 787             }
 788
 789 #ifdef KEEP_WHITE_AROUND_COLON
 790           if (lex[ch] == LEX_IS_COLON)
 791             {
 792               /* Only keep this white if there's no white *after* the
 793                  colon.  */
 794               ch2 = GET ();
 795               UNGET (ch2);
 796               if (!IS_WHITESPACE (ch2))
 797                 {
 798                   state = 9;
 799                   UNGET (ch);
 800                   PUT (' ');
 801                   break;
 802                 }
 803             }
 804 #endif
 805           if (IS_COMMENT (ch)
 806               || ch == '/'
 807               || IS_LINE_SEPARATOR (ch)
 808               || IS_PARALLEL_SEPARATOR (ch))
 809             {
 810               if (scrub_m68k_mri)
 811                 {
 812                   /* In MRI mode, we keep these spaces.  */
 813                   UNGET (ch);
 814                   PUT (' ');
 815                   break;
 816                 }
 817               goto recycle;
 818             }
 819
 820           /* If we're in state 2 or 11, we've seen a non-white
 821              character followed by whitespace.  If the next character
 822              is ':', this is whitespace after a label name which we
 823              normally must ignore.  In MRI mode, though, spaces are
 824              not permitted between the label and the colon.  */
 825           if ((state == 2 || state == 11)
 826               && lex[ch] == LEX_IS_COLON
 827               && ! scrub_m68k_mri)
 828             {
 829               state = 1;
 830               PUT (ch);
 831               break;
 832             }
 833
 834           switch (state)
 835             {
 836             case 0:
 837               state++;
 838               goto recycle;     /* Punted leading sp */
 839             case 1:
 840               /* We can arrive here if we leave a leading whitespace
 841                  character at the beginning of a line.  */
 842               goto recycle;
 843             case 2:
 844               state = 3;
 845               if (to + 1 < toend)
 846                 {
 847                   /* Optimize common case by skipping UNGET/GET.  */
 848                   PUT (' ');    /* Sp after opco */
 849                   goto recycle;
 850                 }
 851               UNGET (ch);
 852               PUT (' ');
 853               break;
 854             case 3:
 855               if (scrub_m68k_mri)
 856                 {
 857                   /* In MRI mode, we keep these spaces.  */
 858                   UNGET (ch);
 859                   PUT (' ');
 860                   break;
 861                 }
 862               goto recycle;     /* Sp in operands */
 863             case 9:
 864             case 10:
 865               if (scrub_m68k_mri)
 866                 {
 867                   /* In MRI mode, we keep these spaces.  */
 868                   state = 3;
 869                   UNGET (ch);
 870                   PUT (' ');
 871                   break;
 872                 }
 873               state = 10;       /* Sp after symbol char */
 874               goto recycle;
 875             case 11:
 876               if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
 877                 state = 1;
 878               else
 879                 {
 880                   /* We know that ch is not ':', since we tested that
 881                      case above.  Therefore this is not a label, so it
 882                      must be the opcode, and we've just seen the
 883                      whitespace after it.  */
 884                   state = 3;
 885                 }
 886               UNGET (ch);
 887               PUT (' ');        /* Sp after label definition.  */
 888               break;
 889             default:
 890               BAD_CASE (state);
 891             }
 892           break;
 893
 894         case LEX_IS_TWOCHAR_COMMENT_1ST:
 895           ch2 = GET ();
 896           if (ch2 == '*')
 897             {
 898               for (;;)
 899                 {
 900                   do
 901                     {
 902                       ch2 = GET ();
 903                       if (ch2 != EOF && IS_NEWLINE (ch2))
 904                         add_newlines++;
 905                     }
 906                   while (ch2 != EOF && ch2 != '*');
 907
 908                   while (ch2 == '*')
 909                     ch2 = GET ();
 910
 911                   if (ch2 == EOF || ch2 == '/')
 912                     break;
 913
 914                   /* This UNGET will ensure that we count newlines
 915                      correctly.  */
 916                   UNGET (ch2);
 917                 }
 918
 919               if (ch2 == EOF)
 920                 as_warn (_("end of file in multiline comment"));
 921
 922               ch = ' ';
 923               goto recycle;
 924             }
 925 #ifdef DOUBLESLASH_LINE_COMMENTS
 926           else if (ch2 == '/')
 927             {
 928               do
 929                 {
 930                   ch = GET ();
 931                 }
 932               while (ch != EOF && !IS_NEWLINE (ch));
 933               if (ch == EOF)
 934                 as_warn ("end of file in comment; newline inserted");
 935               state = 0;
 936               PUT ('\n');
 937               break;
 938             }
 939 #endif
 940           else
 941             {
 942               if (ch2 != EOF)
 943                 UNGET (ch2);
 944               if (state == 9 || state == 10)
 945                 state = 3;
 946               PUT (ch);
 947             }
 948           break;
 949
 950         case LEX_IS_STRINGQUOTE:
 951           if (state == 10)
 952             {
 953               /* Preserve the whitespace in foo "bar" */
 954               UNGET (ch);
 955               state = 3;
 956               PUT (' ');
 957
 958               /* PUT didn't jump out.  We could just break, but we
 959                  know what will happen, so optimize a bit.  */
 960               ch = GET ();
 961               old_state = 3;
 962             }
 963           else if (state == 9)
 964             old_state = 3;
 965           else
 966             old_state = state;
 967           state = 5;
 968           PUT (ch);
 969           break;
 970
 971 #ifndef IEEE_STYLE
 972         case LEX_IS_ONECHAR_QUOTE:
 973           if (state == 10)
 974             {
 975               /* Preserve the whitespace in foo 'b' */
 976               UNGET (ch);
 977               state = 3;
 978               PUT (' ');
 979               break;
 980             }
 981           ch = GET ();
 982           if (ch == EOF)
 983             {
 984               as_warn (_("end of file after a one-character quote; \\0 inserted"));
 985               ch = 0;
 986             }
 987           if (ch == '\\')
 988             {
 989               ch = GET ();
 990               if (ch == EOF)
 991                 {
 992                   as_warn (_("end of file in escape character"));
 993                   ch = '\\';
 994                 }
 995               else
 996                 ch = process_escape (ch);
 997             }
 998           sprintf (out_buf, "%d", (int) (unsigned char) ch);
 999
1000           /* None of these 'x constants for us.  We want 'x'.  */
1001           if ((ch = GET ()) != '\'')
1002             {
1003 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1004               as_warn (_("missing close quote; (assumed)"));
1005 #else
1006               if (ch != EOF)
1007                 UNGET (ch);
1008 #endif
1009             }
1010           if (strlen (out_buf) == 1)
1011             {
1012               PUT (out_buf[0]);
1013               break;
1014             }
1015           if (state == 9)
1016             old_state = 3;
1017           else
1018             old_state = state;
1019           state = -1;
1020           out_string = out_buf;
1021           PUT (*out_string++);
1022           break;
1023 #endif
1024
1025         case LEX_IS_COLON:
1026 #ifdef KEEP_WHITE_AROUND_COLON
1027           state = 9;
1028 #else
1029           if (state == 9 || state == 10)
1030             state = 3;
1031           else if (state != 3)
1032             state = 1;
1033 #endif
1034           PUT (ch);
1035           break;
1036
1037         case LEX_IS_NEWLINE:
1038           /* Roll out a bunch of newlines from inside comments, etc.  */
1039           if (add_newlines)
1040             {
1041               --add_newlines;
1042               UNGET (ch);
1043             }
1044           /* Fall through.  */
1045
1046         case LEX_IS_LINE_SEPARATOR:
1047           state = 0;
1048           PUT (ch);
1049           break;
1050
1051         case LEX_IS_PARALLEL_SEPARATOR:
1052           state = 1;
1053           PUT (ch);
1054           break;
1055
1056 #ifdef TC_V850
1057         case LEX_IS_DOUBLEDASH_1ST:
1058           ch2 = GET ();
1059           if (ch2 != '-')
1060             {
1061               UNGET (ch2);
1062               goto de_fault;
1063             }
1064           /* Read and skip to end of line.  */
1065           do
1066             {
1067               ch = GET ();
1068             }
1069           while (ch != EOF && ch != '\n');
1070           if (ch == EOF)
1071             {
1072               as_warn (_("end of file in comment; newline inserted"));
1073             }
1074           state = 0;
1075           PUT ('\n');
1076           break;
1077 #endif
1078 #ifdef DOUBLEBAR_PARALLEL
1079         case LEX_IS_DOUBLEBAR_1ST:
1080           ch2 = GET ();
1081           if (ch2 != '|')
1082             {
1083               UNGET (ch2);
1084               goto de_fault;
1085             }
1086           /* Reset back to state 1 and pretend that we are parsing a line from
1087              just after the first white space.  */
1088           state = 1;
1089           PUT ('|');
1090           PUT ('|');
1091           break;
1092 #endif
1093         case LEX_IS_LINE_COMMENT_START:
1094           /* FIXME-someday: The two character comment stuff was badly
1095              thought out.  On i386, we want '/' as line comment start
1096              AND we want C style comments.  hence this hack.  The
1097              whole lexical process should be reworked.  xoxorich.  */
1098           if (ch == '/')
1099             {
1100               ch2 = GET ();
1101               if (ch2 == '*')
1102                 {
1103                   old_state = 3;
1104                   state = -2;
1105                   break;
1106                 }
1107               else
1108                 {
1109                   UNGET (ch2);
1110                 }
1111             } /* bad hack */
1112
1113           if (state == 0 || state == 1) /* Only comment at start of line.  */
1114             {
1115               int startch;
1116
1117               startch = ch;
1118
1119               do
1120                 {
1121                   ch = GET ();
1122                 }
1123               while (ch != EOF && IS_WHITESPACE (ch));
1124               if (ch == EOF)
1125                 {
1126                   as_warn (_("end of file in comment; newline inserted"));
1127                   PUT ('\n');
1128                   break;
1129                 }
1130               if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1131                 {
1132                   /* Not a cpp line.  */
1133                   while (ch != EOF && !IS_NEWLINE (ch))
1134                     ch = GET ();
1135                   if (ch == EOF)
1136                     as_warn (_("end of file in comment; newline inserted"));
1137                   state = 0;
1138                   PUT ('\n');
1139                   break;
1140                 }
1141               /* Looks like `# 123 "filename"' from cpp.  */
1142               UNGET (ch);
1143               old_state = 4;
1144               state = -1;
1145               if (scrub_m68k_mri)
1146                 out_string = "\tappline ";
1147               else
1148                 out_string = "\t.appline ";
1149               PUT (*out_string++);
1150               break;
1151             }
1152
1153 #ifdef TC_D10V
1154           /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1155              Trap is the only short insn that has a first operand that is
1156              neither register nor label.
1157              We must prevent exef0f ||trap #1 from degenerating to exef0f ||trap#1.
1158              We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1159              already LEX_IS_LINE_COMMENT_START.  However, it is the
1160              only character in line_comment_chars for d10v, hence we
1161              can recognize it as such.  */
1162           /* An alternative approach would be to reset the state to 1 when
1163              we see '||', '<-' or '->', but that seems to be overkill.  */
1164           if (state == 10)
1165             PUT (' ');
1166 #endif
1167           /* We have a line comment character which is not at the
1168              start of a line.  If this is also a normal comment
1169              character, fall through.  Otherwise treat it as a default
1170              character.  */
1171           if (strchr (tc_comment_chars, ch) == NULL
1172               && (! scrub_m68k_mri
1173                   || (ch != '!' && ch != '*')))
1174             goto de_fault;
1175           if (scrub_m68k_mri
1176               && (ch == '!' || ch == '*' || ch == '#')
1177               && state != 1
1178               && state != 10)
1179             goto de_fault;
1180           /* Fall through.  */
1181         case LEX_IS_COMMENT_START:
1182 #if defined TC_ARM && defined OBJ_ELF
1183           /* On the ARM, `@' is the comment character.
1184              Unfortunately this is also a special character in ELF .symver
1185              directives (and .type, though we deal with those another way).
1186              So we check if this line is such a directive, and treat
1187              the character as default if so.  This is a hack.  */
1188           if ((symver_state != NULL) && (*symver_state == 0))
1189             goto de_fault;
1190 #endif
1191 #ifdef WARN_COMMENTS
1192           if (!found_comment)
1193             as_where (&found_comment_file, &found_comment);
1194 #endif
1195           do
1196             {
1197               ch = GET ();
1198             }
1199           while (ch != EOF && !IS_NEWLINE (ch));
1200           if (ch == EOF)
1201             as_warn (_("end of file in comment; newline inserted"));
1202           state = 0;
1203           PUT ('\n');
1204           break;
1205
1206         case LEX_IS_SYMBOL_COMPONENT:
1207           if (state == 10)
1208             {
1209               /* This is a symbol character following another symbol
1210                  character, with whitespace in between.  We skipped
1211                  the whitespace earlier, so output it now.  */
1212               UNGET (ch);
1213               state = 3;
1214               PUT (' ');
1215               break;
1216             }
1217
1218           if (state == 3)
1219             state = 9;
1220
1221           /* This is a common case.  Quickly copy CH and all the
1222              following symbol component or normal characters.  */
1223           if (to + 1 < toend
1224               && mri_state == NULL
1225 #if defined TC_ARM && defined OBJ_ELF
1226               && symver_state == NULL
1227 #endif
1228               )
1229             {
1230               char *s;
1231               int len;
1232
1233               for (s = from; s < fromend; s++)
1234                 {
1235                   int type;
1236
1237                   ch2 = *(unsigned char *) s;
1238                   type = lex[ch2];
1239                   if (type != 0
1240                       && type != LEX_IS_SYMBOL_COMPONENT)
1241                     break;
1242                 }
1243               if (s > from)
1244                 {
1245                   /* Handle the last character normally, for
1246                      simplicity.  */
1247                   --s;
1248                 }
1249               len = s - from;
1250               if (len > (toend - to) - 1)
1251                 len = (toend - to) - 1;
1252               if (len > 0)
1253                 {
1254                   PUT (ch);
1255                   if (len > 8)
1256                     {
1257                       memcpy (to, from, len);
1258                       to += len;
1259                       from += len;
1260                     }
1261                   else
1262                     {
1263                       switch (len)
1264                         {
1265                         case 8: *to++ = *from++;
1266                         case 7: *to++ = *from++;
1267                         case 6: *to++ = *from++;
1268                         case 5: *to++ = *from++;
1269                         case 4: *to++ = *from++;
1270                         case 3: *to++ = *from++;
1271                         case 2: *to++ = *from++;
1272                         case 1: *to++ = *from++;
1273                         }
1274                     }
1275                   ch = GET ();
1276                 }
1277             }
1278
1279           /* Fall through.  */
1280         default:
1281         de_fault:
1282           /* Some relatively `normal' character.  */
1283           if (state == 0)
1284             {
1285               state = 11;       /* Now seeing label definition */
1286             }
1287           else if (state == 1)
1288             {
1289               state = 2;        /* Ditto */
1290             }
1291           else if (state == 9)
1292             {
1293               if (lex[ch] != LEX_IS_SYMBOL_COMPONENT)
1294                 state = 3;
1295             }
1296           else if (state == 10)
1297             {
1298               if (ch == '\\')
1299                 {
1300                   /* Special handling for backslash: a backslash may
1301                      be the beginning of a formal parameter (of a
1302                      macro) following another symbol character, with
1303                      whitespace in between.  If that is the case, we
1304                      output a space before the parameter.  Strictly
1305                      speaking, correct handling depends upon what the
1306                      macro parameter expands into; if the parameter
1307                      expands into something which does not start with
1308                      an operand character, then we don't want to keep
1309                      the space.  We don't have enough information to
1310                      make the right choice, so here we are making the
1311                      choice which is more likely to be correct.  */
1312                   PUT (' ');
1313                 }
1314
1315               state = 3;
1316             }
1317           PUT (ch);
1318           break;
1319         }
1320     }
1321
1322   /*NOTREACHED*/
1323
1324  fromeof:
1325   /* We have reached the end of the input.  */
1326   return to - tostart;
1327
1328  tofull:
1329   /* The output buffer is full.  Save any input we have not yet
1330      processed.  */
1331   if (fromend > from)
1332     {
1333       saved_input = from;
1334       saved_input_len = fromend - from;
1335     }
1336   else
1337     saved_input = NULL;
1338
1339   return to - tostart;
1340 }
1341
1342 /* end of app.c */