gcc/java/lex.c

   1 /* Language lexer for the GNU compiler for the Java(TM) language.
   2    Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005
   3    Free Software Foundation, Inc.
   4    Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GCC is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING.  If not, write to
  20 the Free Software Foundation, 51 Franklin Street, Fifth Floor,
  21 Boston, MA 02110-1301, USA.
  22
  23 Java and all Java-based marks are trademarks or registered trademarks
  24 of Sun Microsystems, Inc. in the United States and other countries.
  25 The Free Software Foundation is independent of Sun Microsystems, Inc.  */
  26
/* This file defines java_lex (yylex), which reads a Java ASCII source
   file possibly containing Unicode escape sequences or UTF-8 encoded
   characters and returns a token for everything it finds except
   comments, white space and line terminators.  When necessary, it also
   fills in the java_lval (yylval) union.  It is implemented to be
   called by a re-entrant parser generated by Bison.

   The lexical analysis conforms to the Java grammar described in "The
   Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
   Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
  37
  38 #include "keyword.h"
  39 #include "flags.h"
  40 #include "chartables.h"
  41 #ifndef JC1_LITE
  42 #include "timevar.h"
  43 #endif
  44
  45 /* Function declarations.  */
  46 static char *java_sprint_unicode (int);
  47 static void java_unicode_2_utf8 (unicode_t);
  48 static void java_lex_error (const char *, int);
  49 #ifndef JC1_LITE
  50 static int do_java_lex (YYSTYPE *);
  51 static int java_lex (YYSTYPE *);
  52 static int java_is_eol (FILE *, int);
  53 static tree build_wfl_node (tree);
  54 #endif
  55 static int java_parse_escape_sequence (void);
  56 static int java_start_char_p (unicode_t);
  57 static int java_part_char_p (unicode_t);
  58 static int java_space_char_p (unicode_t);
  59 static void java_parse_doc_section (int);
  60 static void java_parse_end_comment (int);
  61 static int java_read_char (java_lexer *);
  62 static int java_get_unicode (void);
  63 static int java_peek_unicode (void);
  64 static void java_next_unicode (void);
  65 static int java_read_unicode (java_lexer *, int *);
  66 #ifndef JC1_LITE
  67 static int utf8_cmp (const unsigned char *, int, const char *);
  68 #endif
  69
  70 java_lexer *java_new_lexer (FILE *, const char *);
  71 #ifndef JC1_LITE
  72 static void error_if_numeric_overflow (tree);
  73 #endif
  74
  75 #ifdef HAVE_ICONV
  76 /* This is nonzero if we have initialized `need_byteswap'.  */
  77 static int byteswap_init = 0;
  78
  79 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
  80    big-endian order -- not native endian order.  We handle this by
  81    doing a conversion once at startup and seeing what happens.  This
  82    flag holds the results of this determination.  */
  83 static int need_byteswap = 0;
  84 #endif
  85
  86 void
  87 java_init_lex (FILE *finput, const char *encoding)
  88 {
  89 #ifndef JC1_LITE
  90   int java_lang_imported = 0;
  91
  92   if (!java_lang_id)
  93     java_lang_id = get_identifier ("java.lang");
  94   if (!inst_id)
  95     inst_id = get_identifier ("inst$");
  96   if (!wpv_id)
  97     wpv_id = get_identifier ("write_parm_value$");
  98
  99   if (!java_lang_imported)
 100     {
 101       tree node = build_tree_list (build_unknown_wfl (java_lang_id),
 102                                    NULL_TREE);
 103       read_import_dir (TREE_PURPOSE (node));
 104       TREE_CHAIN (node) = ctxp->import_demand_list;
 105       ctxp->import_demand_list = node;
 106       java_lang_imported = 1;
 107     }
 108
 109   if (!wfl_operator)
 110     {
 111 #ifndef JC1_LITE
 112 #ifdef USE_MAPPED_LOCATION
 113       wfl_operator = build_expr_wfl (NULL_TREE, input_location);
 114 #else
 115       wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
 116 #endif
 117 #endif
 118     }
 119   if (!label_id)
 120     label_id = get_identifier ("$L");
 121   if (!wfl_append)
 122     wfl_append = build_unknown_wfl (get_identifier ("append"));
 123   if (!wfl_string_buffer)
 124     wfl_string_buffer =
 125       build_unknown_wfl (get_identifier (flag_emit_class_files
 126                                       ? "java.lang.StringBuffer"
 127                                          : "gnu.gcj.runtime.StringBuffer"));
 128   if (!wfl_to_string)
 129     wfl_to_string = build_unknown_wfl (get_identifier ("toString"));
 130
 131   CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
 132     CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
 133
 134   memset (ctxp->modifier_ctx, 0, sizeof (ctxp->modifier_ctx));
 135   ctxp->current_parsed_class = NULL;
 136   ctxp->package = NULL_TREE;
 137 #endif
 138
 139 #ifndef JC1_LITE
 140   ctxp->save_location = input_location;
 141 #endif
 142   ctxp->java_error_flag = 0;
 143   ctxp->lexer = java_new_lexer (finput, encoding);
 144 }
 145
 146 static char *
 147 java_sprint_unicode (int c)
 148 {
 149   static char buffer [10];
 150   if (c < ' ' || c >= 127)
 151     sprintf (buffer, "\\u%04x", c);
 152   else
 153     {
 154       buffer [0] = c;
 155       buffer [1] = '\0';
 156     }
 157   return buffer;
 158 }
 159
 160 /* Create a new lexer object.  */
 161
 162 java_lexer *
 163 java_new_lexer (FILE *finput, const char *encoding)
 164 {
 165   java_lexer *lex = XNEW (java_lexer);
 166   int enc_error = 0;
 167
 168   lex->finput = finput;
 169   lex->bs_count = 0;
 170   lex->unget_value = 0;
 171   lex->next_unicode = 0;
 172   lex->avail_unicode = 0;
 173   lex->next_columns = 1;
 174   lex->encoding = encoding;
 175   lex->position.line = 1;
 176   lex->position.col = 1;
 177 #ifndef JC1_LITE
 178 #ifdef USE_MAPPED_LOCATION
 179       input_location
 180         = linemap_line_start (&line_table, 1, 120);
 181 #else
 182       input_line = 1;
 183 #endif
 184 #endif
 185
 186 #ifdef HAVE_ICONV
 187   lex->handle = iconv_open ("UCS-2", encoding);
 188   if (lex->handle != (iconv_t) -1)
 189     {
 190       lex->first = -1;
 191       lex->last = -1;
 192       lex->out_first = -1;
 193       lex->out_last = -1;
 194       lex->read_anything = 0;
 195       lex->use_fallback = 0;
 196
 197       /* Work around broken iconv() implementations by doing checking at
 198          runtime.  We assume that if the UTF-8 => UCS-2 encoder is broken,
 199          then all UCS-2 encoders will be broken.  Perhaps not a valid
 200          assumption.  */
 201       if (! byteswap_init)
 202         {
 203           iconv_t handle;
 204
 205           byteswap_init = 1;
 206
 207           handle = iconv_open ("UCS-2", "UTF-8");
 208           if (handle != (iconv_t) -1)
 209             {
 210               unicode_t result;
 211               unsigned char in[3];
 212               char *inp, *outp;
 213               size_t inc, outc, r;
 214
 215               /* This is the UTF-8 encoding of \ufeff.  */
 216               in[0] = 0xef;
 217               in[1] = 0xbb;
 218               in[2] = 0xbf;
 219
 220               inp = (char *) in;
 221               inc = 3;
 222               outp = (char *) &result;
 223               outc = 2;
 224
 225               r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
 226                          &outp, &outc);
 227               iconv_close (handle);
 228               /* Conversion must be complete for us to use the result.  */
 229               if (r != (size_t) -1 && inc == 0 && outc == 0)
 230                 need_byteswap = (result != 0xfeff);
 231             }
 232         }
 233
 234       lex->byte_swap = need_byteswap;
 235     }
 236   else
 237 #endif /* HAVE_ICONV */
 238     {
 239       /* If iconv failed, use the internal decoder if the default
 240          encoding was requested.  This code is used on platforms where
 241          iconv exists but is insufficient for our needs.  For
 242          instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2.
 243
 244          On Solaris the default encoding, as returned by nl_langinfo(),
 245          is `646' (aka ASCII), but the Solaris iconv_open() doesn't
 246          understand that.  We work around that by pretending
 247          `646' to be the same as UTF-8.   */
 248       if (strcmp (encoding, DEFAULT_ENCODING) && strcmp (encoding, "646"))
 249         enc_error = 1;
 250 #ifdef HAVE_ICONV
 251       else
 252         {
 253           lex->use_fallback = 1;
 254           lex->encoding = "UTF-8";
 255         }
 256 #endif /* HAVE_ICONV */
 257     }
 258
 259   if (enc_error)
 260     fatal_error ("unknown encoding: %qs\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation.  If you aren't trying\nto use a particular encoding for your input file, try the\n%<--encoding=UTF-8%> option", encoding);
 261
 262   return lex;
 263 }
 264
 265 void
 266 java_destroy_lexer (java_lexer *lex)
 267 {
 268 #ifdef HAVE_ICONV
 269   if (! lex->use_fallback)
 270     iconv_close (lex->handle);
 271 #endif
 272   free (lex);
 273 }
 274
 275 static int
 276 java_read_char (java_lexer *lex)
 277 {
 278 #ifdef HAVE_ICONV
 279   if (! lex->use_fallback)
 280     {
 281       size_t ir, inbytesleft, in_save, out_count, out_save;
 282       char *inp, *outp;
 283       unicode_t result;
 284
 285       /* If there is data which has already been converted, use it.  */
 286       if (lex->out_first == -1 || lex->out_first >= lex->out_last)
 287         {
 288           lex->out_first = 0;
 289           lex->out_last = 0;
 290
 291           while (1)
 292             {
 293               /* See if we need to read more data.  If FIRST == 0 then
 294                  the previous conversion attempt ended in the middle of
 295                  a character at the end of the buffer.  Otherwise we
 296                  only have to read if the buffer is empty.  */
 297               if (lex->first == 0 || lex->first >= lex->last)
 298                 {
 299                   int r;
 300
 301                   if (lex->first >= lex->last)
 302                     {
 303                       lex->first = 0;
 304                       lex->last = 0;
 305                     }
 306                   if (feof (lex->finput))
 307                     return UEOF;
 308                   r = fread (&lex->buffer[lex->last], 1,
 309                              sizeof (lex->buffer) - lex->last,
 310                              lex->finput);
 311                   lex->last += r;
 312                 }
 313
 314               inbytesleft = lex->last - lex->first;
 315               out_count = sizeof (lex->out_buffer) - lex->out_last;
 316
 317               if (inbytesleft == 0)
 318                 {
 319                   /* We've tried to read and there is nothing left.  */
 320                   return UEOF;
 321                 }
 322
 323               in_save = inbytesleft;
 324               out_save = out_count;
 325               inp = &lex->buffer[lex->first];
 326               outp = (char *) &lex->out_buffer[lex->out_last];
 327               ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
 328                           &inbytesleft, &outp, &out_count);
 329
 330               /* If we haven't read any bytes, then look to see if we
 331                  have read a BOM.  */
 332               if (! lex->read_anything && out_save - out_count >= 2)
 333                 {
 334                   unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
 335                   if (uc == 0xfeff)
 336                     {
 337                       lex->byte_swap = 0;
 338                       lex->out_first += 2;
 339                     }
 340                   else if (uc == 0xfffe)
 341                     {
 342                       lex->byte_swap = 1;
 343                       lex->out_first += 2;
 344                     }
 345                   lex->read_anything = 1;
 346                 }
 347
 348               if (lex->byte_swap)
 349                 {
 350                   unsigned int i;
 351                   for (i = 0; i < out_save - out_count; i += 2)
 352                     {
 353                       char t = lex->out_buffer[lex->out_last + i];
 354                       lex->out_buffer[lex->out_last + i]
 355                         = lex->out_buffer[lex->out_last + i + 1];
 356                       lex->out_buffer[lex->out_last + i + 1] = t;
 357                     }
 358                 }
 359
 360               lex->first += in_save - inbytesleft;
 361               lex->out_last += out_save - out_count;
 362
 363               /* If we converted anything at all, move along.  */
 364               if (out_count != out_save)
 365                 break;
 366
 367               if (ir == (size_t) -1)
 368                 {
 369                   if (errno == EINVAL)
 370                     {
 371                       /* This is ok.  This means that the end of our buffer
 372                          is in the middle of a character sequence.  We just
 373                          move the valid part of the buffer to the beginning
 374                          to force a read.  */
 375                       memmove (&lex->buffer[0], &lex->buffer[lex->first],
 376                                lex->last - lex->first);
 377                       lex->last -= lex->first;
 378                       lex->first = 0;
 379                     }
 380                   else
 381                     {
 382                       /* A more serious error.  */
 383                       char buffer[128];
 384                       sprintf (buffer,
 385                                "Unrecognized character for encoding '%s'",
 386                                lex->encoding);
 387                       java_lex_error (buffer, 0);
 388                       return UEOF;
 389                     }
 390                 }
 391             }
 392         }
 393
 394       if (lex->out_first == -1 || lex->out_first >= lex->out_last)
 395         {
 396           /* Don't have any data.  */
 397           return UEOF;
 398         }
 399
 400       /* Success.  */
 401       result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
 402       lex->out_first += 2;
 403       return result;
 404     }
 405   else
 406 #endif /* HAVE_ICONV */
 407     {
 408       int c, c1, c2;
 409       c = getc (lex->finput);
 410
 411       if (c == EOF)
 412         return UEOF;
 413       if (c < 128)
 414         return (unicode_t) c;
 415       else
 416         {
 417           if ((c & 0xe0) == 0xc0)
 418             {
 419               c1 = getc (lex->finput);
 420               if ((c1 & 0xc0) == 0x80)
 421                 {
 422                   unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
 423                   /* Check for valid 2-byte characters.  We explicitly
 424                      allow \0 because this encoding is common in the
 425                      Java world.  */
 426                   if (r == 0 || (r >= 0x80 && r <= 0x7ff))
 427                     return r;
 428                 }
 429             }
 430           else if ((c & 0xf0) == 0xe0)
 431             {
 432               c1 = getc (lex->finput);
 433               if ((c1 & 0xc0) == 0x80)
 434                 {
 435                   c2 = getc (lex->finput);
 436                   if ((c2 & 0xc0) == 0x80)
 437                     {
 438                       unicode_t r =  (unicode_t)(((c & 0xf) << 12) +
 439                                                  (( c1 & 0x3f) << 6)
 440                                                  + (c2 & 0x3f));
 441                       /* Check for valid 3-byte characters.
 442                          Don't allow surrogate, \ufffe or \uffff.  */
 443                       if (IN_RANGE (r, 0x800, 0xffff)
 444                           && ! IN_RANGE (r, 0xd800, 0xdfff)
 445                           && r != 0xfffe && r != 0xffff)
 446                         return r;
 447                     }
 448                 }
 449             }
 450
 451           /* We simply don't support invalid characters.  We also
 452              don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
 453              cannot be valid Java characters.  */
 454           java_lex_error ("malformed UTF-8 character", 0);
 455         }
 456     }
 457
 458   /* We only get here on error.  */
 459   return UEOF;
 460 }
 461
 462 static int
 463 java_read_unicode (java_lexer *lex, int *unicode_escape_p)
 464 {
 465   int c;
 466
 467   if (lex->unget_value)
 468     {
 469       c = lex->unget_value;
 470       lex->unget_value = 0;
 471     }
 472   else
 473     c = java_read_char (lex);
 474
 475   *unicode_escape_p = 0;
 476
 477   if (c != '\\')
 478     {
 479       lex->bs_count = 0;
 480       return c;
 481     }
 482
 483   ++lex->bs_count;
 484   if ((lex->bs_count) % 2 == 1)
 485     {
 486       /* Odd number of \ seen.  */
 487       c = java_read_char (lex);
 488       if (c == 'u')
 489         {
 490           unicode_t unicode = 0;
 491           int shift = 12;
 492
 493           /* Recognize any number of `u's in \u.  */
 494           while ((c = java_read_char (lex)) == 'u')
 495             ;
 496
 497           shift = 12;
 498           do
 499             {
 500               if (c == UEOF)
 501                 {
 502                   java_lex_error ("prematurely terminated \\u sequence", 0);
 503                   return UEOF;
 504                 }
 505
 506               if (hex_p (c))
 507                 unicode |= (unicode_t)(hex_value (c) << shift);
 508               else
 509                 {
 510                   java_lex_error ("non-hex digit in \\u sequence", 0);
 511                   break;
 512                 }
 513
 514               c = java_read_char (lex);
 515               shift -= 4;
 516             }
 517           while (shift >= 0);
 518
 519           if (c != UEOF)
 520             lex->unget_value = c;
 521
 522           lex->bs_count = 0;
 523           *unicode_escape_p = 1;
 524           return unicode;
 525         }
 526       lex->unget_value = c;
 527     }
 528   return (unicode_t) '\\';
 529 }
 530
 531 /* Get the next Unicode character (post-Unicode-escape-handling).
 532    Move the current position to just after returned character. */
 533
 534 static int
 535 java_get_unicode (void)
 536 {
 537   int next = java_peek_unicode ();
 538   java_next_unicode ();
 539   return next;
 540 }
 541
 542 /* Return the next Unicode character (post-Unicode-escape-handling).
 543    Do not move the current position, which remains just before
 544    the returned character. */
 545
 546 static int
 547 java_peek_unicode (void)
 548 {
 549   int unicode_escape_p;
 550   java_lexer *lex = ctxp->lexer;
 551   int next;
 552
 553   if (lex->avail_unicode)
 554     return lex->next_unicode;
 555
 556   next = java_read_unicode (lex, &unicode_escape_p);
 557
 558   if (next == '\r')
 559     {
 560       /* We have to read ahead to see if we got \r\n.
 561          In that case we return a single line terminator.  */
 562       int dummy;
 563       next = java_read_unicode (lex, &dummy);
 564       if (next != '\n' && next != UEOF)
 565         lex->unget_value = next;
 566       /* In either case we must return a newline.  */
 567       next = '\n';
 568     }
 569
 570   lex->next_unicode = next;
 571   lex->avail_unicode = 1;
 572
 573   if (next == UEOF)
 574     {
 575       lex->next_columns = 0;
 576       return next;
 577     }
 578
 579   if (next == '\n')
 580     {
 581       lex->next_columns = 1 - lex->position.col;
 582     }
 583   else if (next == '\t')
 584     {
 585       int cur_col = lex->position.col;
 586       lex->next_columns = ((cur_col + 7) & ~7) + 1 - cur_col;
 587
 588     }
 589   else
 590     {
 591       lex->next_columns = 1;
 592     }
 593   if (unicode_escape_p)
 594     lex->next_columns = 6;
 595   return next;
 596 }
 597
 598 /* Move forward one Unicode character (post-Unicode-escape-handling).
 599    Only allowed after java_peek_unicode.  The combination java_peek_unicode
 600    followed by java_next_unicode is equivalent to java_get_unicode.  */
 601
 602 static void java_next_unicode (void)
 603 {
 604   struct java_lexer *lex = ctxp->lexer;
 605   lex->position.col += lex->next_columns;
 606   if (lex->next_unicode == '\n')
 607     {
 608       lex->position.line++;
 609 #ifndef JC1_LITE
 610 #ifdef USE_MAPPED_LOCATION
 611       input_location
 612         = linemap_line_start (&line_table, lex->position.line, 120);
 613 #else
 614       input_line = lex->position.line;
 615 #endif
 616 #endif
 617     }
 618   lex->avail_unicode = 0;
 619 }
 620
 621 #if 0
 622 /* The inverse of java_next_unicode.
 623    Not currently used, but could be if it would be cleaner or faster.
 624    java_peek_unicode == java_get_unicode + java_unget_unicode.
 625    java_get_unicode == java_peek_unicode + java_next_unicode.
 626 */
 627 static void java_unget_unicode ()
 628 {
 629   struct java_lexer *lex = ctxp->lexer;
 630   if (lex->avail_unicode)
 631     fatal_error ("internal error - bad unget");
 632   lex->avail_unicode = 1;
 633   lex->position.col -= lex->next_columns;
 634 }
 635 #endif
 636
 637 /* Parse the end of a C style comment.
 638  * C is the first character following the '/' and '*'.  */
 639 static void
 640 java_parse_end_comment (int c)
 641 {
 642   for ( ;; c = java_get_unicode ())
 643     {
 644       switch (c)
 645         {
 646         case UEOF:
 647           java_lex_error ("Comment not terminated at end of input", 0);
 648           return;
 649         case '*':
 650           switch (c = java_peek_unicode ())
 651             {
 652             case UEOF:
 653               java_lex_error ("Comment not terminated at end of input", 0);
 654               return;
 655             case '/':
 656               java_next_unicode ();
 657               return;
 658             case '*':   /* Reparse only '*'.  */
 659               ;
 660             }
 661         }
 662     }
 663 }
 664
 665 /* Parse the documentation section. Keywords must be at the beginning
 666    of a documentation comment line (ignoring white space and any `*'
 667    character). Parsed keyword(s): @DEPRECATED.  */
 668
 669 static void
 670 java_parse_doc_section (int c)
 671 {
 672   int last_was_star;
 673
 674   /* We reset this here, because only the most recent doc comment
 675      applies to the following declaration.  */
 676   ctxp->deprecated = 0;
 677
 678   /* We loop over all the lines of the comment.  We'll eventually exit
 679      if we hit EOF prematurely, or when we see the comment
 680      terminator.  */
 681   while (1)
 682     {
 683       /* These first steps need only be done if we're still looking
 684          for the deprecated tag.  If we've already seen it, we might
 685          as well skip looking for it again.  */
 686       if (! ctxp->deprecated)
 687         {
 688           /* Skip whitespace and '*'s.  We must also check for the end
 689              of the comment here.  */
 690           while (JAVA_WHITE_SPACE_P (c) || c == '*')
 691             {
 692               last_was_star = (c == '*');
 693               c = java_get_unicode ();
 694               if (last_was_star && c == '/')
 695                 {
 696                   /* We just saw the comment terminator.  */
 697                   return;
 698                 }
 699             }
 700
 701           if (c == UEOF)
 702             goto eof;
 703
 704           if (c == '@')
 705             {
 706               const char *deprecated = "@deprecated";
 707               int i;
 708
 709               for (i = 0; deprecated[i]; ++i)
 710                 {
 711                   if (c != deprecated[i])
 712                     break;
 713                   /* We write the code in this way, with the
 714                      update at the end, so that after the loop
 715                      we're left with the next character in C.  */
 716                   c = java_get_unicode ();
 717                 }
 718
 719               if (c == UEOF)
 720                 goto eof;
 721
 722               /* @deprecated must be followed by a space or newline.
 723                  We also allow a '*' in case it appears just before
 724                  the end of a comment.  In this position only we also
 725                  must allow any Unicode space character.  */
 726               if (c == ' ' || c == '\n' || c == '*' || java_space_char_p (c))
 727                 {
 728                   if (! deprecated[i])
 729                     ctxp->deprecated = 1;
 730                 }
 731             }
 732         }
 733
 734       /* We've examined the relevant content from this line.  Now we
 735          skip the remaining characters and start over with the next
 736          line.  We also check for end of comment here.  */
 737       while (c != '\n' && c != UEOF)
 738         {
 739           last_was_star = (c == '*');
 740           c = java_get_unicode ();
 741           if (last_was_star && c == '/')
 742             return;
 743         }
 744
 745       if (c == UEOF)
 746         goto eof;
 747       /* We have to advance past the \n.  */
 748       c = java_get_unicode ();
 749       if (c == UEOF)
 750         goto eof;
 751     }
 752
 753  eof:
 754   java_lex_error ("Comment not terminated at end of input", 0);
 755 }
 756
 757 /* Return true if C is a valid start character for a Java identifier.
 758    This is only called if C >= 128 -- smaller values are handled
 759    inline.  However, this function handles all values anyway.  */
 760 static int
 761 java_start_char_p (unicode_t c)
 762 {
 763   unsigned int hi = c / 256;
 764   const char *const page = type_table[hi];
 765   unsigned long val = (unsigned long) page;
 766   int flags;
 767
 768   if ((val & ~ LETTER_MASK) != 0)
 769     flags = page[c & 255];
 770   else
 771     flags = val;
 772
 773   return flags & LETTER_START;
 774 }
 775
 776 /* Return true if C is a valid part character for a Java identifier.
 777    This is only called if C >= 128 -- smaller values are handled
 778    inline.  However, this function handles all values anyway.  */
 779 static int
 780 java_part_char_p (unicode_t c)
 781 {
 782   unsigned int hi = c / 256;
 783   const char *const page = type_table[hi];
 784   unsigned long val = (unsigned long) page;
 785   int flags;
 786
 787   if ((val & ~ LETTER_MASK) != 0)
 788     flags = page[c & 255];
 789   else
 790     flags = val;
 791
 792   return flags & LETTER_PART;
 793 }
 794
 795 /* Return true if C is whitespace.  */
 796 static int
 797 java_space_char_p (unicode_t c)
 798 {
 799   unsigned int hi = c / 256;
 800   const char *const page = type_table[hi];
 801   unsigned long val = (unsigned long) page;
 802   int flags;
 803
 804   if ((val & ~ LETTER_MASK) != 0)
 805     flags = page[c & 255];
 806   else
 807     flags = val;
 808
 809   return flags & LETTER_SPACE;
 810 }
 811
 812 static int
 813 java_parse_escape_sequence (void)
 814 {
 815   int c;
 816
 817   switch (c = java_get_unicode ())
 818     {
 819     case 'b':
 820       return (unicode_t)0x8;
 821     case 't':
 822       return (unicode_t)0x9;
 823     case 'n':
 824       return (unicode_t)0xa;
 825     case 'f':
 826       return (unicode_t)0xc;
 827     case 'r':
 828       return (unicode_t)0xd;
 829     case '"':
 830       return (unicode_t)0x22;
 831     case '\'':
 832       return (unicode_t)0x27;
 833     case '\\':
 834       return (unicode_t)0x5c;
 835     case '0': case '1': case '2': case '3': case '4':
 836     case '5': case '6': case '7':
 837       {
 838         int more = 3;
 839         unicode_t char_lit = 0;
 840
 841         if (c > '3')
 842           {
 843             /* According to the grammar, `\477' has a well-defined
 844                meaning -- it is `\47' followed by `7'.  */
 845             --more;
 846           }
 847         char_lit = 0;
 848         for (;;)
 849           {
 850             char_lit = 8 * char_lit + c - '0';
 851             if (--more == 0)
 852               break;
 853             c = java_peek_unicode ();
 854             if (! RANGE (c, '0', '7'))
 855               break;
 856             java_next_unicode ();
 857           }
 858
 859         return char_lit;
 860       }
 861     default:
 862       java_lex_error ("Invalid character in escape sequence", -1);
 863       return JAVA_CHAR_ERROR;
 864     }
 865 }
 866
 867 #ifndef JC1_LITE
 868 #define IS_ZERO(X) REAL_VALUES_EQUAL (X, dconst0)
 869
 870 /* Subroutine of java_lex: converts floating-point literals to tree
 871    nodes.  LITERAL_TOKEN is the input literal, JAVA_LVAL is where to
 872    store the result.  FFLAG indicates whether the literal was tagged
 873    with an 'f', indicating it is of type 'float'; NUMBER_BEGINNING
 874    is the line number on which to report any error.  */
 875
 876 static void java_perform_atof (YYSTYPE *, char *, int, int);
 877
 878 static void
 879 java_perform_atof (YYSTYPE *java_lval, char *literal_token, int fflag,
 880                    int number_beginning)
 881 {
 882   REAL_VALUE_TYPE value;
 883   tree type = (fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
 884
 885   SET_REAL_VALUE_ATOF (value,
 886                        REAL_VALUE_ATOF (literal_token, TYPE_MODE (type)));
 887
 888   if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
 889     {
 890       JAVA_FLOAT_RANGE_ERROR (fflag ? "float" : "double");
 891       value = DCONST0;
 892     }
 893   else if (IS_ZERO (value))
 894     {
 895       /* We check to see if the value is really 0 or if we've found an
 896          underflow.  We do this in the most primitive imaginable way.  */
 897       int really_zero = 1;
 898       char *p = literal_token;
 899       if (*p == '-')
 900         ++p;
 901       while (*p && *p != 'e' && *p != 'E')
 902         {
 903           if (*p != '0' && *p != '.')
 904             {
 905               really_zero = 0;
 906               break;
 907             }
 908           ++p;
 909         }
 910       if (! really_zero)
 911         {
 912           int save_col = ctxp->lexer->position.col;
 913           ctxp->lexer->position.col = number_beginning;
 914           java_lex_error ("Floating point literal underflow", 0);
 915           ctxp->lexer->position.col = save_col;
 916         }
 917     }
 918
 919   SET_LVAL_NODE (build_real (type, value));
 920 }
 921 #endif
 922
 923 static int yylex (YYSTYPE *);
 924
     /* Scan the next token from the input and return its token code.
        White space and line terminators are skipped, and comments make the
        scan start over at `step1'.  Tokens that carry a value (numeric,
        character and string literals, identifiers, type and literal
        keywords) record it through JAVA_LVAL, using SET_LVAL_NODE and
        related macros defined elsewhere.  Returns 0 at end of input and
        after reporting an invalid input character.  This body is compiled
        as yylex under JC1_LITE and as do_java_lex otherwise.  */
 925 static int
 926 #ifdef JC1_LITE
 927 yylex (YYSTYPE *java_lval)
 928 #else
 929 do_java_lex (YYSTYPE *java_lval)
 930 #endif
 931 {
 932   int c;
 933   char *string;
 934
 935   /* Translation of the Unicode escape in the raw stream of Unicode
 936      characters. Takes care of line terminator.  */
 937  step1:
 938   /* Skip white spaces: SP, TAB and FF or ULT.  */
 939   for (;;)
 940     {
 941       c = java_peek_unicode ();
 942       if (c != '\n' && ! JAVA_WHITE_SPACE_P (c))
 943         break;
 944       java_next_unicode ();
 945     }
 946
 947   /* Handle EOF here.  */
 948   if (c == UEOF)        /* Should probably do something here...  */
 949     return 0;
 950
       /* Record where this token starts: either map the current column
          into input_location, or save the lexer position in token_start.  */
 951 #ifndef JC1_LITE
 952 #ifdef USE_MAPPED_LOCATION
 953   LINEMAP_POSITION_FOR_COLUMN (input_location, &line_table,
 954                                ctxp->lexer->position.col);
 955 #else
 956   ctxp->lexer->token_start = ctxp->lexer->position;
 957 #endif
 958 #endif
 959
 960   /* Numeric literals.  */
 961   if (JAVA_ASCII_DIGIT (c) || (c == '.'))
 962     {
 963       /* This section of code is borrowed from gcc/c-lex.c.  */
 964 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
 965       int parts[TOTAL_PARTS];
 966       HOST_WIDE_INT high, low;
 967       /* End borrowed section.  */
 968
 969 #define MAX_TOKEN_LEN 256
 970       char literal_token [MAX_TOKEN_LEN + 1];
 971       int  literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
 972       int  found_hex_digits = 0, found_non_octal_digits = -1;
 973       int  i;
 974 #ifndef JC1_LITE
 975       int  number_beginning = ctxp->lexer->position.col;
 976       tree value;
 977 #endif
 978
 979       for (i = 0; i < TOTAL_PARTS; i++)
 980         parts [i] = 0;
 981
           /* A leading `0' introduces a hexadecimal literal (0x/0X), an
              octal literal (another digit follows), a floating-point
              literal, or is a complete zero literal by itself.  */
 982       if (c == '0')
 983         {
 984           java_next_unicode ();
 985           c = java_peek_unicode ();
 986           if (c == 'x' || c == 'X')
 987             {
 988               radix = 16;
 989               java_next_unicode ();
 990               c = java_peek_unicode ();
 991             }
 992           else if (JAVA_ASCII_DIGIT (c))
 993             {
 994               literal_token [literal_index++] = '0';
 995               radix = 8;
 996             }
 997           else if (c == '.' || c == 'e' || c =='E')
 998             {
 999               literal_token [literal_index++] = '0';
1000               /* Handle C during floating-point parsing.  */
1001             }
1002           else
1003             {
1004               /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}.  */
1005               switch (c)
1006                 {
1007                 case 'L': case 'l':
1008                   java_next_unicode ();
1009                   SET_LVAL_NODE (long_zero_node);
1010                   return (INT_LIT_TK);
1011                 case 'f': case 'F':
1012                   java_next_unicode ();
1013                   SET_LVAL_NODE (float_zero_node);
1014                   return (FP_LIT_TK);
1015                 case 'd': case 'D':
1016                   java_next_unicode ();
1017                   SET_LVAL_NODE (double_zero_node);
1018                   return (FP_LIT_TK);
1019                 default:
1020                   SET_LVAL_NODE (integer_zero_node);
1021                   return (INT_LIT_TK);
1022                 }
1023             }
1024         }
1025
1026       /* Terminate LITERAL_TOKEN in case we bail out on large tokens.  */
1027       literal_token [MAX_TOKEN_LEN] = '\0';
1028
1029       /* Parse the first part of the literal, until we find something
1030          which is not a number.  */
1031       while ((radix == 16 ? JAVA_ASCII_HEXDIGIT (c) : JAVA_ASCII_DIGIT (c))
1032              && literal_index < MAX_TOKEN_LEN)
1033         {
1034           /* We store in a string (in case it turns out to be a FP) and in
1035              PARTS if we have to process an integer literal.  */
1036           int numeric = hex_value (c);
1037           int count;
1038
1039           /* Remember when we find a valid hexadecimal digit.  */
1040           if (radix == 16)
1041             found_hex_digits = 1;
1042           /* Remember when we find an invalid octal digit.  */
1043           else if (radix == 8 && numeric >= 8 && found_non_octal_digits < 0)
1044             found_non_octal_digits = literal_index;
1045
1046           literal_token [literal_index++] = c;
1047           /* This section of code is borrowed from gcc/c-lex.c.  */
1048           for (count = 0; count < TOTAL_PARTS; count++)
1049             {
1050               parts[count] *= radix;
1051               if (count)
1052                 {
1053                   parts[count]   += (parts[count-1] >> HOST_BITS_PER_CHAR);
1054                   parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1055                 }
1056               else
1057                 parts[0] += numeric;
1058             }
1059           if (parts [TOTAL_PARTS-1] != 0)
1060             overflow = 1;
1061           /* End borrowed section.  */
1062           java_next_unicode ();
1063           c = java_peek_unicode ();
1064         }
1065
1066       /* If we have something from the FP char set but not a digit, parse
1067          a FP literal.  */
1068       if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1069         {
1070           /* stage==0: seen digits only
1071            * stage==1: seen '.'
1072            * stage==2: seen 'e' or 'E'.
1073            * stage==3: seen '+' or '-' after 'e' or 'E'.
1074            * stage==4: seen type suffix ('f'/'F'/'d'/'D')
1075            */
1076           int stage = 0;
1077           int seen_digit = (literal_index ? 1 : 0);
1078           int seen_exponent = 0;
1079           int fflag = 0;        /* 1 for {f,F}, 0 for {d,D}. FP literal are
1080                                    double unless specified.  */
1081
1082           /* It is ok if the radix is 8 because this just means we've
1083              seen a leading `0'.  However, radix==16 is invalid.  */
1084           if (radix == 16)
1085             java_lex_error ("Can't express non-decimal FP literal", 0);
1086           radix = 10;
1087
1088           for (; literal_index < MAX_TOKEN_LEN;)
1089             {
1090               if (c == '.')
1091                 {
1092                   if (stage < 1)
1093                     {
1094                       stage = 1;
1095                       literal_token [literal_index++ ] = c;
1096                       java_next_unicode ();
1097                       c = java_peek_unicode ();
                           /* A `.' that begins the token and is not followed
                              by a digit is the DOT operator, not a literal.  */
1098                       if (literal_index == 1 && !JAVA_ASCII_DIGIT (c))
1099                         BUILD_OPERATOR (DOT_TK);
1100                     }
1101                   else
1102                     java_lex_error ("Invalid character in FP literal", 0);
1103                 }
1104
1105               if ((c == 'e' || c == 'E') && literal_index < MAX_TOKEN_LEN)
1106                 {
1107                   if (stage < 2)
1108                     {
1109                       /* {E,e} must have seen at least a digit.  */
1110                       if (!seen_digit)
1111                         java_lex_error
1112                           ("Invalid FP literal, mantissa must have digit", 0);
1113                       seen_digit = 0;
1114                       seen_exponent = 1;
1115                       stage = 2;
1116                       literal_token [literal_index++] = c;
1117                       java_next_unicode ();
1118                       c = java_peek_unicode ();
1119                     }
1120                   else
1121                     java_lex_error ("Invalid character in FP literal", 0);
1122                 }
1123               if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1124                 {
1125                   fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1126                   stage = 4;    /* So we fall through.  */
1127                 }
1128
1129               if ((c=='-' || c =='+') && stage == 2
1130                   && literal_index < MAX_TOKEN_LEN)
1131                 {
1132                   stage = 3;
1133                   literal_token [literal_index++] = c;
1134                   java_next_unicode ();
1135                   c = java_peek_unicode ();
1136                 }
1137
1138               if (((stage == 0 && JAVA_ASCII_FPCHAR (c))
1139                    || (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.'))
1140                    || (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c)))
1141                    || (stage == 3 && JAVA_ASCII_DIGIT (c)))
1142                   && literal_index < MAX_TOKEN_LEN)
1143                 {
1144                   if (JAVA_ASCII_DIGIT (c))
1145                     seen_digit = 1;
1146                   if (stage == 2)
1147                     stage = 3;
1148                   literal_token [literal_index++ ] = c;
1149                   java_next_unicode ();
1150                   c = java_peek_unicode ();
1151                 }
                   /* C cannot extend the literal: terminate the collected
                      text and convert it.  */
1152               else if (literal_index < MAX_TOKEN_LEN)
1153                 {
1154                   if (stage == 4) /* Don't push back fF/dD.  */
1155                     java_next_unicode ();
1156
1157                   /* An exponent (if any) must have seen a digit.  */
1158                   if (seen_exponent && !seen_digit)
1159                     java_lex_error
1160                       ("Invalid FP literal, exponent must have digit", 0);
1161
1162                   literal_token [literal_index] = '\0';
1163
1164 #ifndef JC1_LITE
1165                   java_perform_atof (java_lval, literal_token,
1166                                      fflag, number_beginning);
1167 #endif
1168                   return FP_LIT_TK;
1169                 }
1170             }
1171         } /* JAVA_ASCII_FPCHAR (c) */
1172
1173       /* Here we get back to converting the integral literal.  */
1174       if (radix == 16 && ! found_hex_digits)
1175         java_lex_error
1176           ("0x must be followed by at least one hexadecimal digit", 0);
1177       else if (radix == 8 && found_non_octal_digits >= 0)
1178         {
               /* Temporarily move the reported column back onto the first
                  offending digit, then restore it.  */
1179           int back = literal_index - found_non_octal_digits;
1180           ctxp->lexer->position.col -= back;
1181           java_lex_error ("Octal literal contains digit out of range", 0);
1182           ctxp->lexer->position.col += back;
1183         }
1184       else if (c == 'L' || c == 'l')
1185         {
1186           java_next_unicode ();
1187           long_suffix = 1;
1188         }
1189
1190       /* This section of code is borrowed from gcc/c-lex.c.  */
1191       if (!overflow)
1192         {
1193           bytes = GET_TYPE_PRECISION (long_type_node);
1194           for (i = bytes; i < TOTAL_PARTS; i++)
1195             if (parts [i])
1196               {
1197                 overflow = 1;
1198                 break;
1199               }
1200         }
1201       high = low = 0;
1202       for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1203         {
1204           high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1205                                               / HOST_BITS_PER_CHAR)]
1206                    << (i * HOST_BITS_PER_CHAR));
1207           low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1208         }
1209       /* End borrowed section.  */
1210
1211 #ifndef JC1_LITE
1212       /* Range checking.  */
1213       /* Temporarily set type to unsigned.  */
1214       value = build_int_cst_wide (long_suffix
1215                                   ? unsigned_long_type_node
1216                                   : unsigned_int_type_node, low, high);
1217       SET_LVAL_NODE (value);
1218
1219       /* For base 10 numbers, only values up to the highest value
1220          (plus one) can be written.  For instance, only ints up to
1221          2147483648 can be written.  The special case of the largest
1222          negative value is handled elsewhere.  For other bases, any
1223          number can be represented.  */
1224       if (overflow || (radix == 10
1225                        && tree_int_cst_lt (long_suffix
1226                                            ? decimal_long_max
1227                                            : decimal_int_max,
1228                                            value)))
1229         {
1230           if (long_suffix)
1231             JAVA_RANGE_ERROR ("Numeric overflow for 'long' literal");
1232           else
1233             JAVA_RANGE_ERROR ("Numeric overflow for 'int' literal");
1234         }
1235
1236       /* Sign extend the value.  */
1237       value = build_int_cst_wide (long_suffix ? long_type_node : int_type_node,
1238                                   low, high);
1239       value = force_fit_type (value, 0, false, false);
1240
1241       if (radix != 10)
1242         {
1243           value = copy_node (value);
1244           JAVA_NOT_RADIX10_FLAG (value) = 1;
1245         }
1246
1247       SET_LVAL_NODE (value);
1248 #endif
1249       return INT_LIT_TK;
1250     }
1251
1252   /* We may have an ID here.  */
1253   if (JAVA_START_CHAR_P (c))
1254     {
1255       int ascii_index = 0, all_ascii = 1;
1256
1257       /* Keyword, boolean literal or null literal.  */
1258       while (c != UEOF && JAVA_PART_CHAR_P (c))
1259         {
1260           java_unicode_2_utf8 (c);
1261           if (c >= 128)
1262             all_ascii = 0;
1263           java_next_unicode ();
1264           ascii_index++;
1265           c = java_peek_unicode ();
1266         }
1267
1268       obstack_1grow (&temporary_obstack, '\0');
1269       string = obstack_finish (&temporary_obstack);
1270
1271       /* If we have something all ascii, we consider a keyword, a boolean
1272          literal, a null literal or an all ASCII identifier.  Otherwise,
1273          this is an identifier (possibly not respecting formation rule).  */
1274       if (all_ascii)
1275         {
1276           const struct java_keyword *kw;
1277           if ((kw=java_keyword (string, ascii_index)))
1278             {
1279               switch (kw->token)
1280                 {
1281                 case PUBLIC_TK:       case PROTECTED_TK: case STATIC_TK:
1282                 case ABSTRACT_TK:     case FINAL_TK:     case NATIVE_TK:
1283                 case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1284                 case PRIVATE_TK:      case STRICT_TK:
1285                   SET_MODIFIER_CTX (kw->token);
1286                   return MODIFIER_TK;
1287                 case FLOAT_TK:
1288                   SET_LVAL_NODE (float_type_node);
1289                   return FP_TK;
1290                 case DOUBLE_TK:
1291                   SET_LVAL_NODE (double_type_node);
1292                   return FP_TK;
1293                 case BOOLEAN_TK:
1294                   SET_LVAL_NODE (boolean_type_node);
1295                   return BOOLEAN_TK;
1296                 case BYTE_TK:
1297                   SET_LVAL_NODE (byte_type_node);
1298                   return INTEGRAL_TK;
1299                 case SHORT_TK:
1300                   SET_LVAL_NODE (short_type_node);
1301                   return INTEGRAL_TK;
1302                 case INT_TK:
1303                   SET_LVAL_NODE (int_type_node);
1304                   return INTEGRAL_TK;
1305                 case LONG_TK:
1306                   SET_LVAL_NODE (long_type_node);
1307                   return INTEGRAL_TK;
1308                 case CHAR_TK:
1309                   SET_LVAL_NODE (char_type_node);
1310                   return INTEGRAL_TK;
1311
1312                   /* Keyword based literals.  */
1313                 case TRUE_TK:
1314                 case FALSE_TK:
1315                   SET_LVAL_NODE ((kw->token == TRUE_TK ?
1316                                   boolean_true_node : boolean_false_node));
1317                   return BOOL_LIT_TK;
1318                 case NULL_TK:
1319                   SET_LVAL_NODE (null_pointer_node);
1320                   return NULL_TK;
1321
1322                 case ASSERT_TK:
                       /* `assert' is a keyword only when flag_assert is set;
                          otherwise leave the switch so that it is lexed as
                          an identifier below.  */
1323                   if (flag_assert)
1324                     {
1325                       BUILD_OPERATOR (kw->token);
1326                       return kw->token;
1327                     }
1328                   else
1329                     break;
1330
1331                   /* For some keywords we want to retain information on the
1332                      location where they were found.  */
1333                 case CASE_TK:
1334                 case DEFAULT_TK:
1335                 case SUPER_TK:
1336                 case THIS_TK:
1337                 case RETURN_TK:
1338                 case BREAK_TK:
1339                 case CONTINUE_TK:
1340                 case TRY_TK:
1341                 case CATCH_TK:
1342                 case THROW_TK:
1343                 case INSTANCEOF_TK:
1344                   BUILD_OPERATOR (kw->token);
1345
1346                 default:
1347                   return kw->token;
1348                 }
1349             }
1350         }
1351
1352       java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1353       return ID_TK;
1354     }
1355
       /* Consume C: everything below decides the token from a character
          that has already been read.  */
1356   java_next_unicode ();
1357
1358   /* Character literals.  */
1359   if (c == '\'')
1360     {
1361       int char_lit;
1362
1363       if ((c = java_get_unicode ()) == '\\')
1364         char_lit = java_parse_escape_sequence ();
1365       else
1366         {
1367           if (c == '\n' || c == '\'')
1368             java_lex_error ("Invalid character literal", 0);
1369           char_lit = c;
1370         }
1371
1372       c = java_get_unicode ();
1373
1374       if ((c == '\n') || (c == UEOF))
1375         java_lex_error ("Character literal not terminated at end of line", 0);
1376       if (c != '\'')
1377         java_lex_error ("Syntax error in character literal", 0);
1378
1379       if (char_lit == JAVA_CHAR_ERROR)
1380         char_lit = 0;           /* We silently convert it to zero.  */
1381
1382       SET_LVAL_NODE (build_int_cst (char_type_node, char_lit));
1383       return CHAR_LIT_TK;
1384     }
1385
1386   /* String literals.  */
1387   if (c == '"')
1388     {
1389       int no_error = 1;
1390       char *string;
1391
1392       for (;;)
1393         {
1394           c = java_peek_unicode ();
1395           if (c == '\n' || c == UEOF) /* ULT.  */
1396             {
1397               java_lex_error ("String not terminated at end of line", 0);
1398               break;
1399             }
1400           java_next_unicode ();
1401           if (c == '"')
1402             break;
1403           if (c == '\\')
1404             c = java_parse_escape_sequence ();
1405           if (c == JAVA_CHAR_ERROR)
1406             {
1407               no_error = 0;
1408               c = 0;            /* We silently convert it to zero.  */
1409             }
1410           java_unicode_2_utf8 (c);
1411         }
1412
1413       obstack_1grow (&temporary_obstack, '\0');
1414       string = obstack_finish (&temporary_obstack);
1415 #ifndef JC1_LITE
1416       if (!no_error || (c != '"'))
1417         java_lval->node = error_mark_node; /* FIXME: Requires further
1418                                               testing.  */
1419       else
1420         java_lval->node = build_string (strlen (string), string);
1421 #endif
1422       obstack_free (&temporary_obstack, string);
1423       return STRING_LIT_TK;
1424     }
1425
       /* Punctuation and operators.  NOTE(review): most cases below carry
          no `break' or `return' of their own; they appear to rely on
          BUILD_OPERATOR and BUILD_OPERATOR2 (defined elsewhere) returning
          the token from this function -- confirm against those macros.  */
1426   switch (c)
1427     {
1428     case '/':
1429       /* Check for comment.  */
1430       switch (c = java_peek_unicode ())
1431         {
1432         case '/':
1433           java_next_unicode ();
1434           for (;;)
1435             {
1436               c = java_get_unicode ();
1437               if (c == UEOF)
1438                 {
1439                   /* It is ok to end a `//' comment with EOF, unless
1440                      we're being pedantic.  */
1441                   if (pedantic)
1442                     java_lex_error ("Comment not terminated at end of input",
1443                                     0);
1444                   return 0;
1445                 }
1446               if (c == '\n')    /* ULT */
1447                 goto step1;
1448             }
1449           break;
1450
1451         case '*':
1452           java_next_unicode ();
1453           if ((c = java_get_unicode ()) == '*')
1454             {
1455               c = java_get_unicode ();
1456               if (c == '/')
1457                 {
1458                   /* Empty documentation comment.  We have to reset
1459                      the deprecation marker as only the most recent
1460                      doc comment applies.  */
1461                   ctxp->deprecated = 0;
1462                 }
1463               else
1464                 java_parse_doc_section (c);
1465             }
1466           else
1467             java_parse_end_comment ((c = java_get_unicode ()));
1468           goto step1;
1469           break;
1470
1471         case '=':
1472           java_next_unicode ();
1473           BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1474
1475         default:
1476           BUILD_OPERATOR (DIV_TK);
1477         }
1478
1479     case '(':
1480       BUILD_OPERATOR (OP_TK);
1481     case ')':
1482       return CP_TK;
1483     case '{':
1484 #ifndef JC1_LITE
1485       java_lval->operator.token = OCB_TK;
1486       java_lval->operator.location = BUILD_LOCATION();
1487 #ifdef USE_MAPPED_LOCATION
1488       if (ctxp->ccb_indent == 1)
1489         ctxp->first_ccb_indent1 = input_location;
1490 #else
1491       if (ctxp->ccb_indent == 1)
1492         ctxp->first_ccb_indent1 = input_line;
1493 #endif
1494 #endif
1495       ctxp->ccb_indent++;
1496       return OCB_TK;
1497     case '}':
1498       ctxp->ccb_indent--;
1499 #ifndef JC1_LITE
1500       java_lval->operator.token = CCB_TK;
1501       java_lval->operator.location = BUILD_LOCATION();
1502 #ifdef USE_MAPPED_LOCATION
1503       if (ctxp->ccb_indent == 1)
1504         ctxp->last_ccb_indent1 = input_location;
1505 #else
1506       if (ctxp->ccb_indent == 1)
1507         ctxp->last_ccb_indent1 = input_line;
1508 #endif
1509 #endif
1510       return CCB_TK;
1511     case '[':
1512       BUILD_OPERATOR (OSB_TK);
1513     case ']':
1514       return CSB_TK;
1515     case ';':
1516       return SC_TK;
1517     case ',':
1518       return C_TK;
1519     case '.':
1520       BUILD_OPERATOR (DOT_TK);
1521
1522       /* Operators.  */
1523     case '=':
1524       c = java_peek_unicode ();
1525       if (c == '=')
1526         {
1527           java_next_unicode ();
1528           BUILD_OPERATOR (EQ_TK);
1529         }
1530       else
1531         {
1532           /* Equals is used in two different locations. In the
1533              variable_declarator: rule, it has to be seen as '=' as opposed
1534              to being seen as an ordinary assignment operator in
1535              assignment_operators: rule.  */
1536           BUILD_OPERATOR (ASSIGN_TK);
1537         }
1538
1539     case '>':
1540       switch ((c = java_peek_unicode ()))
1541         {
1542         case '=':
1543           java_next_unicode ();
1544           BUILD_OPERATOR (GTE_TK);
1545         case '>':
1546           java_next_unicode ();
1547           switch ((c = java_peek_unicode ()))
1548             {
1549             case '>':
1550               java_next_unicode ();
1551               c = java_peek_unicode ();
1552               if (c == '=')
1553                 {
1554                   java_next_unicode ();
1555                   BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1556                 }
1557               else
1558                 {
1559                   BUILD_OPERATOR (ZRS_TK);
1560                 }
1561             case '=':
1562               java_next_unicode ();
1563               BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1564             default:
1565               BUILD_OPERATOR (SRS_TK);
1566             }
1567         default:
1568           BUILD_OPERATOR (GT_TK);
1569         }
1570
1571     case '<':
1572       switch ((c = java_peek_unicode ()))
1573         {
1574         case '=':
1575           java_next_unicode ();
1576           BUILD_OPERATOR (LTE_TK);
1577         case '<':
1578           java_next_unicode ();
1579           if ((c = java_peek_unicode ()) == '=')
1580             {
1581               java_next_unicode ();
1582               BUILD_OPERATOR2 (LS_ASSIGN_TK);
1583             }
1584           else
1585             {
1586               BUILD_OPERATOR (LS_TK);
1587             }
1588         default:
1589           BUILD_OPERATOR (LT_TK);
1590         }
1591
1592     case '&':
1593       switch ((c = java_peek_unicode ()))
1594         {
1595         case '&':
1596           java_next_unicode ();
1597           BUILD_OPERATOR (BOOL_AND_TK);
1598         case '=':
1599           java_next_unicode ();
1600           BUILD_OPERATOR2 (AND_ASSIGN_TK);
1601         default:
1602           BUILD_OPERATOR (AND_TK);
1603         }
1604
1605     case '|':
1606       switch ((c = java_peek_unicode ()))
1607         {
1608         case '|':
1609           java_next_unicode ();
1610           BUILD_OPERATOR (BOOL_OR_TK);
1611         case '=':
1612           java_next_unicode ();
1613           BUILD_OPERATOR2 (OR_ASSIGN_TK);
1614         default:
1615           BUILD_OPERATOR (OR_TK);
1616         }
1617
1618     case '+':
1619       switch ((c = java_peek_unicode ()))
1620         {
1621         case '+':
1622           java_next_unicode ();
1623           BUILD_OPERATOR (INCR_TK);
1624         case '=':
1625           java_next_unicode ();
1626           BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1627         default:
1628           BUILD_OPERATOR (PLUS_TK);
1629         }
1630
1631     case '-':
1632       switch ((c = java_peek_unicode ()))
1633         {
1634         case '-':
1635           java_next_unicode ();
1636           BUILD_OPERATOR (DECR_TK);
1637         case '=':
1638           java_next_unicode ();
1639           BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1640         default:
1641           BUILD_OPERATOR (MINUS_TK);
1642         }
1643
1644     case '*':
1645       if ((c = java_peek_unicode ()) == '=')
1646         {
1647           java_next_unicode ();
1648           BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1649         }
1650       else
1651         {
1652           BUILD_OPERATOR (MULT_TK);
1653         }
1654
1655     case '^':
1656       if ((c = java_peek_unicode ()) == '=')
1657         {
1658           java_next_unicode ();
1659           BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1660         }
1661       else
1662         {
1663           BUILD_OPERATOR (XOR_TK);
1664         }
1665
1666     case '%':
1667       if ((c = java_peek_unicode ()) == '=')
1668         {
1669           java_next_unicode ();
1670           BUILD_OPERATOR2 (REM_ASSIGN_TK);
1671         }
1672       else
1673         {
1674           BUILD_OPERATOR (REM_TK);
1675         }
1676
1677     case '!':
1678       if ((c = java_peek_unicode()) == '=')
1679         {
1680           java_next_unicode ();
1681           BUILD_OPERATOR (NEQ_TK);
1682         }
1683       else
1684         {
1685           BUILD_OPERATOR (NEG_TK);
1686         }
1687
1688     case '?':
1689       BUILD_OPERATOR (REL_QM_TK);
1690     case ':':
1691       BUILD_OPERATOR (REL_CL_TK);
1692     case '~':
1693       BUILD_OPERATOR (NOT_TK);
1694     }
1695
1696   if (c == 0x1a)                /* CTRL-Z.  */
1697     {
           /* A CTRL-Z immediately followed by the end of input is accepted
              as end of input; otherwise the character after it is the one
              reported below.  */
1698       if ((c = java_peek_unicode ()) == UEOF)
1699         return 0;               /* Ok here.  */
1700     }
1701
1702   /* Everything else is an invalid character in the input.  */
1703   {
1704     char lex_error_buffer [128];
1705     sprintf (lex_error_buffer, "Invalid character '%s' in input",
1706              java_sprint_unicode (c));
1707     java_lex_error (lex_error_buffer, -1);
1708   }
1709   return 0;
1710 }
1711
1712 #ifndef JC1_LITE
1713
1714 /* Timed entry point of the lexer: run do_java_lex on JAVA_LVAL with the
        TV_LEX timer pushed for the duration of the call, and return the
        token code it produced.  */
1715 static int
1716 java_lex (YYSTYPE *java_lval)
1717 {
1718   int token;
1719
1720   timevar_push (TV_LEX);
1721   token = do_java_lex (java_lval);
1722   timevar_pop (TV_LEX);
1723   return token;
1724 }
1725
1726 /* This is called by the parser to see if an error should be generated
1727    due to numeric overflow.  This function only handles the particular
1728    case of the largest negative value, and is only called in the case
1729    where this value is not preceded by `-'.  */
1730 static void
1731 error_if_numeric_overflow (tree value)
1732 {
1733   if (TREE_CODE (value) == INTEGER_CST
1734       && !JAVA_NOT_RADIX10_FLAG (value)
1735       && tree_int_cst_sgn (value) < 0)
1736     {
1737       if (TREE_TYPE (value) == long_type_node)
1738         java_lex_error ("Numeric overflow for 'long' literal", 0);
1739       else
1740         java_lex_error ("Numeric overflow for 'int' literal", 0);
1741     }
1742 }
1743
1744 #endif /* JC1_LITE */
1745
/* Append the encoding of UNICODE to the object being grown in
   temporary_obstack: one byte for 0x01-0x7f; two bytes for 0x80-0x7ff
   and also for 0 (which comes out as 0xc0 0x80); three bytes for
   anything else, which is assumed to lie in 0x800-0xffff.  */
1746 static void
1747 java_unicode_2_utf8 (unicode_t unicode)
1748 {
1749   if (RANGE (unicode, 0x01, 0x7f))
1750     obstack_1grow (&temporary_obstack, (char)unicode);
1751   else
1752     {
           /* Every multi-byte form ends with the same continuation byte
              built from the low six bits.  */
1753       const int two_bytes = RANGE (unicode, 0x80, 0x7ff) || unicode == 0;
1754       const unsigned char trail = (unsigned char)(0x80 | (unicode & 0x3f));

1755       if (two_bytes)
1756         obstack_1grow (&temporary_obstack,
1757                        (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
1758       else
1759         {
1760           obstack_1grow (&temporary_obstack,
1761                          (unsigned char)(0xe0 | ((unicode & 0xf000) >> 12)));
1762           obstack_1grow (&temporary_obstack,
1763                          (unsigned char)(0x80 | ((unicode & 0x0fc0) >> 6)));
1764         }
1765       obstack_1grow (&temporary_obstack, trail);
1766     }
1767 }
1768
1769 #ifndef JC1_LITE
/* Wrap NODE with build_expr_wfl and return the result.  The location
   given to build_expr_wfl is input_location when USE_MAPPED_LOCATION
   is defined, and otherwise ctxp->filename together with the line and
   column saved in the lexer's token_start.  */
1770 static tree
1771 build_wfl_node (tree node)
1772 {
1773   tree wfl;

1774 #ifndef USE_MAPPED_LOCATION
1775   wfl = build_expr_wfl (node, ctxp->filename,
1776                         ctxp->lexer->token_start.line,
1777                         ctxp->lexer->token_start.col);
1778 #else
1779   wfl = build_expr_wfl (node, input_location);
1780 #endif
       /* Clear the type so that java_complete_lhs does not short-circuit
          the node when it is a constant.  */
1781   TREE_TYPE (wfl) = NULL_TREE;
1782   return wfl;
1783 }
1784 #endif
1785
/* Report the lexical error MSG through java_error.  The reported column
   is the lexer's current column plus FORWARD times lexer->next_columns
   (callers in this file pass 0 or -1 for FORWARD).  The location used
   for the report is installed temporarily and the previous one is put
   back afterwards.  Under JC1_LITE this function does nothing.  */
1786 static void
1787 java_lex_error (const char *msg ATTRIBUTE_UNUSED, int forward ATTRIBUTE_UNUSED)
1788 {
1789 #ifndef JC1_LITE
1790   int col = (ctxp->lexer->position.col
1791              + forward * ctxp->lexer->next_columns);
       /* Tested with #ifdef, like every other use of USE_MAPPED_LOCATION
          in this file, so all of them select the same configuration.  */
1792 #ifdef USE_MAPPED_LOCATION
1793   source_location save_location = input_location;
1794   LINEMAP_POSITION_FOR_COLUMN (input_location, &line_table, col);
1795 #else
1796   java_lc save = ctxp->lexer->token_start;
1797   ctxp->lexer->token_start.line = ctxp->lexer->position.line;
1798   ctxp->lexer->token_start.col = col;
1799 #endif
1800
1801   /* Might be caught in the middle of some error report.  */
1802   ctxp->java_error_flag = 0;
1803   java_error (NULL);
1804   java_error (msg);
1805
1806   /* Put back the location that was overridden for the report.  */
1807 #ifdef USE_MAPPED_LOCATION
1808   input_location = save_location;
1809 #else
1810   ctxp->lexer->token_start = save;
1811 #endif
1812 #endif
1813 }
1814
1815 #ifndef JC1_LITE
/* Return 1 if C, a character just read from FP, is a line terminator,
   0 otherwise.  '\n' and '\r' both qualify; after a '\r' one more
   character is read from FP and pushed back unless it is '\n' or EOF,
   so that a CR LF pair counts as a single terminator.  */
1816 static int
1817 java_is_eol (FILE *fp, int c)
1818 {
1819   int lookahead;
1820
1821   if (c == '\n')
1822     return 1;
1823   if (c != '\r')
1824     return 0;
1825
       /* Carriage return: swallow a directly following line feed.  */
1826   lookahead = getc (fp);
1827   if (!(lookahead == '\n' || lookahead == EOF))
1828     ungetc (lookahead, fp);
1829   return 1;
1830 }
1833 #endif
1834
/* Return a string, finished on temporary_obstack, made of the text of
   line LINE of file FILENAME, a newline, COL + 2 padding characters and
   a `^'.  COL == -1 stands for the end of the line and COL == -2 for
   the column of the first non-space character of the line.  When the
   file ends before LINE is reached, a "file too short" message takes
   the place of the line's text.  fatal_error is called if FILENAME
   cannot be opened.  Under JC1_LITE nothing is done and 0 is returned.  */
1835 char *
1836 java_get_line_col (const char *filename ATTRIBUTE_UNUSED,
1837                    int line ATTRIBUTE_UNUSED, int col ATTRIBUTE_UNUSED)
1838 {
1839 #ifdef JC1_LITE
1840   return 0;
1841 #else
1842   /* Dumb implementation. Doesn't try to cache or optimize things.  */
1843   /* First line of the file is line 1, first column is 1.  */
1844
1845   /* COL == -1 means, at the CR/LF in LINE.  */
1846   /* COL == -2 means, at the first non space char in LINE.  */
1847
1848   FILE *fp;
1849   int c, ccol, cline = 1;
1850   int current_line_col = 0;
1851   int first_non_space = 0;
1852   int seen_non_space = 0;
1853
1854   if (!(fp = fopen (filename, "r")))
1855     fatal_error ("can't open %s: %m", filename);
1856
1857   while (cline != line)
1858     {
1859       c = getc (fp);
1860       if (c == EOF)
1861         {
1862           static const char msg[] = "<<file too short - unexpected EOF>>";
1863           obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
1864           goto have_line;
1865         }
1866       if (java_is_eol (fp, c))
1867         cline++;
1868     }
1869
1870   /* Gather the chars of the current line in a buffer.  */
1871   for (;;)
1872     {
1873       c = getc (fp);
1874       if (c < 0 || java_is_eol (fp, c))
1875         break;
           /* A separate flag records that the first non-space character
              was found: its column may legitimately be 0, so the column
              itself cannot double as the "found" marker.  */
1876       if (!seen_non_space && !JAVA_WHITE_SPACE_P (c))
             {
               seen_non_space = 1;
1877           first_non_space = current_line_col;
             }
1878       obstack_1grow (&temporary_obstack, c);
1879       current_line_col++;
1880     }
1881  have_line:
1882
1883   obstack_1grow (&temporary_obstack, '\n');
1884
1885   if (col == -1)
1886     {
1887       col = current_line_col;
1888       first_non_space = 0;
1889     }
1890   else if (col == -2)
1891     col = first_non_space;
1892   else
1893     first_non_space = 0;
1894
1895   /* Place the '^' at the right position.  */
1896   for (col += 2, ccol = 0; ccol < col; ccol++)
1897     {
1898       /* Compute \t when reaching first_non_space.  The base address is
              fetched again on every iteration because growing the object
              may move it to another chunk.  */
1899       const char *base = obstack_base (&temporary_obstack);
1900       char pad = (first_non_space && base [ccol] == '\t') ? '\t' : ' ';
1901
1902       obstack_1grow (&temporary_obstack, pad);
1903     }
1904   obstack_grow0 (&temporary_obstack, "^", 1);
1905
1906   fclose (fp);
1907   return obstack_finish (&temporary_obstack);
1908 #endif
1909 }
1910
1911 #ifndef JC1_LITE
/* Compare the LENGTH bytes starting at STR, read one character at a
   time with UTF8_GET, against the NUL-terminated string NAME.

   Return the difference between the first pair of characters that do
   not match.  If every character of NAME matched, return 0 when STR
   was consumed exactly and 1 when bytes of STR remain.  UTF8_GET is
   defined elsewhere; it is expected to advance STR and to stop at
   the limit.  */

static int
utf8_cmp (const unsigned char *str, int length, const char *name)
{
  const unsigned char *limit = str + length;
  const char *expected;

  for (expected = name; *expected != '\0'; expected++)
    {
      int ch = UTF8_GET (str, limit);

      if (ch != *expected)
        return ch - *expected;
    }

  if (str == limit)
    return 0;
  return 1;
}
1927
/* Table of C++ keywords, including the underscore-prefixed alternate
   spellings.  The entries are kept in ascending character order
   because cxx_keyword_p looks names up with a binary search; insert
   any new entry at its sorted position.  */

static const char *const cxx_keywords[] =
{
  /* Underscore-prefixed spellings.  */
  "_Complex",
  "__alignof", "__alignof__",
  "__asm", "__asm__",
  "__attribute", "__attribute__",
  "__builtin_va_arg",
  "__complex", "__complex__",
  "__const", "__const__",
  "__extension__",
  "__imag", "__imag__",
  "__inline", "__inline__",
  "__label__",
  "__null",
  "__real", "__real__",
  "__restrict", "__restrict__",
  "__signed", "__signed__",
  "__typeof", "__typeof__",
  "__volatile", "__volatile__",

  /* Plain keywords, one row per initial letter.  */
  "and", "and_eq", "asm", "auto",
  "bitand", "bitor", "bool", "break",
  "case", "catch", "char", "class", "compl", "const", "const_cast", "continue",
  "default", "delete", "do", "double", "dynamic_cast",
  "else", "enum", "explicit", "export", "extern",
  "false", "float", "for", "friend",
  "goto",
  "if", "inline", "int",
  "long",
  "mutable",
  "namespace", "new", "not", "not_eq",
  "operator", "or", "or_eq",
  "private", "protected", "public",
  "register", "reinterpret_cast", "return",
  "short", "signed", "sizeof", "static", "static_cast", "struct", "switch",
  "template", "this", "throw", "true", "try",
  "typedef", "typeid", "typename", "typeof",
  "union", "unsigned", "using",
  "virtual", "void", "volatile",
  "wchar_t", "while",
  "xor", "xor_eq"
};
2037
2038 /* Return true if NAME is a C++ keyword.  */
2039
2040 int
2041 cxx_keyword_p (const char *name, int length)
2042 {
2043   int last = ARRAY_SIZE (cxx_keywords);
2044   int first = 0;
2045   int mid = (last + first) / 2;
2046   int old = -1;
2047
2048   for (mid = (last + first) / 2;
2049        mid != old;
2050        old = mid, mid = (last + first) / 2)
2051     {
2052       int kwl = strlen (cxx_keywords[mid]);
2053       int min_length = kwl > length ? length : kwl;
2054       int r = utf8_cmp ((const unsigned char *) name, min_length, cxx_keywords[mid]);
2055
2056       if (r == 0)
2057         {
2058           int i;
2059           /* We've found a match if all the remaining characters are `$'.  */
2060           for (i = min_length; i < length && name[i] == '$'; ++i)
2061             ;
2062           if (i == length)
2063             return 1;
2064           r = 1;
2065         }
2066
2067       if (r < 0)
2068         last = mid;
2069       else
2070         first = mid;
2071     }
2072   return 0;
2073 }
2074 #endif /* JC1_LITE */