gcc/java/lex.c

   1 /* Language lexer for the GNU compiler for the Java(TM) language.
   2    Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003
   3    Free Software Foundation, Inc.
   4    Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GCC is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING.  If not, write to
  20 the Free Software Foundation, 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.
  22
  23 Java and all Java-based marks are trademarks or registered trademarks
  24 of Sun Microsystems, Inc. in the United States and other countries.
  25 The Free Software Foundation is independent of Sun Microsystems, Inc.  */
  26
  27 /* It defines java_lex (yylex) that reads a Java ASCII source file
  28    possibly containing Unicode escape sequence or utf8 encoded
  29    characters and returns a token for everything found but comments,
  30    white spaces and line terminators. When necessary, it also fills
  31    the java_lval (yylval) union. It's implemented to be called by a
  32    re-entrant parser generated by Bison.
  33
  34    The lexical analysis conforms to the Java grammar described in "The
  35    Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
  36    Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
  37
  38 #include "keyword.h"
  39 #include "flags.h"
  40 #include "chartables.h"
  41
  42 /* Function declarations.  */
  43 static char *java_sprint_unicode (struct java_line *, int);
  44 static void java_unicode_2_utf8 (unicode_t);
  45 static void java_lex_error (const char *, int);
  46 #ifndef JC1_LITE
  47 static int java_is_eol (FILE *, int);
  48 static tree build_wfl_node (tree);
  49 #endif
  50 static void java_store_unicode (struct java_line *, unicode_t, int);
  51 static int java_parse_escape_sequence (void);
  52 static int java_start_char_p (unicode_t);
  53 static int java_part_char_p (unicode_t);
  54 static int java_parse_doc_section (int);
  55 static void java_parse_end_comment (int);
  56 static int java_get_unicode (void);
  57 static int java_read_unicode (java_lexer *, int *);
  58 static int java_read_unicode_collapsing_terminators (java_lexer *, int *);
  59 static void java_store_unicode (struct java_line *, unicode_t, int);
  60 static int java_read_char (java_lexer *);
  61 static void java_allocate_new_line (void);
  62 static void java_unget_unicode (void);
  63 static unicode_t java_sneak_unicode (void);
  64 #ifndef JC1_LITE
  65 static int utf8_cmp (const unsigned char *, int, const char *);
  66 #endif
  67
  68 java_lexer *java_new_lexer (FILE *, const char *);
  69 #ifndef JC1_LITE
  70 static void error_if_numeric_overflow (tree);
  71 #endif
  72
  73 #ifdef HAVE_ICONV
  74 /* This is nonzero if we have initialized `need_byteswap'.  */
  75 static int byteswap_init = 0;
  76
  77 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
  78    big-endian order -- not native endian order.  We handle this by
  79    doing a conversion once at startup and seeing what happens.  This
  80    flag holds the results of this determination.  */
  81 static int need_byteswap = 0;
  82 #endif
  83
  84 void
  85 java_init_lex (FILE *finput, const char *encoding)
  86 {
  87 #ifndef JC1_LITE
  88   int java_lang_imported = 0;
  89
  90   if (!java_lang_id)
  91     java_lang_id = get_identifier ("java.lang");
  92   if (!inst_id)
  93     inst_id = get_identifier ("inst$");
  94   if (!wpv_id)
  95     wpv_id = get_identifier ("write_parm_value$");
  96
  97   if (!java_lang_imported)
  98     {
  99       tree node = build_tree_list
 100         (build_expr_wfl (java_lang_id, NULL, 0, 0), NULL_TREE);
 101       read_import_dir (TREE_PURPOSE (node));
 102       TREE_CHAIN (node) = ctxp->import_demand_list;
 103       ctxp->import_demand_list = node;
 104       java_lang_imported = 1;
 105     }
 106
 107   if (!wfl_operator)
 108     wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
 109   if (!label_id)
 110     label_id = get_identifier ("$L");
 111   if (!wfl_append)
 112     wfl_append = build_expr_wfl (get_identifier ("append"), NULL, 0, 0);
 113   if (!wfl_string_buffer)
 114     wfl_string_buffer =
 115       build_expr_wfl (get_identifier (flag_emit_class_files
 116                                       ? "java.lang.StringBuffer"
 117                                       : "gnu.gcj.runtime.StringBuffer"),
 118                       NULL, 0, 0);
 119   if (!wfl_to_string)
 120     wfl_to_string = build_expr_wfl (get_identifier ("toString"), NULL, 0, 0);
 121
 122   CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
 123     CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
 124
 125   memset (ctxp->modifier_ctx, 0, sizeof (ctxp->modifier_ctx));
 126   current_jcf = ggc_alloc_cleared (sizeof (JCF));
 127   ctxp->current_parsed_class = NULL;
 128   ctxp->package = NULL_TREE;
 129 #endif
 130
 131   ctxp->filename = input_filename;
 132   ctxp->lineno = lineno = 0;
 133   ctxp->p_line = NULL;
 134   ctxp->c_line = NULL;
 135   ctxp->java_error_flag = 0;
 136   ctxp->lexer = java_new_lexer (finput, encoding);
 137 }
 138
 139 static char *
 140 java_sprint_unicode (struct java_line *line, int i)
 141 {
 142   static char buffer [10];
 143   if (line->unicode_escape_p [i] || line->line [i] > 128)
 144     sprintf (buffer, "\\u%04x", line->line [i]);
 145   else
 146     {
 147       buffer [0] = line->line [i];
 148       buffer [1] = '\0';
 149     }
 150   return buffer;
 151 }
 152
 153 static unicode_t
 154 java_sneak_unicode (void)
 155 {
 156   return (ctxp->c_line->line [ctxp->c_line->current]);
 157 }
 158
 159 static void
 160 java_unget_unicode (void)
 161 {
 162   if (!ctxp->c_line->current)
 163     /* Can't unget unicode.  */
 164     abort ();
 165
 166   ctxp->c_line->current--;
 167   ctxp->c_line->char_col -= JAVA_COLUMN_DELTA (0);
 168 }
 169
 170 static void
 171 java_allocate_new_line (void)
 172 {
 173   unicode_t ahead = (ctxp->c_line ? ctxp->c_line->ahead[0] : '\0');
 174   char ahead_escape_p = (ctxp->c_line ?
 175                          ctxp->c_line->unicode_escape_ahead_p : 0);
 176
 177   if (ctxp->c_line && !ctxp->c_line->white_space_only)
 178     {
 179       if (ctxp->p_line)
 180         {
 181           free (ctxp->p_line->unicode_escape_p);
 182           free (ctxp->p_line->line);
 183           free (ctxp->p_line);
 184         }
 185       ctxp->p_line = ctxp->c_line;
 186       ctxp->c_line = NULL;              /* Reallocated.  */
 187     }
 188
 189   if (!ctxp->c_line)
 190     {
 191       ctxp->c_line = xmalloc (sizeof (struct java_line));
 192       ctxp->c_line->max = JAVA_LINE_MAX;
 193       ctxp->c_line->line = xmalloc (sizeof (unicode_t)*ctxp->c_line->max);
 194       ctxp->c_line->unicode_escape_p =
 195         xmalloc (sizeof (char)*ctxp->c_line->max);
 196       ctxp->c_line->white_space_only = 0;
 197     }
 198
 199   ctxp->c_line->line [0] = ctxp->c_line->size = 0;
 200   ctxp->c_line->char_col = ctxp->c_line->current = 0;
 201   if (ahead)
 202     {
 203       ctxp->c_line->line [ctxp->c_line->size] = ahead;
 204       ctxp->c_line->unicode_escape_p [ctxp->c_line->size] = ahead_escape_p;
 205       ctxp->c_line->size++;
 206     }
 207   ctxp->c_line->ahead [0] = 0;
 208   ctxp->c_line->unicode_escape_ahead_p = 0;
 209   ctxp->c_line->lineno = ++lineno;
 210   ctxp->c_line->white_space_only = 1;
 211 }
 212
 213 /* Create a new lexer object.  */
 214
 215 java_lexer *
 216 java_new_lexer (FILE *finput, const char *encoding)
 217 {
 218   java_lexer *lex = xmalloc (sizeof (java_lexer));
 219   int enc_error = 0;
 220
 221   lex->finput = finput;
 222   lex->bs_count = 0;
 223   lex->unget_value = 0;
 224   lex->hit_eof = 0;
 225
 226 #ifdef HAVE_ICONV
 227   lex->handle = iconv_open ("UCS-2", encoding);
 228   if (lex->handle != (iconv_t) -1)
 229     {
 230       lex->first = -1;
 231       lex->last = -1;
 232       lex->out_first = -1;
 233       lex->out_last = -1;
 234       lex->read_anything = 0;
 235       lex->use_fallback = 0;
 236
 237       /* Work around broken iconv() implementations by doing checking at
 238          runtime.  We assume that if the UTF-8 => UCS-2 encoder is broken,
 239          then all UCS-2 encoders will be broken.  Perhaps not a valid
 240          assumption.  */
 241       if (! byteswap_init)
 242         {
 243           iconv_t handle;
 244
 245           byteswap_init = 1;
 246
 247           handle = iconv_open ("UCS-2", "UTF-8");
 248           if (handle != (iconv_t) -1)
 249             {
 250               unicode_t result;
 251               unsigned char in[3];
 252               char *inp, *outp;
 253               size_t inc, outc, r;
 254
 255               /* This is the UTF-8 encoding of \ufeff.  */
 256               in[0] = 0xef;
 257               in[1] = 0xbb;
 258               in[2] = 0xbf;
 259
 260               inp = in;
 261               inc = 3;
 262               outp = (char *) &result;
 263               outc = 2;
 264
 265               r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
 266                          &outp, &outc);
 267               iconv_close (handle);
 268               /* Conversion must be complete for us to use the result.  */
 269               if (r != (size_t) -1 && inc == 0 && outc == 0)
 270                 need_byteswap = (result != 0xfeff);
 271             }
 272         }
 273
 274       lex->byte_swap = need_byteswap;
 275     }
 276   else
 277 #endif /* HAVE_ICONV */
 278     {
 279       /* If iconv failed, use the internal decoder if the default
 280          encoding was requested.  This code is used on platforms where
 281          iconv exists but is insufficient for our needs.  For
 282          instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2.
 283
 284          On Solaris the default encoding, as returned by nl_langinfo(),
 285          is `646' (aka ASCII), but the Solaris iconv_open() doesn't
 286          understand that.  We work around that by pretending
 287          `646' to be the same as UTF-8.   */
 288       if (strcmp (encoding, DEFAULT_ENCODING) && strcmp (encoding, "646"))
 289         enc_error = 1;
 290 #ifdef HAVE_ICONV
 291       else
 292         lex->use_fallback = 1;
 293 #endif /* HAVE_ICONV */
 294     }
 295
 296   if (enc_error)
 297     fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation.  If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option", encoding);
 298
 299   return lex;
 300 }
 301
 302 void
 303 java_destroy_lexer (java_lexer *lex)
 304 {
 305 #ifdef HAVE_ICONV
 306   if (! lex->use_fallback)
 307     iconv_close (lex->handle);
 308 #endif
 309   free (lex);
 310 }
 311
 312 static int
 313 java_read_char (java_lexer *lex)
 314 {
 315   if (lex->unget_value)
 316     {
 317       unicode_t r = lex->unget_value;
 318       lex->unget_value = 0;
 319       return r;
 320     }
 321
 322 #ifdef HAVE_ICONV
 323   if (! lex->use_fallback)
 324     {
 325       size_t ir, inbytesleft, in_save, out_count, out_save;
 326       char *inp, *outp;
 327       unicode_t result;
 328
 329       /* If there is data which has already been converted, use it.  */
 330       if (lex->out_first == -1 || lex->out_first >= lex->out_last)
 331         {
 332           lex->out_first = 0;
 333           lex->out_last = 0;
 334
 335           while (1)
 336             {
 337               /* See if we need to read more data.  If FIRST == 0 then
 338                  the previous conversion attempt ended in the middle of
 339                  a character at the end of the buffer.  Otherwise we
 340                  only have to read if the buffer is empty.  */
 341               if (lex->first == 0 || lex->first >= lex->last)
 342                 {
 343                   int r;
 344
 345                   if (lex->first >= lex->last)
 346                     {
 347                       lex->first = 0;
 348                       lex->last = 0;
 349                     }
 350                   if (feof (lex->finput))
 351                     return UEOF;
 352                   r = fread (&lex->buffer[lex->last], 1,
 353                              sizeof (lex->buffer) - lex->last,
 354                              lex->finput);
 355                   lex->last += r;
 356                 }
 357
 358               inbytesleft = lex->last - lex->first;
 359               out_count = sizeof (lex->out_buffer) - lex->out_last;
 360
 361               if (inbytesleft == 0)
 362                 {
 363                   /* We've tried to read and there is nothing left.  */
 364                   return UEOF;
 365                 }
 366
 367               in_save = inbytesleft;
 368               out_save = out_count;
 369               inp = &lex->buffer[lex->first];
 370               outp = &lex->out_buffer[lex->out_last];
 371               ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
 372                           &inbytesleft, &outp, &out_count);
 373
 374               /* If we haven't read any bytes, then look to see if we
 375                  have read a BOM.  */
 376               if (! lex->read_anything && out_save - out_count >= 2)
 377                 {
 378                   unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
 379                   if (uc == 0xfeff)
 380                     {
 381                       lex->byte_swap = 0;
 382                       lex->out_first += 2;
 383                     }
 384                   else if (uc == 0xfffe)
 385                     {
 386                       lex->byte_swap = 1;
 387                       lex->out_first += 2;
 388                     }
 389                   lex->read_anything = 1;
 390                 }
 391
 392               if (lex->byte_swap)
 393                 {
 394                   unsigned int i;
 395                   for (i = 0; i < out_save - out_count; i += 2)
 396                     {
 397                       char t = lex->out_buffer[lex->out_last + i];
 398                       lex->out_buffer[lex->out_last + i]
 399                         = lex->out_buffer[lex->out_last + i + 1];
 400                       lex->out_buffer[lex->out_last + i + 1] = t;
 401                     }
 402                 }
 403
 404               lex->first += in_save - inbytesleft;
 405               lex->out_last += out_save - out_count;
 406
 407               /* If we converted anything at all, move along.  */
 408               if (out_count != out_save)
 409                 break;
 410
 411               if (ir == (size_t) -1)
 412                 {
 413                   if (errno == EINVAL)
 414                     {
 415                       /* This is ok.  This means that the end of our buffer
 416                          is in the middle of a character sequence.  We just
 417                          move the valid part of the buffer to the beginning
 418                          to force a read.  */
 419                       memmove (&lex->buffer[0], &lex->buffer[lex->first],
 420                                lex->last - lex->first);
 421                       lex->last -= lex->first;
 422                       lex->first = 0;
 423                     }
 424                   else
 425                     {
 426                       /* A more serious error.  */
 427                       java_lex_error ("unrecognized character in input stream",
 428                                       0);
 429                       return UEOF;
 430                     }
 431                 }
 432             }
 433         }
 434
 435       if (lex->out_first == -1 || lex->out_first >= lex->out_last)
 436         {
 437           /* Don't have any data.  */
 438           return UEOF;
 439         }
 440
 441       /* Success.  */
 442       result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
 443       lex->out_first += 2;
 444       return result;
 445     }
 446   else
 447 #endif /* HAVE_ICONV */
 448     {
 449       int c, c1, c2;
 450       c = getc (lex->finput);
 451
 452       if (c == EOF)
 453         return UEOF;
 454       if (c < 128)
 455         return (unicode_t) c;
 456       else
 457         {
 458           if ((c & 0xe0) == 0xc0)
 459             {
 460               c1 = getc (lex->finput);
 461               if ((c1 & 0xc0) == 0x80)
 462                 {
 463                   unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
 464                   /* Check for valid 2-byte characters.  We explicitly
 465                      allow \0 because this encoding is common in the
 466                      Java world.  */
 467                   if (r == 0 || (r >= 0x80 && r <= 0x7ff))
 468                     return r;
 469                 }
 470             }
 471           else if ((c & 0xf0) == 0xe0)
 472             {
 473               c1 = getc (lex->finput);
 474               if ((c1 & 0xc0) == 0x80)
 475                 {
 476                   c2 = getc (lex->finput);
 477                   if ((c2 & 0xc0) == 0x80)
 478                     {
 479                       unicode_t r =  (unicode_t)(((c & 0xf) << 12) +
 480                                                  (( c1 & 0x3f) << 6)
 481                                                  + (c2 & 0x3f));
 482                       /* Check for valid 3-byte characters.
 483                          Don't allow surrogate, \ufffe or \uffff.  */
 484                       if (IN_RANGE (r, 0x800, 0xffff)
 485                           && ! IN_RANGE (r, 0xd800, 0xdfff)
 486                           && r != 0xfffe && r != 0xffff)
 487                         return r;
 488                     }
 489                 }
 490             }
 491
 492           /* We simply don't support invalid characters.  We also
 493              don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
 494              cannot be valid Java characters.  */
 495           java_lex_error ("malformed UTF-8 character", 0);
 496         }
 497     }
 498
 499   /* We only get here on error.  */
 500   return UEOF;
 501 }
 502
 503 static void
 504 java_store_unicode (struct java_line *l, unicode_t c, int unicode_escape_p)
 505 {
 506   if (l->size == l->max)
 507     {
 508       l->max += JAVA_LINE_MAX;
 509       l->line = xrealloc (l->line, sizeof (unicode_t)*l->max);
 510       l->unicode_escape_p = xrealloc (l->unicode_escape_p,
 511                                       sizeof (char)*l->max);
 512     }
 513   l->line [l->size] = c;
 514   l->unicode_escape_p [l->size++] = unicode_escape_p;
 515 }
 516
 517 static int
 518 java_read_unicode (java_lexer *lex, int *unicode_escape_p)
 519 {
 520   int c;
 521
 522   c = java_read_char (lex);
 523   *unicode_escape_p = 0;
 524
 525   if (c != '\\')
 526     {
 527       lex->bs_count = 0;
 528       return c;
 529     }
 530
 531   ++lex->bs_count;
 532   if ((lex->bs_count) % 2 == 1)
 533     {
 534       /* Odd number of \ seen.  */
 535       c = java_read_char (lex);
 536       if (c == 'u')
 537         {
 538           unicode_t unicode = 0;
 539           int shift = 12;
 540
 541           /* Recognize any number of `u's in \u.  */
 542           while ((c = java_read_char (lex)) == 'u')
 543             ;
 544
 545           shift = 12;
 546           do
 547             {
 548               if (c == UEOF)
 549                 {
 550                   java_lex_error ("prematurely terminated \\u sequence", 0);
 551                   return UEOF;
 552                 }
 553
 554               if (hex_p (c))
 555                 unicode |= (unicode_t)(hex_value (c) << shift);
 556               else
 557                 {
 558                   java_lex_error ("non-hex digit in \\u sequence", 0);
 559                   break;
 560                 }
 561
 562               c = java_read_char (lex);
 563               shift -= 4;
 564             }
 565           while (shift >= 0);
 566
 567           if (c != UEOF)
 568             lex->unget_value = c;
 569
 570           lex->bs_count = 0;
 571           *unicode_escape_p = 1;
 572           return unicode;
 573         }
 574       lex->unget_value = c;
 575     }
 576   return (unicode_t) '\\';
 577 }
 578
 579 static int
 580 java_read_unicode_collapsing_terminators (java_lexer *lex,
 581                                           int *unicode_escape_p)
 582 {
 583   int c = java_read_unicode (lex, unicode_escape_p);
 584
 585   if (c == '\r')
 586     {
 587       /* We have to read ahead to see if we got \r\n.  In that case we
 588          return a single line terminator.  */
 589       int dummy;
 590       c = java_read_unicode (lex, &dummy);
 591       if (c != '\n' && c != UEOF)
 592         lex->unget_value = c;
 593       /* In either case we must return a newline.  */
 594       c = '\n';
 595     }
 596
 597   return c;
 598 }
 599
 600 static int
 601 java_get_unicode (void)
 602 {
 603   /* It's time to read a line when...  */
 604   if (!ctxp->c_line || ctxp->c_line->current == ctxp->c_line->size)
 605     {
 606       int c;
 607       int found_chars = 0;
 608
 609       if (ctxp->lexer->hit_eof)
 610         return UEOF;
 611
 612       java_allocate_new_line ();
 613       if (ctxp->c_line->line[0] != '\n')
 614         {
 615           for (;;)
 616             {
 617               int unicode_escape_p;
 618               c = java_read_unicode_collapsing_terminators (ctxp->lexer,
 619                                                             &unicode_escape_p);
 620               if (c != UEOF)
 621                 {
 622                   found_chars = 1;
 623                   java_store_unicode (ctxp->c_line, c, unicode_escape_p);
 624                   if (ctxp->c_line->white_space_only
 625                       && !JAVA_WHITE_SPACE_P (c)
 626                       && c != '\n')
 627                     ctxp->c_line->white_space_only = 0;
 628                 }
 629               if ((c == '\n') || (c == UEOF))
 630                 break;
 631             }
 632
 633           if (c == UEOF && ! found_chars)
 634             {
 635               ctxp->lexer->hit_eof = 1;
 636               return UEOF;
 637             }
 638         }
 639     }
 640   ctxp->c_line->char_col += JAVA_COLUMN_DELTA (0);
 641   JAVA_LEX_CHAR (ctxp->c_line->line [ctxp->c_line->current]);
 642   return ctxp->c_line->line [ctxp->c_line->current++];
 643 }
 644
 645 /* Parse the end of a C style comment.
 646  * C is the first character following the '/' and '*'.  */
 647 static void
 648 java_parse_end_comment (int c)
 649 {
 650   for ( ;; c = java_get_unicode ())
 651     {
 652       switch (c)
 653         {
 654         case UEOF:
 655           java_lex_error ("Comment not terminated at end of input", 0);
 656           return;
 657         case '*':
 658           switch (c = java_get_unicode ())
 659             {
 660             case UEOF:
 661               java_lex_error ("Comment not terminated at end of input", 0);
 662               return;
 663             case '/':
 664               return;
 665             case '*':   /* Reparse only '*'.  */
 666               java_unget_unicode ();
 667             }
 668         }
 669     }
 670 }
 671
 672 /* Parse the documentation section. Keywords must be at the beginning
 673    of a documentation comment line (ignoring white space and any `*'
 674    character). Parsed keyword(s): @DEPRECATED.  */
 675
 676 static int
 677 java_parse_doc_section (int c)
 678 {
 679   int valid_tag = 0, seen_star = 0;
 680
 681   while (JAVA_WHITE_SPACE_P (c) || (c == '*') || c == '\n')
 682     {
 683       switch (c)
 684         {
 685         case '*':
 686           seen_star = 1;
 687           break;
 688         case '\n': /* ULT */
 689           valid_tag = 1;
 690         default:
 691           seen_star = 0;
 692         }
 693       c = java_get_unicode();
 694     }
 695
 696   if (c == UEOF)
 697     java_lex_error ("Comment not terminated at end of input", 0);
 698
 699   if (seen_star && (c == '/'))
 700     return 1;                   /* Goto step1 in caller.  */
 701
 702   /* We're parsing `@deprecated'.  */
 703   if (valid_tag && (c == '@'))
 704     {
 705       char tag [11];
 706       int  tag_index = 0;
 707
 708       while (tag_index < 10 && c != UEOF && c != ' ' && c != '\n')
 709         {
 710           c = java_get_unicode ();
 711           tag [tag_index++] = c;
 712         }
 713
 714       if (c == UEOF)
 715         java_lex_error ("Comment not terminated at end of input", 0);
 716       tag [tag_index] = '\0';
 717
 718       if (!strcmp (tag, "deprecated"))
 719         ctxp->deprecated = 1;
 720     }
 721   java_unget_unicode ();
 722   return 0;
 723 }
 724
 725 /* Return true if C is a valid start character for a Java identifier.
 726    This is only called if C >= 128 -- smaller values are handled
 727    inline.  However, this function handles all values anyway.  */
 728 static int
 729 java_start_char_p (unicode_t c)
 730 {
 731   unsigned int hi = c / 256;
 732   const char *const page = type_table[hi];
 733   unsigned long val = (unsigned long) page;
 734   int flags;
 735
 736   if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
 737     flags = page[c & 255];
 738   else
 739     flags = val;
 740
 741   return flags & LETTER_START;
 742 }
 743
 744 /* Return true if C is a valid part character for a Java identifier.
 745    This is only called if C >= 128 -- smaller values are handled
 746    inline.  However, this function handles all values anyway.  */
 747 static int
 748 java_part_char_p (unicode_t c)
 749 {
 750   unsigned int hi = c / 256;
 751   const char *const page = type_table[hi];
 752   unsigned long val = (unsigned long) page;
 753   int flags;
 754
 755   if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
 756     flags = page[c & 255];
 757   else
 758     flags = val;
 759
 760   return flags & LETTER_PART;
 761 }
 762
 763 static int
 764 java_parse_escape_sequence (void)
 765 {
 766   unicode_t char_lit;
 767   int c;
 768
 769   switch (c = java_get_unicode ())
 770     {
 771     case 'b':
 772       return (unicode_t)0x8;
 773     case 't':
 774       return (unicode_t)0x9;
 775     case 'n':
 776       return (unicode_t)0xa;
 777     case 'f':
 778       return (unicode_t)0xc;
 779     case 'r':
 780       return (unicode_t)0xd;
 781     case '"':
 782       return (unicode_t)0x22;
 783     case '\'':
 784       return (unicode_t)0x27;
 785     case '\\':
 786       return (unicode_t)0x5c;
 787     case '0': case '1': case '2': case '3': case '4':
 788     case '5': case '6': case '7':
 789       {
 790         int octal_escape[3];
 791         int octal_escape_index = 0;
 792         int max = 3;
 793         int i, shift;
 794
 795         for (; octal_escape_index < max && RANGE (c, '0', '7');
 796              c = java_get_unicode ())
 797           {
 798             if (octal_escape_index == 0 && c > '3')
 799               {
 800                 /* According to the grammar, `\477' has a well-defined
 801                    meaning -- it is `\47' followed by `7'.  */
 802                 --max;
 803               }
 804             octal_escape [octal_escape_index++] = c;
 805           }
 806
 807         java_unget_unicode ();
 808
 809         for (char_lit=0, i = 0, shift = 3*(octal_escape_index-1);
 810              i < octal_escape_index; i++, shift -= 3)
 811           char_lit |= (octal_escape [i] - '0') << shift;
 812
 813         return char_lit;
 814       }
 815     default:
 816       java_lex_error ("Invalid character in escape sequence", 0);
 817       return JAVA_CHAR_ERROR;
 818     }
 819 }
 820
 821 #ifndef JC1_LITE
 822 #define IS_ZERO(X) REAL_VALUES_EQUAL (X, dconst0)
 823
 824 /* Subroutine of java_lex: converts floating-point literals to tree
 825    nodes.  LITERAL_TOKEN is the input literal, JAVA_LVAL is where to
 826    store the result.  FFLAG indicates whether the literal was tagged
 827    with an 'f', indicating it is of type 'float'; NUMBER_BEGINNING
 828    is the line number on which to report any error.  */
 829
 830 static void java_perform_atof (YYSTYPE *, char *, int, int);
 831
 832 static void
 833 java_perform_atof (YYSTYPE *java_lval, char *literal_token, int fflag,
 834                    int number_beginning)
 835 {
 836   REAL_VALUE_TYPE value;
 837   tree type = (fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
 838
 839   SET_REAL_VALUE_ATOF (value,
 840                        REAL_VALUE_ATOF (literal_token, TYPE_MODE (type)));
 841
 842   if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
 843     {
 844       JAVA_FLOAT_RANGE_ERROR (fflag ? "float" : "double");
 845       value = DCONST0;
 846     }
 847   else if (IS_ZERO (value))
 848     {
 849       /* We check to see if the value is really 0 or if we've found an
 850          underflow.  We do this in the most primitive imaginable way.  */
 851       int really_zero = 1;
 852       char *p = literal_token;
 853       if (*p == '-')
 854         ++p;
 855       while (*p && *p != 'e' && *p != 'E')
 856         {
 857           if (*p != '0' && *p != '.')
 858             {
 859               really_zero = 0;
 860               break;
 861             }
 862           ++p;
 863         }
 864       if (! really_zero)
 865         {
 866           int i = ctxp->c_line->current;
 867           ctxp->c_line->current = number_beginning;
 868           java_lex_error ("Floating point literal underflow", 0);
 869           ctxp->c_line->current = i;
 870         }
 871     }
 872
 873   SET_LVAL_NODE_TYPE (build_real (type, value), type);
 874 }
 875 #endif
 876
 877 static int yylex (YYSTYPE *);
 878
 879 static int
 880 #ifdef JC1_LITE
 881 yylex (YYSTYPE *java_lval)
 882 #else
 883 java_lex (YYSTYPE *java_lval)
 884 #endif
 885 {
 886   int c;
 887   unicode_t first_unicode;
 888   int ascii_index, all_ascii;
 889   char *string;
 890
 891   /* Translation of the Unicode escape in the raw stream of Unicode
 892      characters. Takes care of line terminator.  */
 893  step1:
 894   /* Skip white spaces: SP, TAB and FF or ULT.  */
 895   for (c = java_get_unicode ();
 896        c == '\n' || JAVA_WHITE_SPACE_P (c); c = java_get_unicode ())
 897     if (c == '\n')
 898       {
 899         ctxp->elc.line = ctxp->c_line->lineno;
 900         ctxp->elc.col  = ctxp->c_line->char_col-2;
 901       }
 902
 903   ctxp->elc.col = (ctxp->elc.col < 0 ? 0 : ctxp->elc.col);
 904
 905   if (c == 0x1a)                /* CTRL-Z.  */
 906     {
 907       if ((c = java_get_unicode ()) == UEOF)
 908         return 0;               /* Ok here.  */
 909       else
 910         java_unget_unicode ();  /* Caught later, at the end of the
 911                                    function.  */
 912     }
 913   /* Handle EOF here.  */
 914   if (c == UEOF)        /* Should probably do something here...  */
 915     return 0;
 916
 917   /* Take care of eventual comments.  */
 918   if (c == '/')
 919     {
 920       switch (c = java_get_unicode ())
 921         {
 922         case '/':
 923           for (;;)
 924             {
 925               c = java_get_unicode ();
 926               if (c == UEOF)
 927                 {
 928                   /* It is ok to end a `//' comment with EOF, unless
 929                      we're being pedantic.  */
 930                   if (pedantic)
 931                     java_lex_error ("Comment not terminated at end of input",
 932                                     0);
 933                   return 0;
 934                 }
 935               if (c == '\n')    /* ULT */
 936                 goto step1;
 937             }
 938           break;
 939
 940         case '*':
 941           if ((c = java_get_unicode ()) == '*')
 942             {
 943               if ((c = java_get_unicode ()) == '/')
 944                 goto step1;     /* Empty documentation comment.  */
 945               else if (java_parse_doc_section (c))
 946                 goto step1;
 947             }
 948
 949           java_parse_end_comment ((c = java_get_unicode ()));
 950           goto step1;
 951           break;
 952         default:
 953           java_unget_unicode ();
 954           c = '/';
 955           break;
 956         }
 957     }
 958
 959   ctxp->elc.line = ctxp->c_line->lineno;
 960   ctxp->elc.prev_col = ctxp->elc.col;
 961   ctxp->elc.col = ctxp->c_line->char_col - JAVA_COLUMN_DELTA (-1);
 962   if (ctxp->elc.col < 0)
 963     abort ();
 964
 965   /* Numeric literals.  */
 966   if (JAVA_ASCII_DIGIT (c) || (c == '.'))
 967     {
 968       /* This section of code is borrowed from gcc/c-lex.c.  */
 969 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
 970       int parts[TOTAL_PARTS];
 971       HOST_WIDE_INT high, low;
 972       /* End borrowed section.  */
 973       char literal_token [256];
 974       int  literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
 975       int  found_hex_digits = 0, found_non_octal_digits = 0;
 976       int  i;
 977 #ifndef JC1_LITE
 978       int  number_beginning = ctxp->c_line->current;
 979       tree value;
 980 #endif
 981
 982       /* We might have a . separator instead of a FP like .[0-9]*.  */
 983       if (c == '.')
 984         {
 985           unicode_t peep = java_sneak_unicode ();
 986
 987           if (!JAVA_ASCII_DIGIT (peep))
 988             {
 989               JAVA_LEX_SEP('.');
 990               BUILD_OPERATOR (DOT_TK);
 991             }
 992         }
 993
 994       for (i = 0; i < TOTAL_PARTS; i++)
 995         parts [i] = 0;
 996
 997       if (c == '0')
 998         {
 999           c = java_get_unicode ();
1000           if (c == 'x' || c == 'X')
1001             {
1002               radix = 16;
1003               c = java_get_unicode ();
1004             }
1005           else if (JAVA_ASCII_DIGIT (c))
1006             radix = 8;
1007           else if (c == '.' || c == 'e' || c =='E')
1008             {
1009               /* Push the '.', 'e', or 'E' back and prepare for a FP
1010                  parsing...  */
1011               java_unget_unicode ();
1012               c = '0';
1013             }
1014           else
1015             {
1016               /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}.  */
1017               JAVA_LEX_LIT ("0", 10);
1018               switch (c)
1019                 {
1020                 case 'L': case 'l':
1021                   SET_LVAL_NODE (long_zero_node);
1022                   return (INT_LIT_TK);
1023                 case 'f': case 'F':
1024                   SET_LVAL_NODE (float_zero_node);
1025                   return (FP_LIT_TK);
1026                 case 'd': case 'D':
1027                   SET_LVAL_NODE (double_zero_node);
1028                   return (FP_LIT_TK);
1029                 default:
1030                   java_unget_unicode ();
1031                   SET_LVAL_NODE (integer_zero_node);
1032                   return (INT_LIT_TK);
1033                 }
1034             }
1035         }
1036       /* Parse the first part of the literal, until we find something
1037          which is not a number.  */
1038       while ((radix == 16 && JAVA_ASCII_HEXDIGIT (c)) ||
1039              JAVA_ASCII_DIGIT (c))
1040         {
1041           /* We store in a string (in case it turns out to be a FP) and in
1042              PARTS if we have to process a integer literal.  */
1043           int numeric = hex_value (c);
1044           int count;
1045
1046           /* Remember when we find a valid hexadecimal digit.  */
1047           if (radix == 16)
1048             found_hex_digits = 1;
1049           /* Remember when we find an invalid octal digit.  */
1050           else if (radix == 8 && !JAVA_ASCII_OCTDIGIT (c))
1051             found_non_octal_digits = 1;
1052
1053           literal_token [literal_index++] = c;
1054           /* This section of code if borrowed from gcc/c-lex.c.  */
1055           for (count = 0; count < TOTAL_PARTS; count++)
1056             {
1057               parts[count] *= radix;
1058               if (count)
1059                 {
1060                   parts[count]   += (parts[count-1] >> HOST_BITS_PER_CHAR);
1061                   parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1062                 }
1063               else
1064                 parts[0] += numeric;
1065             }
1066           if (parts [TOTAL_PARTS-1] != 0)
1067             overflow = 1;
1068           /* End borrowed section.  */
1069           c = java_get_unicode ();
1070         }
1071
1072       /* If we have something from the FP char set but not a digit, parse
1073          a FP literal.  */
1074       if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1075         {
1076           int stage = 0;
1077           int seen_digit = (literal_index ? 1 : 0);
1078           int seen_exponent = 0;
1079           int fflag = 0;        /* 1 for {f,F}, 0 for {d,D}. FP literal are
1080                                    double unless specified.  */
1081
1082           /* It is ok if the radix is 8 because this just means we've
1083              seen a leading `0'.  However, radix==16 is invalid.  */
1084           if (radix == 16)
1085             java_lex_error ("Can't express non-decimal FP literal", 0);
1086           radix = 10;
1087
1088           for (;;)
1089             {
1090               if (c == '.')
1091                 {
1092                   if (stage < 1)
1093                     {
1094                       stage = 1;
1095                       literal_token [literal_index++ ] = c;
1096                       c = java_get_unicode ();
1097                     }
1098                   else
1099                     java_lex_error ("Invalid character in FP literal", 0);
1100                 }
1101
1102               if (c == 'e' || c == 'E')
1103                 {
1104                   if (stage < 2)
1105                     {
1106                       /* {E,e} must have seen at least a digit.  */
1107                       if (!seen_digit)
1108                         java_lex_error
1109                           ("Invalid FP literal, mantissa must have digit", 0);
1110                       seen_digit = 0;
1111                       seen_exponent = 1;
1112                       stage = 2;
1113                       literal_token [literal_index++] = c;
1114                       c = java_get_unicode ();
1115                     }
1116                   else
1117                     java_lex_error ("Invalid character in FP literal", 0);
1118                 }
1119               if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1120                 {
1121                   fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1122                   stage = 4;    /* So we fall through.  */
1123                 }
1124
1125               if ((c=='-' || c =='+') && stage == 2)
1126                 {
1127                   stage = 3;
1128                   literal_token [literal_index++] = c;
1129                   c = java_get_unicode ();
1130                 }
1131
1132               if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
1133                   (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
1134                   (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
1135                   (stage == 3 && JAVA_ASCII_DIGIT (c)))
1136                 {
1137                   if (JAVA_ASCII_DIGIT (c))
1138                     seen_digit = 1;
1139                   if (stage == 2)
1140                     stage = 3;
1141                   literal_token [literal_index++ ] = c;
1142                   c = java_get_unicode ();
1143                 }
1144               else
1145                 {
1146                   if (stage != 4) /* Don't push back fF/dD.  */
1147                     java_unget_unicode ();
1148
1149                   /* An exponent (if any) must have seen a digit.  */
1150                   if (seen_exponent && !seen_digit)
1151                     java_lex_error
1152                       ("Invalid FP literal, exponent must have digit", 0);
1153
1154                   literal_token [literal_index] = '\0';
1155                   JAVA_LEX_LIT (literal_token, radix);
1156
1157 #ifndef JC1_LITE
1158                   java_perform_atof (java_lval, literal_token,
1159                                      fflag, number_beginning);
1160 #endif
1161                   return FP_LIT_TK;
1162                 }
1163             }
1164         } /* JAVA_ASCII_FPCHAR (c) */
1165
1166       /* Here we get back to converting the integral literal.  */
1167       if (radix == 16 && ! found_hex_digits)
1168         java_lex_error
1169           ("0x must be followed by at least one hexadecimal digit", 0);
1170       else if (radix == 8 && found_non_octal_digits)
1171         java_lex_error ("Octal literal contains digit out of range", 0);
1172       else if (c == 'L' || c == 'l')
1173         long_suffix = 1;
1174       else
1175         java_unget_unicode ();
1176
1177 #ifdef JAVA_LEX_DEBUG
1178       literal_token [literal_index] = '\0'; /* So JAVA_LEX_LIT is safe.  */
1179       JAVA_LEX_LIT (literal_token, radix);
1180 #endif
1181       /* This section of code is borrowed from gcc/c-lex.c.  */
1182       if (!overflow)
1183         {
1184           bytes = GET_TYPE_PRECISION (long_type_node);
1185           for (i = bytes; i < TOTAL_PARTS; i++)
1186             if (parts [i])
1187               {
1188                 overflow = 1;
1189                 break;
1190               }
1191         }
1192       high = low = 0;
1193       for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1194         {
1195           high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1196                                               / HOST_BITS_PER_CHAR)]
1197                    << (i * HOST_BITS_PER_CHAR));
1198           low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1199         }
1200       /* End borrowed section.  */
1201
1202 #ifndef JC1_LITE
1203       /* Range checking.  */
1204       value = build_int_2 (low, high);
1205       /* Temporarily set type to unsigned.  */
1206       SET_LVAL_NODE_TYPE (value, (long_suffix
1207                                   ? unsigned_long_type_node
1208                                   : unsigned_int_type_node));
1209
1210       /* For base 10 numbers, only values up to the highest value
1211          (plus one) can be written.  For instance, only ints up to
1212          2147483648 can be written.  The special case of the largest
1213          negative value is handled elsewhere.  For other bases, any
1214          number can be represented.  */
1215       if (overflow || (radix == 10
1216                        && tree_int_cst_lt (long_suffix
1217                                            ? decimal_long_max
1218                                            : decimal_int_max,
1219                                            value)))
1220         {
1221           if (long_suffix)
1222             JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `long' literal");
1223           else
1224             JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `int' literal");
1225         }
1226
1227       /* Sign extend the value.  */
1228       SET_LVAL_NODE_TYPE (value, (long_suffix ? long_type_node : int_type_node));
1229       force_fit_type (value, 0);
1230       JAVA_RADIX10_FLAG (value) = radix == 10;
1231 #else
1232       SET_LVAL_NODE_TYPE (build_int_2 (low, high),
1233                           long_suffix ? long_type_node : int_type_node);
1234 #endif
1235       return INT_LIT_TK;
1236     }
1237
1238   /* Character literals.  */
1239   if (c == '\'')
1240     {
1241       int char_lit;
1242       if ((c = java_get_unicode ()) == '\\')
1243         char_lit = java_parse_escape_sequence ();
1244       else
1245         {
1246           if (c == '\n' || c == '\'')
1247             java_lex_error ("Invalid character literal", 0);
1248           char_lit = c;
1249         }
1250
1251       c = java_get_unicode ();
1252
1253       if ((c == '\n') || (c == UEOF))
1254         java_lex_error ("Character literal not terminated at end of line", 0);
1255       if (c != '\'')
1256         java_lex_error ("Syntax error in character literal", 0);
1257
1258       if (char_lit == JAVA_CHAR_ERROR)
1259         char_lit = 0;           /* We silently convert it to zero.  */
1260
1261       JAVA_LEX_CHAR_LIT (char_lit);
1262       SET_LVAL_NODE_TYPE (build_int_2 (char_lit, 0), char_type_node);
1263       return CHAR_LIT_TK;
1264     }
1265
1266   /* String literals.  */
1267   if (c == '"')
1268     {
1269       int no_error;
1270       char *string;
1271
1272       for (no_error = 1, c = java_get_unicode ();
1273            c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
1274         {
1275           if (c == '\\')
1276             c = java_parse_escape_sequence ();
1277           if (c == JAVA_CHAR_ERROR)
1278             {
1279               no_error = 0;
1280               c = 0;            /* We silently convert it to zero.  */
1281             }
1282           java_unicode_2_utf8 (c);
1283         }
1284       if (c == '\n' || c == UEOF) /* ULT.  */
1285         {
1286           lineno--;     /* Refer to the line where the terminator was seen.  */
1287           java_lex_error ("String not terminated at end of line", 0);
1288           lineno++;
1289         }
1290
1291       obstack_1grow (&temporary_obstack, '\0');
1292       string = obstack_finish (&temporary_obstack);
1293 #ifndef JC1_LITE
1294       if (!no_error || (c != '"'))
1295         java_lval->node = error_mark_node; /* FIXME: Requires futher
1296                                               testing.  */
1297       else
1298         java_lval->node = build_string (strlen (string), string);
1299 #endif
1300       obstack_free (&temporary_obstack, string);
1301       return STRING_LIT_TK;
1302     }
1303
1304   /* Separator.  */
1305   switch (c)
1306     {
1307     case '(':
1308       JAVA_LEX_SEP (c);
1309       BUILD_OPERATOR (OP_TK);
1310     case ')':
1311       JAVA_LEX_SEP (c);
1312       return CP_TK;
1313     case '{':
1314       JAVA_LEX_SEP (c);
1315       if (ctxp->ccb_indent == 1)
1316         ctxp->first_ccb_indent1 = lineno;
1317       ctxp->ccb_indent++;
1318       BUILD_OPERATOR (OCB_TK);
1319     case '}':
1320       JAVA_LEX_SEP (c);
1321       ctxp->ccb_indent--;
1322       if (ctxp->ccb_indent == 1)
1323         ctxp->last_ccb_indent1 = lineno;
1324       BUILD_OPERATOR (CCB_TK);
1325     case '[':
1326       JAVA_LEX_SEP (c);
1327       BUILD_OPERATOR (OSB_TK);
1328     case ']':
1329       JAVA_LEX_SEP (c);
1330       return CSB_TK;
1331     case ';':
1332       JAVA_LEX_SEP (c);
1333       return SC_TK;
1334     case ',':
1335       JAVA_LEX_SEP (c);
1336       return C_TK;
1337     case '.':
1338       JAVA_LEX_SEP (c);
1339       BUILD_OPERATOR (DOT_TK);
1340       /*      return DOT_TK; */
1341     }
1342
1343   /* Operators.  */
1344   switch (c)
1345     {
1346     case '=':
1347       if ((c = java_get_unicode ()) == '=')
1348         {
1349           BUILD_OPERATOR (EQ_TK);
1350         }
1351       else
1352         {
1353           /* Equals is used in two different locations. In the
1354              variable_declarator: rule, it has to be seen as '=' as opposed
1355              to being seen as an ordinary assignment operator in
1356              assignment_operators: rule.  */
1357           java_unget_unicode ();
1358           BUILD_OPERATOR (ASSIGN_TK);
1359         }
1360
1361     case '>':
1362       switch ((c = java_get_unicode ()))
1363         {
1364         case '=':
1365           BUILD_OPERATOR (GTE_TK);
1366         case '>':
1367           switch ((c = java_get_unicode ()))
1368             {
1369             case '>':
1370               if ((c = java_get_unicode ()) == '=')
1371                 {
1372                   BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1373                 }
1374               else
1375                 {
1376                   java_unget_unicode ();
1377                   BUILD_OPERATOR (ZRS_TK);
1378                 }
1379             case '=':
1380               BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1381             default:
1382               java_unget_unicode ();
1383               BUILD_OPERATOR (SRS_TK);
1384             }
1385         default:
1386           java_unget_unicode ();
1387           BUILD_OPERATOR (GT_TK);
1388         }
1389
1390     case '<':
1391       switch ((c = java_get_unicode ()))
1392         {
1393         case '=':
1394           BUILD_OPERATOR (LTE_TK);
1395         case '<':
1396           if ((c = java_get_unicode ()) == '=')
1397             {
1398               BUILD_OPERATOR2 (LS_ASSIGN_TK);
1399             }
1400           else
1401             {
1402               java_unget_unicode ();
1403               BUILD_OPERATOR (LS_TK);
1404             }
1405         default:
1406           java_unget_unicode ();
1407           BUILD_OPERATOR (LT_TK);
1408         }
1409
1410     case '&':
1411       switch ((c = java_get_unicode ()))
1412         {
1413         case '&':
1414           BUILD_OPERATOR (BOOL_AND_TK);
1415         case '=':
1416           BUILD_OPERATOR2 (AND_ASSIGN_TK);
1417         default:
1418           java_unget_unicode ();
1419           BUILD_OPERATOR (AND_TK);
1420         }
1421
1422     case '|':
1423       switch ((c = java_get_unicode ()))
1424         {
1425         case '|':
1426           BUILD_OPERATOR (BOOL_OR_TK);
1427         case '=':
1428           BUILD_OPERATOR2 (OR_ASSIGN_TK);
1429         default:
1430           java_unget_unicode ();
1431           BUILD_OPERATOR (OR_TK);
1432         }
1433
1434     case '+':
1435       switch ((c = java_get_unicode ()))
1436         {
1437         case '+':
1438           BUILD_OPERATOR (INCR_TK);
1439         case '=':
1440           BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1441         default:
1442           java_unget_unicode ();
1443           BUILD_OPERATOR (PLUS_TK);
1444         }
1445
1446     case '-':
1447       switch ((c = java_get_unicode ()))
1448         {
1449         case '-':
1450           BUILD_OPERATOR (DECR_TK);
1451         case '=':
1452           BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1453         default:
1454           java_unget_unicode ();
1455           BUILD_OPERATOR (MINUS_TK);
1456         }
1457
1458     case '*':
1459       if ((c = java_get_unicode ()) == '=')
1460         {
1461           BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1462         }
1463       else
1464         {
1465           java_unget_unicode ();
1466           BUILD_OPERATOR (MULT_TK);
1467         }
1468
1469     case '/':
1470       if ((c = java_get_unicode ()) == '=')
1471         {
1472           BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1473         }
1474       else
1475         {
1476           java_unget_unicode ();
1477           BUILD_OPERATOR (DIV_TK);
1478         }
1479
1480     case '^':
1481       if ((c = java_get_unicode ()) == '=')
1482         {
1483           BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1484         }
1485       else
1486         {
1487           java_unget_unicode ();
1488           BUILD_OPERATOR (XOR_TK);
1489         }
1490
1491     case '%':
1492       if ((c = java_get_unicode ()) == '=')
1493         {
1494           BUILD_OPERATOR2 (REM_ASSIGN_TK);
1495         }
1496       else
1497         {
1498           java_unget_unicode ();
1499           BUILD_OPERATOR (REM_TK);
1500         }
1501
1502     case '!':
1503       if ((c = java_get_unicode()) == '=')
1504         {
1505           BUILD_OPERATOR (NEQ_TK);
1506         }
1507       else
1508         {
1509           java_unget_unicode ();
1510           BUILD_OPERATOR (NEG_TK);
1511         }
1512
1513     case '?':
1514       JAVA_LEX_OP ("?");
1515       BUILD_OPERATOR (REL_QM_TK);
1516     case ':':
1517       JAVA_LEX_OP (":");
1518       BUILD_OPERATOR (REL_CL_TK);
1519     case '~':
1520       BUILD_OPERATOR (NOT_TK);
1521     }
1522
1523   /* Keyword, boolean literal or null literal.  */
1524   for (first_unicode = c, all_ascii = 1, ascii_index = 0;
1525        c != UEOF && JAVA_PART_CHAR_P (c); c = java_get_unicode ())
1526     {
1527       java_unicode_2_utf8 (c);
1528       if (all_ascii && c >= 128)
1529         all_ascii = 0;
1530       ascii_index++;
1531     }
1532
1533   obstack_1grow (&temporary_obstack, '\0');
1534   string = obstack_finish (&temporary_obstack);
1535   if (c != UEOF)
1536     java_unget_unicode ();
1537
1538   /* If we have something all ascii, we consider a keyword, a boolean
1539      literal, a null literal or an all ASCII identifier.  Otherwise,
1540      this is an identifier (possibly not respecting formation rule).  */
1541   if (all_ascii)
1542     {
1543       const struct java_keyword *kw;
1544       if ((kw=java_keyword (string, ascii_index)))
1545         {
1546           JAVA_LEX_KW (string);
1547           switch (kw->token)
1548             {
1549             case PUBLIC_TK:       case PROTECTED_TK: case STATIC_TK:
1550             case ABSTRACT_TK:     case FINAL_TK:     case NATIVE_TK:
1551             case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1552             case PRIVATE_TK:      case STRICT_TK:
1553               SET_MODIFIER_CTX (kw->token);
1554               return MODIFIER_TK;
1555             case FLOAT_TK:
1556               SET_LVAL_NODE (float_type_node);
1557               return FP_TK;
1558             case DOUBLE_TK:
1559               SET_LVAL_NODE (double_type_node);
1560               return FP_TK;
1561             case BOOLEAN_TK:
1562               SET_LVAL_NODE (boolean_type_node);
1563               return BOOLEAN_TK;
1564             case BYTE_TK:
1565               SET_LVAL_NODE (byte_type_node);
1566               return INTEGRAL_TK;
1567             case SHORT_TK:
1568               SET_LVAL_NODE (short_type_node);
1569               return INTEGRAL_TK;
1570             case INT_TK:
1571               SET_LVAL_NODE (int_type_node);
1572               return INTEGRAL_TK;
1573             case LONG_TK:
1574               SET_LVAL_NODE (long_type_node);
1575               return INTEGRAL_TK;
1576             case CHAR_TK:
1577               SET_LVAL_NODE (char_type_node);
1578               return INTEGRAL_TK;
1579
1580               /* Keyword based literals.  */
1581             case TRUE_TK:
1582             case FALSE_TK:
1583               SET_LVAL_NODE ((kw->token == TRUE_TK ?
1584                               boolean_true_node : boolean_false_node));
1585               return BOOL_LIT_TK;
1586             case NULL_TK:
1587               SET_LVAL_NODE (null_pointer_node);
1588               return NULL_TK;
1589
1590             case ASSERT_TK:
1591               if (flag_assert)
1592                 {
1593                   BUILD_OPERATOR (kw->token);
1594                   return kw->token;
1595                 }
1596               else
1597                 break;
1598
1599               /* Some keyword we want to retain information on the location
1600                  they where found.  */
1601             case CASE_TK:
1602             case DEFAULT_TK:
1603             case SUPER_TK:
1604             case THIS_TK:
1605             case RETURN_TK:
1606             case BREAK_TK:
1607             case CONTINUE_TK:
1608             case TRY_TK:
1609             case CATCH_TK:
1610             case THROW_TK:
1611             case INSTANCEOF_TK:
1612               BUILD_OPERATOR (kw->token);
1613
1614             default:
1615               return kw->token;
1616             }
1617         }
1618     }
1619
1620   /* We may have an ID here.  */
1621   if (JAVA_START_CHAR_P (first_unicode))
1622     {
1623       JAVA_LEX_ID (string);
1624       java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1625       return ID_TK;
1626     }
1627
1628   /* Everything else is an invalid character in the input.  */
1629   {
1630     char lex_error_buffer [128];
1631     sprintf (lex_error_buffer, "Invalid character `%s' in input",
1632              java_sprint_unicode (ctxp->c_line, ctxp->c_line->current));
1633     java_lex_error (lex_error_buffer, 1);
1634   }
1635   return 0;
1636 }
1637
1638 #ifndef JC1_LITE
1639 /* This is called by the parser to see if an error should be generated
1640    due to numeric overflow.  This function only handles the particular
1641    case of the largest negative value, and is only called in the case
1642    where this value is not preceded by `-'.  */
1643 static void
1644 error_if_numeric_overflow (tree value)
1645 {
1646   if (TREE_CODE (value) == INTEGER_CST
1647       && JAVA_RADIX10_FLAG (value)
1648       && tree_int_cst_sgn (value) < 0)
1649     {
1650       if (TREE_TYPE (value) == long_type_node)
1651         java_lex_error ("Numeric overflow for `long' literal", 0);
1652       else
1653         java_lex_error ("Numeric overflow for `int' literal", 0);
1654     }
1655 }
1656 #endif /* JC1_LITE */
1657
1658 static void
1659 java_unicode_2_utf8 (unicode_t unicode)
1660 {
1661   if (RANGE (unicode, 0x01, 0x7f))
1662     obstack_1grow (&temporary_obstack, (char)unicode);
1663   else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0)
1664     {
1665       obstack_1grow (&temporary_obstack,
1666                      (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
1667       obstack_1grow (&temporary_obstack,
1668                      (unsigned char)(0x80 | (unicode & 0x3f)));
1669     }
1670   else                          /* Range 0x800-0xffff.  */
1671     {
1672       obstack_1grow (&temporary_obstack,
1673                      (unsigned char)(0xe0 | (unicode & 0xf000) >> 12));
1674       obstack_1grow (&temporary_obstack,
1675                      (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6));
1676       obstack_1grow (&temporary_obstack,
1677                      (unsigned char)(0x80 | (unicode & 0x003f)));
1678     }
1679 }
1680
1681 #ifndef JC1_LITE
1682 static tree
1683 build_wfl_node (tree node)
1684 {
1685   node = build_expr_wfl (node, ctxp->filename, ctxp->elc.line, ctxp->elc.col);
1686   /* Prevent java_complete_lhs from short-circuiting node (if constant).  */
1687   TREE_TYPE (node) = NULL_TREE;
1688   return node;
1689 }
1690 #endif
1691
1692 static void
1693 java_lex_error (const char *msg ATTRIBUTE_UNUSED, int forward ATTRIBUTE_UNUSED)
1694 {
1695 #ifndef JC1_LITE
1696   ctxp->elc.line = ctxp->c_line->lineno;
1697   ctxp->elc.col = ctxp->c_line->char_col-1+forward;
1698
1699   /* Might be caught in the middle of some error report.  */
1700   ctxp->java_error_flag = 0;
1701   java_error (NULL);
1702   java_error (msg);
1703 #endif
1704 }
1705
1706 #ifndef JC1_LITE
/* Return 1 if C, a character just read from FP, ends a line, and 0
   otherwise.  Both '\n' and '\r' end a line.  After a '\r' the next
   character is read as well: a '\n' is consumed so that "\r\n" counts
   once, and any other character except EOF is pushed back.  */
static int
java_is_eol (FILE *fp, int c)
{
  int lookahead;

  if (c == '\n')
    return 1;
  if (c != '\r')
    return 0;

  /* Carriage return: swallow the line feed of a CR/LF pair.  */
  lookahead = getc (fp);
  if (!(lookahead == '\n' || lookahead == EOF))
    ungetc (lookahead, fp);
  return 1;
}
1724 #endif
1725
1726 char *
1727 java_get_line_col (const char *filename ATTRIBUTE_UNUSED,
1728                    int line ATTRIBUTE_UNUSED, int col ATTRIBUTE_UNUSED)
1729 {
1730 #ifdef JC1_LITE
1731   return 0;
1732 #else
1733   /* Dumb implementation. Doesn't try to cache or optimize things.  */
1734   /* First line of the file is line 1, first column is 1.  */
1735
1736   /* COL == -1 means, at the CR/LF in LINE.  */
1737   /* COL == -2 means, at the first non space char in LINE.  */
1738
1739   FILE *fp;
1740   int c, ccol, cline = 1;
1741   int current_line_col = 0;
1742   int first_non_space = 0;
1743   char *base;
1744
1745   if (!(fp = fopen (filename, "r")))
1746     fatal_io_error ("can't open %s", filename);
1747
1748   while (cline != line)
1749     {
1750       c = getc (fp);
1751       if (c == EOF)
1752         {
1753           static const char msg[] = "<<file too short - unexpected EOF>>";
1754           obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
1755           goto have_line;
1756         }
1757       if (java_is_eol (fp, c))
1758         cline++;
1759     }
1760
1761   /* Gather the chars of the current line in a buffer.  */
1762   for (;;)
1763     {
1764       c = getc (fp);
1765       if (c < 0 || java_is_eol (fp, c))
1766         break;
1767       if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
1768         first_non_space = current_line_col;
1769       obstack_1grow (&temporary_obstack, c);
1770       current_line_col++;
1771     }
1772  have_line:
1773
1774   obstack_1grow (&temporary_obstack, '\n');
1775
1776   if (col == -1)
1777     {
1778       col = current_line_col;
1779       first_non_space = 0;
1780     }
1781   else if (col == -2)
1782     col = first_non_space;
1783   else
1784     first_non_space = 0;
1785
1786   /* Place the '^' a the right position.  */
1787   base = obstack_base (&temporary_obstack);
1788   for (ccol = 1; ccol <= col+3; ccol++)
1789     {
1790       /* Compute \t when reaching first_non_space.  */
1791       char c = (first_non_space ?
1792                 (base [ccol-1] == '\t' ? '\t' : ' ') : ' ');
1793       obstack_1grow (&temporary_obstack, c);
1794     }
1795   obstack_grow0 (&temporary_obstack, "^", 1);
1796
1797   fclose (fp);
1798   return obstack_finish (&temporary_obstack);
1799 #endif
1800 }
1801
1802 #ifndef JC1_LITE
/* Compare the LENGTH bytes of UTF-8 text at STR with the
   NUL-terminated string NAME.  Characters are decoded from STR one at
   a time with UTF8_GET and matched against successive chars of NAME.
   On the first difference the decoded character minus the char of
   NAME is returned.  If all of NAME matched, the result is 0 when STR
   was consumed exactly and 1 when some of it is left over.  */
static int
utf8_cmp (const unsigned char *str, int length, const char *name)
{
  const unsigned char *limit = str + length;
  const char *expected;

  for (expected = name; *expected != '\0'; expected++)
    {
      int ch = UTF8_GET (str, limit);
      int delta = ch - *expected;

      if (delta != 0)
        return delta;
    }

  return (str != limit) ? 1 : 0;
}
1818
/* Every C++ keyword, including the GNU double-underscore spellings,
   in ascending byte order.  cxx_keyword_p runs a binary search over
   this table, so the order must be preserved when adding entries.  */

static const char *const cxx_keywords[] =
{
  /* Underscore-prefixed spellings.  */
  "_Complex",
  "__alignof", "__alignof__", "__asm", "__asm__",
  "__attribute", "__attribute__",
  "__builtin_va_arg",
  "__complex", "__complex__", "__const", "__const__",
  "__extension__",
  "__imag", "__imag__", "__inline", "__inline__",
  "__label__",
  "__null",
  "__real", "__real__", "__restrict", "__restrict__",
  "__signed", "__signed__",
  "__typeof", "__typeof__",
  "__volatile", "__volatile__",
  /* a - c.  */
  "and", "and_eq", "asm", "auto",
  "bitand", "bitor", "bool", "break",
  "case", "catch", "char", "class", "compl", "const", "const_cast",
  "continue",
  /* d - f.  */
  "default", "delete", "do", "double", "dynamic_cast",
  "else", "enum", "explicit", "export", "extern",
  "false", "float", "for", "friend",
  /* g - o.  */
  "goto",
  "if", "inline", "int",
  "long",
  "mutable",
  "namespace", "new", "not", "not_eq",
  "operator", "or", "or_eq",
  /* p - s.  */
  "private", "protected", "public",
  "register", "reinterpret_cast", "return",
  "short", "signed", "sizeof", "static", "static_cast", "struct", "switch",
  /* t - x.  */
  "template", "this", "throw", "true", "try", "typedef", "typeid",
  "typename", "typeof",
  "union", "unsigned", "using",
  "virtual", "void", "volatile",
  "wchar_t", "while",
  "xor", "xor_eq"
};
1928
1929 /* Return true if NAME is a C++ keyword.  */
1930
1931 int
1932 cxx_keyword_p (const char *name, int length)
1933 {
1934   int last = ARRAY_SIZE (cxx_keywords);
1935   int first = 0;
1936   int mid = (last + first) / 2;
1937   int old = -1;
1938
1939   for (mid = (last + first) / 2;
1940        mid != old;
1941        old = mid, mid = (last + first) / 2)
1942     {
1943       int kwl = strlen (cxx_keywords[mid]);
1944       int min_length = kwl > length ? length : kwl;
1945       int r = utf8_cmp (name, min_length, cxx_keywords[mid]);
1946
1947       if (r == 0)
1948         {
1949           int i;
1950           /* We've found a match if all the remaining characters are `$'.  */
1951           for (i = min_length; i < length && name[i] == '$'; ++i)
1952             ;
1953           if (i == length)
1954             return 1;
1955           r = 1;
1956         }
1957
1958       if (r < 0)
1959         last = mid;
1960       else
1961         first = mid;
1962     }
1963   return 0;
1964 }
1965 #endif /* JC1_LITE */