gcc/java/lex.c

   1 /* Language lexer for the GNU compiler for the Java(TM) language.
   2    Copyright (C) 1997, 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
   3    Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
   4
   5 This file is part of GNU CC.
   6
   7 GNU CC is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GNU CC is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GNU CC; see the file COPYING.  If not, write to
  19 the Free Software Foundation, 59 Temple Place - Suite 330,
  20 Boston, MA 02111-1307, USA.
  21
  22 Java and all Java-based marks are trademarks or registered trademarks
  23 of Sun Microsystems, Inc. in the United States and other countries.
  24 The Free Software Foundation is independent of Sun Microsystems, Inc.  */
  25
/* This file defines java_lex (yylex), which reads a Java ASCII source
   file, possibly containing Unicode escape sequences or UTF-8 encoded
   characters, and returns a token for everything found except comments,
   white space and line terminators.  When necessary, it also fills
   the java_lval (yylval) union.  It is implemented to be called by a
   re-entrant parser generated by Bison.

   The lexical analysis conforms to the Java grammar described in "The
   Java(TM) Language Specification", J. Gosling, B. Joy, G. Steele,
   Addison Wesley 1996 (http://java.sun.com/docs/books/jls/html/3.doc.html).  */
  36
  37 #include "keyword.h"
  38 #include "flags.h"
  39 #include "chartables.h"
  40
  41 /* Function declaration  */
  42 static char *java_sprint_unicode PARAMS ((struct java_line *, int));
  43 static void java_unicode_2_utf8 PARAMS ((unicode_t));
  44 static void java_lex_error PARAMS ((const char *, int));
  45 #ifndef JC1_LITE
  46 static int java_is_eol PARAMS ((FILE *, int));
  47 static tree build_wfl_node PARAMS ((tree));
  48 #endif
  49 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
  50 static int java_parse_escape_sequence PARAMS ((void));
  51 static int java_start_char_p PARAMS ((unicode_t));
  52 static int java_part_char_p PARAMS ((unicode_t));
  53 static int java_parse_doc_section PARAMS ((int));
  54 static void java_parse_end_comment PARAMS ((int));
  55 static int java_get_unicode PARAMS ((void));
  56 static int java_read_unicode PARAMS ((java_lexer *, int *));
  57 static int java_read_unicode_collapsing_terminators PARAMS ((java_lexer *,
  58                                                              int *));
  59 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
  60 static int java_read_char PARAMS ((java_lexer *));
  61 static void java_allocate_new_line PARAMS ((void));
  62 static void java_unget_unicode PARAMS ((void));
  63 static unicode_t java_sneak_unicode PARAMS ((void));
  64 #ifndef JC1_LITE
  65 static int utf8_cmp PARAMS ((const unsigned char *, int, const char *));
  66 #endif
  67
  68 java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
  69 #ifndef JC1_LITE
  70 static void error_if_numeric_overflow PARAMS ((tree));
  71 #endif
  72
  73 #ifdef HAVE_ICONV
  74 /* This is nonzero if we have initialized `need_byteswap'.  */
  75 static int byteswap_init = 0;
  76
  77 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
  78    big-endian order -- not native endian order.  We handle this by
  79    doing a conversion once at startup and seeing what happens.  This
  80    flag holds the results of this determination.  */
  81 static int need_byteswap = 0;
  82 #endif
  83
  84 void
  85 java_init_lex (finput, encoding)
  86      FILE *finput;
  87      const char *encoding;
  88 {
  89 #ifndef JC1_LITE
  90   int java_lang_imported = 0;
  91
  92   if (!java_lang_id)
  93     java_lang_id = get_identifier ("java.lang");
  94   if (!java_lang_cloneable)
  95     java_lang_cloneable = get_identifier ("java.lang.Cloneable");
  96   if (!java_io_serializable)
  97     java_io_serializable = get_identifier ("java.io.Serializable");
  98   if (!inst_id)
  99     inst_id = get_identifier ("inst$");
 100   if (!wpv_id)
 101     wpv_id = get_identifier ("write_parm_value$");
 102
 103   if (!java_lang_imported)
 104     {
 105       tree node = build_tree_list
 106         (build_expr_wfl (java_lang_id, NULL, 0, 0), NULL_TREE);
 107       read_import_dir (TREE_PURPOSE (node));
 108       TREE_CHAIN (node) = ctxp->import_demand_list;
 109       ctxp->import_demand_list = node;
 110       java_lang_imported = 1;
 111     }
 112
 113   if (!wfl_operator)
 114     wfl_operator = build_expr_wfl (NULL_TREE, ctxp->filename, 0, 0);
 115   if (!label_id)
 116     label_id = get_identifier ("$L");
 117   if (!wfl_append)
 118     wfl_append = build_expr_wfl (get_identifier ("append"), NULL, 0, 0);
 119   if (!wfl_string_buffer)
 120     wfl_string_buffer =
 121       build_expr_wfl (get_identifier (flag_emit_class_files
 122                                       ? "java.lang.StringBuffer"
 123                                       : "gnu.gcj.runtime.StringBuffer"),
 124                       NULL, 0, 0);
 125   if (!wfl_to_string)
 126     wfl_to_string = build_expr_wfl (get_identifier ("toString"), NULL, 0, 0);
 127
 128   CPC_INITIALIZER_LIST (ctxp) = CPC_STATIC_INITIALIZER_LIST (ctxp) =
 129     CPC_INSTANCE_INITIALIZER_LIST (ctxp) = NULL_TREE;
 130
 131   memset ((PTR) ctxp->modifier_ctx, 0, 11*sizeof (ctxp->modifier_ctx[0]));
 132   memset ((PTR) current_jcf, 0, sizeof (JCF));
 133   ctxp->current_parsed_class = NULL;
 134   ctxp->package = NULL_TREE;
 135 #endif
 136
 137   ctxp->filename = input_filename;
 138   ctxp->lineno = lineno = 0;
 139   ctxp->p_line = NULL;
 140   ctxp->c_line = NULL;
 141   ctxp->java_error_flag = 0;
 142   ctxp->lexer = java_new_lexer (finput, encoding);
 143 }
 144
 145 static char *
 146 java_sprint_unicode (line, i)
 147     struct java_line *line;
 148     int i;
 149 {
 150   static char buffer [10];
 151   if (line->unicode_escape_p [i] || line->line [i] > 128)
 152     sprintf (buffer, "\\u%04x", line->line [i]);
 153   else
 154     {
 155       buffer [0] = line->line [i];
 156       buffer [1] = '\0';
 157     }
 158   return buffer;
 159 }
 160
 161 static unicode_t
 162 java_sneak_unicode ()
 163 {
 164   return (ctxp->c_line->line [ctxp->c_line->current]);
 165 }
 166
 167 static void
 168 java_unget_unicode ()
 169 {
 170   if (!ctxp->c_line->current)
 171     /* Can't unget unicode.  */
 172     abort ();
 173
 174   ctxp->c_line->current--;
 175   ctxp->c_line->char_col -= JAVA_COLUMN_DELTA (0);
 176 }
 177
 178 static void
 179 java_allocate_new_line ()
 180 {
 181   unicode_t ahead = (ctxp->c_line ? ctxp->c_line->ahead[0] : '\0');
 182   char ahead_escape_p = (ctxp->c_line ?
 183                          ctxp->c_line->unicode_escape_ahead_p : 0);
 184
 185   if (ctxp->c_line && !ctxp->c_line->white_space_only)
 186     {
 187       if (ctxp->p_line)
 188         {
 189           free (ctxp->p_line->unicode_escape_p);
 190           free (ctxp->p_line->line);
 191           free (ctxp->p_line);
 192         }
 193       ctxp->p_line = ctxp->c_line;
 194       ctxp->c_line = NULL;              /* Reallocated */
 195     }
 196
 197   if (!ctxp->c_line)
 198     {
 199       ctxp->c_line = (struct java_line *)xmalloc (sizeof (struct java_line));
 200       ctxp->c_line->max = JAVA_LINE_MAX;
 201       ctxp->c_line->line = (unicode_t *)xmalloc
 202         (sizeof (unicode_t)*ctxp->c_line->max);
 203       ctxp->c_line->unicode_escape_p =
 204           (char *)xmalloc (sizeof (char)*ctxp->c_line->max);
 205       ctxp->c_line->white_space_only = 0;
 206     }
 207
 208   ctxp->c_line->line [0] = ctxp->c_line->size = 0;
 209   ctxp->c_line->char_col = ctxp->c_line->current = 0;
 210   if (ahead)
 211     {
 212       ctxp->c_line->line [ctxp->c_line->size] = ahead;
 213       ctxp->c_line->unicode_escape_p [ctxp->c_line->size] = ahead_escape_p;
 214       ctxp->c_line->size++;
 215     }
 216   ctxp->c_line->ahead [0] = 0;
 217   ctxp->c_line->unicode_escape_ahead_p = 0;
 218   ctxp->c_line->lineno = ++lineno;
 219   ctxp->c_line->white_space_only = 1;
 220 }
 221
 222 /* Create a new lexer object.  */
 223
 224 java_lexer *
 225 java_new_lexer (finput, encoding)
 226      FILE *finput;
 227      const char *encoding;
 228 {
 229   java_lexer *lex = (java_lexer *) xmalloc (sizeof (java_lexer));
 230   int enc_error = 0;
 231
 232   lex->finput = finput;
 233   lex->bs_count = 0;
 234   lex->unget_value = 0;
 235   lex->hit_eof = 0;
 236
 237 #ifdef HAVE_ICONV
 238   lex->handle = iconv_open ("UCS-2", encoding);
 239   if (lex->handle != (iconv_t) -1)
 240     {
 241       lex->first = -1;
 242       lex->last = -1;
 243       lex->out_first = -1;
 244       lex->out_last = -1;
 245       lex->read_anything = 0;
 246       lex->use_fallback = 0;
 247
 248       /* Work around broken iconv() implementations by doing checking at
 249          runtime.  We assume that if the UTF-8 => UCS-2 encoder is broken,
 250          then all UCS-2 encoders will be broken.  Perhaps not a valid
 251          assumption.  */
 252       if (! byteswap_init)
 253         {
 254           iconv_t handle;
 255
 256           byteswap_init = 1;
 257
 258           handle = iconv_open ("UCS-2", "UTF-8");
 259           if (handle != (iconv_t) -1)
 260             {
 261               unicode_t result;
 262               unsigned char in[3];
 263               char *inp, *outp;
 264               size_t inc, outc, r;
 265
 266               /* This is the UTF-8 encoding of \ufeff.  */
 267               in[0] = 0xef;
 268               in[1] = 0xbb;
 269               in[2] = 0xbf;
 270
 271               inp = in;
 272               inc = 3;
 273               outp = (char *) &result;
 274               outc = 2;
 275
 276               r = iconv (handle, (ICONV_CONST char **) &inp, &inc,
 277                          &outp, &outc);
 278               iconv_close (handle);
 279               /* Conversion must be complete for us to use the result.  */
 280               if (r != (size_t) -1 && inc == 0 && outc == 0)
 281                 need_byteswap = (result != 0xfeff);
 282             }
 283         }
 284
 285       lex->byte_swap = need_byteswap;
 286     }
 287   else
 288 #endif /* HAVE_ICONV */
 289     {
 290       /* If iconv failed, use the internal decoder if the default
 291          encoding was requested.  This code is used on platforms where
 292          iconv exists but is insufficient for our needs.  For
 293          instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2.  */
 294       if (strcmp (encoding, DEFAULT_ENCODING))
 295         enc_error = 1;
 296 #ifdef HAVE_ICONV
 297       else
 298         lex->use_fallback = 1;
 299 #endif /* HAVE_ICONV */
 300     }
 301
 302   if (enc_error)
 303     fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation.  If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option", encoding);
 304
 305   return lex;
 306 }
 307
 308 void
 309 java_destroy_lexer (lex)
 310      java_lexer *lex;
 311 {
 312 #ifdef HAVE_ICONV
 313   if (! lex->use_fallback)
 314     iconv_close (lex->handle);
 315 #endif
 316   free (lex);
 317 }
 318
 319 static int
 320 java_read_char (lex)
 321      java_lexer *lex;
 322 {
 323   if (lex->unget_value)
 324     {
 325       unicode_t r = lex->unget_value;
 326       lex->unget_value = 0;
 327       return r;
 328     }
 329
 330 #ifdef HAVE_ICONV
 331   if (! lex->use_fallback)
 332     {
 333       size_t ir, inbytesleft, in_save, out_count, out_save;
 334       char *inp, *outp;
 335       unicode_t result;
 336
 337       /* If there is data which has already been converted, use it.  */
 338       if (lex->out_first == -1 || lex->out_first >= lex->out_last)
 339         {
 340           lex->out_first = 0;
 341           lex->out_last = 0;
 342
 343           while (1)
 344             {
 345               /* See if we need to read more data.  If FIRST == 0 then
 346                  the previous conversion attempt ended in the middle of
 347                  a character at the end of the buffer.  Otherwise we
 348                  only have to read if the buffer is empty.  */
 349               if (lex->first == 0 || lex->first >= lex->last)
 350                 {
 351                   int r;
 352
 353                   if (lex->first >= lex->last)
 354                     {
 355                       lex->first = 0;
 356                       lex->last = 0;
 357                     }
 358                   if (feof (lex->finput))
 359                     return UEOF;
 360                   r = fread (&lex->buffer[lex->last], 1,
 361                              sizeof (lex->buffer) - lex->last,
 362                              lex->finput);
 363                   lex->last += r;
 364                 }
 365
 366               inbytesleft = lex->last - lex->first;
 367               out_count = sizeof (lex->out_buffer) - lex->out_last;
 368
 369               if (inbytesleft == 0)
 370                 {
 371                   /* We've tried to read and there is nothing left.  */
 372                   return UEOF;
 373                 }
 374
 375               in_save = inbytesleft;
 376               out_save = out_count;
 377               inp = &lex->buffer[lex->first];
 378               outp = &lex->out_buffer[lex->out_last];
 379               ir = iconv (lex->handle, (ICONV_CONST char **) &inp,
 380                           &inbytesleft, &outp, &out_count);
 381
 382               /* If we haven't read any bytes, then look to see if we
 383                  have read a BOM.  */
 384               if (! lex->read_anything && out_save - out_count >= 2)
 385                 {
 386                   unicode_t uc = * (unicode_t *) &lex->out_buffer[0];
 387                   if (uc == 0xfeff)
 388                     {
 389                       lex->byte_swap = 0;
 390                       lex->out_first += 2;
 391                     }
 392                   else if (uc == 0xfffe)
 393                     {
 394                       lex->byte_swap = 1;
 395                       lex->out_first += 2;
 396                     }
 397                   lex->read_anything = 1;
 398                 }
 399
 400               if (lex->byte_swap)
 401                 {
 402                   unsigned int i;
 403                   for (i = 0; i < out_save - out_count; i += 2)
 404                     {
 405                       char t = lex->out_buffer[lex->out_last + i];
 406                       lex->out_buffer[lex->out_last + i]
 407                         = lex->out_buffer[lex->out_last + i + 1];
 408                       lex->out_buffer[lex->out_last + i + 1] = t;
 409                     }
 410                 }
 411
 412               lex->first += in_save - inbytesleft;
 413               lex->out_last += out_save - out_count;
 414
 415               /* If we converted anything at all, move along.  */
 416               if (out_count != out_save)
 417                 break;
 418
 419               if (ir == (size_t) -1)
 420                 {
 421                   if (errno == EINVAL)
 422                     {
 423                       /* This is ok.  This means that the end of our buffer
 424                          is in the middle of a character sequence.  We just
 425                          move the valid part of the buffer to the beginning
 426                          to force a read.  */
 427                       memmove (&lex->buffer[0], &lex->buffer[lex->first],
 428                                lex->last - lex->first);
 429                       lex->last -= lex->first;
 430                       lex->first = 0;
 431                     }
 432                   else
 433                     {
 434                       /* A more serious error.  */
 435                       java_lex_error ("unrecognized character in input stream",
 436                                       0);
 437                       return UEOF;
 438                     }
 439                 }
 440             }
 441         }
 442
 443       if (lex->out_first == -1 || lex->out_first >= lex->out_last)
 444         {
 445           /* Don't have any data.  */
 446           return UEOF;
 447         }
 448
 449       /* Success.  */
 450       result = * ((unicode_t *) &lex->out_buffer[lex->out_first]);
 451       lex->out_first += 2;
 452       return result;
 453     }
 454   else
 455 #endif /* HAVE_ICONV */
 456     {
 457       int c, c1, c2;
 458       c = getc (lex->finput);
 459
 460       if (c == EOF)
 461         return UEOF;
 462       if (c < 128)
 463         return (unicode_t) c;
 464       else
 465         {
 466           if ((c & 0xe0) == 0xc0)
 467             {
 468               c1 = getc (lex->finput);
 469               if ((c1 & 0xc0) == 0x80)
 470                 {
 471                   unicode_t r = (unicode_t)(((c & 0x1f) << 6) + (c1 & 0x3f));
 472                   /* Check for valid 2-byte characters.  We explicitly
 473                      allow \0 because this encoding is common in the
 474                      Java world.  */
 475                   if (r == 0 || (r >= 0x80 && r <= 0x7ff))
 476                     return r;
 477                 }
 478             }
 479           else if ((c & 0xf0) == 0xe0)
 480             {
 481               c1 = getc (lex->finput);
 482               if ((c1 & 0xc0) == 0x80)
 483                 {
 484                   c2 = getc (lex->finput);
 485                   if ((c2 & 0xc0) == 0x80)
 486                     {
 487                       unicode_t r =  (unicode_t)(((c & 0xf) << 12) +
 488                                                  (( c1 & 0x3f) << 6)
 489                                                  + (c2 & 0x3f));
 490                       /* Check for valid 3-byte characters.
 491                          Don't allow surrogate, \ufffe or \uffff.  */
 492                       if (r >= 0x800 && r <= 0xffff
 493                           && ! (r >= 0xd800 && r <= 0xdfff)
 494                           && r != 0xfffe && r != 0xffff)
 495                         return r;
 496                     }
 497                 }
 498             }
 499
 500           /* We simply don't support invalid characters.  We also
 501              don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
 502              cannot be valid Java characters.  */
 503           java_lex_error ("malformed UTF-8 character", 0);
 504         }
 505     }
 506
 507   /* We only get here on error.  */
 508   return UEOF;
 509 }
 510
 511 static void
 512 java_store_unicode (l, c, unicode_escape_p)
 513     struct java_line *l;
 514     unicode_t c;
 515     int unicode_escape_p;
 516 {
 517   if (l->size == l->max)
 518     {
 519       l->max += JAVA_LINE_MAX;
 520       l->line = (unicode_t *) xrealloc (l->line, sizeof (unicode_t)*l->max);
 521       l->unicode_escape_p = (char *) xrealloc (l->unicode_escape_p,
 522                                                sizeof (char)*l->max);
 523     }
 524   l->line [l->size] = c;
 525   l->unicode_escape_p [l->size++] = unicode_escape_p;
 526 }
 527
 528 static int
 529 java_read_unicode (lex, unicode_escape_p)
 530      java_lexer *lex;
 531      int *unicode_escape_p;
 532 {
 533   int c;
 534
 535   c = java_read_char (lex);
 536   *unicode_escape_p = 0;
 537
 538   if (c != '\\')
 539     {
 540       lex->bs_count = 0;
 541       return c;
 542     }
 543
 544   ++lex->bs_count;
 545   if ((lex->bs_count) % 2 == 1)
 546     {
 547       /* Odd number of \ seen.  */
 548       c = java_read_char (lex);
 549       if (c == 'u')
 550         {
 551           unicode_t unicode = 0;
 552           int shift = 12;
 553
 554           /* Recognize any number of `u's in \u.  */
 555           while ((c = java_read_char (lex)) == 'u')
 556             ;
 557
 558           /* Unget the most recent character as it is not a `u'.  */
 559           if (c == UEOF)
 560             return UEOF;
 561           lex->unget_value = c;
 562
 563           /* Next should be 4 hex digits, otherwise it's an error.
 564              The hex value is converted into the unicode, pushed into
 565              the Unicode stream.  */
 566           for (shift = 12; shift >= 0; shift -= 4)
 567             {
 568               if ((c = java_read_char (lex)) == UEOF)
 569                 return UEOF;
 570               if (hex_p (c))
 571                 unicode |= (unicode_t)(hex_value (c) << shift);
 572               else
 573                 java_lex_error ("Non hex digit in Unicode escape sequence", 0);
 574             }
 575           lex->bs_count = 0;
 576           *unicode_escape_p = 1;
 577           return unicode;
 578         }
 579       lex->unget_value = c;
 580     }
 581   return (unicode_t) '\\';
 582 }
 583
 584 static int
 585 java_read_unicode_collapsing_terminators (lex, unicode_escape_p)
 586      java_lexer *lex;
 587      int *unicode_escape_p;
 588 {
 589   int c = java_read_unicode (lex, unicode_escape_p);
 590
 591   if (c == '\r')
 592     {
 593       /* We have to read ahead to see if we got \r\n.  In that case we
 594          return a single line terminator.  */
 595       int dummy;
 596       c = java_read_unicode (lex, &dummy);
 597       if (c != '\n')
 598         lex->unget_value = c;
 599       /* In either case we must return a newline.  */
 600       c = '\n';
 601     }
 602
 603   return c;
 604 }
 605
 606 static int
 607 java_get_unicode ()
 608 {
 609   /* It's time to read a line when... */
 610   if (!ctxp->c_line || ctxp->c_line->current == ctxp->c_line->size)
 611     {
 612       int c;
 613       int found_chars = 0;
 614
 615       if (ctxp->lexer->hit_eof)
 616         return UEOF;
 617
 618       java_allocate_new_line ();
 619       if (ctxp->c_line->line[0] != '\n')
 620         {
 621           for (;;)
 622             {
 623               int unicode_escape_p;
 624               c = java_read_unicode_collapsing_terminators (ctxp->lexer,
 625                                                             &unicode_escape_p);
 626               if (c != UEOF)
 627                 {
 628                   found_chars = 1;
 629                   java_store_unicode (ctxp->c_line, c, unicode_escape_p);
 630                   if (ctxp->c_line->white_space_only
 631                       && !JAVA_WHITE_SPACE_P (c)
 632                       && c != '\n')
 633                     ctxp->c_line->white_space_only = 0;
 634                 }
 635               if ((c == '\n') || (c == UEOF))
 636                 break;
 637             }
 638
 639           if (c == UEOF && ! found_chars)
 640             {
 641               ctxp->lexer->hit_eof = 1;
 642               return UEOF;
 643             }
 644         }
 645     }
 646   ctxp->c_line->char_col += JAVA_COLUMN_DELTA (0);
 647   JAVA_LEX_CHAR (ctxp->c_line->line [ctxp->c_line->current]);
 648   return ctxp->c_line->line [ctxp->c_line->current++];
 649 }
 650
 651 /* Parse the end of a C style comment.
 652  * C is the first character following the '/' and '*'. */
 653 static void
 654 java_parse_end_comment (c)
 655      int c;
 656 {
 657   for ( ;; c = java_get_unicode ())
 658     {
 659       switch (c)
 660         {
 661         case UEOF:
 662           java_lex_error ("Comment not terminated at end of input", 0);
 663           return;
 664         case '*':
 665           switch (c = java_get_unicode ())
 666             {
 667             case UEOF:
 668               java_lex_error ("Comment not terminated at end of input", 0);
 669               return;
 670             case '/':
 671               return;
 672             case '*':   /* reparse only '*' */
 673               java_unget_unicode ();
 674             }
 675         }
 676     }
 677 }
 678
 679 /* Parse the documentation section. Keywords must be at the beginning
 680    of a documentation comment line (ignoring white space and any `*'
 681    character). Parsed keyword(s): @DEPRECATED.  */
 682
 683 static int
 684 java_parse_doc_section (c)
 685      int c;
 686 {
 687   int valid_tag = 0, seen_star = 0;
 688
 689   while (JAVA_WHITE_SPACE_P (c) || (c == '*') || c == '\n')
 690     {
 691       switch (c)
 692         {
 693         case '*':
 694           seen_star = 1;
 695           break;
 696         case '\n': /* ULT */
 697           valid_tag = 1;
 698         default:
 699           seen_star = 0;
 700         }
 701       c = java_get_unicode();
 702     }
 703
 704   if (c == UEOF)
 705     java_lex_error ("Comment not terminated at end of input", 0);
 706
 707   if (seen_star && (c == '/'))
 708     return 1;                   /* Goto step1 in caller */
 709
 710   /* We're parsing @deprecated */
 711   if (valid_tag && (c == '@'))
 712     {
 713       char tag [11];
 714       int  tag_index = 0;
 715
 716       while (tag_index < 10 && c != UEOF && c != ' ' && c != '\n')
 717         {
 718           c = java_get_unicode ();
 719           tag [tag_index++] = c;
 720         }
 721
 722       if (c == UEOF)
 723         java_lex_error ("Comment not terminated at end of input", 0);
 724       tag [tag_index] = '\0';
 725
 726       if (!strcmp (tag, "deprecated"))
 727         ctxp->deprecated = 1;
 728     }
 729   java_unget_unicode ();
 730   return 0;
 731 }
 732
 733 /* Return true if C is a valid start character for a Java identifier.
 734    This is only called if C >= 128 -- smaller values are handled
 735    inline.  However, this function handles all values anyway.  */
 736 static int
 737 java_start_char_p (c)
 738      unicode_t c;
 739 {
 740   unsigned int hi = c / 256;
 741   const char *const page = type_table[hi];
 742   unsigned long val = (unsigned long) page;
 743   int flags;
 744
 745   if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
 746     flags = page[c & 255];
 747   else
 748     flags = val;
 749
 750   return flags & LETTER_START;
 751 }
 752
 753 /* Return true if C is a valid part character for a Java identifier.
 754    This is only called if C >= 128 -- smaller values are handled
 755    inline.  However, this function handles all values anyway.  */
 756 static int
 757 java_part_char_p (c)
 758      unicode_t c;
 759 {
 760   unsigned int hi = c / 256;
 761   const char *const page = type_table[hi];
 762   unsigned long val = (unsigned long) page;
 763   int flags;
 764
 765   if ((val & ~ (LETTER_PART | LETTER_START)) != 0)
 766     flags = page[c & 255];
 767   else
 768     flags = val;
 769
 770   return flags & LETTER_PART;
 771 }
 772
 773 static int
 774 java_parse_escape_sequence ()
 775 {
 776   unicode_t char_lit;
 777   int c;
 778
 779   switch (c = java_get_unicode ())
 780     {
 781     case 'b':
 782       return (unicode_t)0x8;
 783     case 't':
 784       return (unicode_t)0x9;
 785     case 'n':
 786       return (unicode_t)0xa;
 787     case 'f':
 788       return (unicode_t)0xc;
 789     case 'r':
 790       return (unicode_t)0xd;
 791     case '"':
 792       return (unicode_t)0x22;
 793     case '\'':
 794       return (unicode_t)0x27;
 795     case '\\':
 796       return (unicode_t)0x5c;
 797     case '0': case '1': case '2': case '3': case '4':
 798     case '5': case '6': case '7':
 799       {
 800         int octal_escape[3];
 801         int octal_escape_index = 0;
 802         int max = 3;
 803         int i, shift;
 804
 805         for (; octal_escape_index < max && RANGE (c, '0', '7');
 806              c = java_get_unicode ())
 807           {
 808             if (octal_escape_index == 0 && c > '3')
 809               {
 810                 /* According to the grammar, `\477' has a well-defined
 811                    meaning -- it is `\47' followed by `7'.  */
 812                 --max;
 813               }
 814             octal_escape [octal_escape_index++] = c;
 815           }
 816
 817         java_unget_unicode ();
 818
 819         for (char_lit=0, i = 0, shift = 3*(octal_escape_index-1);
 820              i < octal_escape_index; i++, shift -= 3)
 821           char_lit |= (octal_escape [i] - '0') << shift;
 822
 823         return char_lit;
 824       }
 825     default:
 826       java_lex_error ("Invalid character in escape sequence", 0);
 827       return JAVA_CHAR_ERROR;
 828     }
 829 }
 830
 831 /* Isolate the code which may raise an arithmetic exception in its
 832    own function.  */
 833
 834 #ifndef JC1_LITE
 835 struct jpa_args
 836 {
 837   YYSTYPE *java_lval;
 838   char *literal_token;
 839   int fflag;
 840   int number_beginning;
 841 };
 842
 843 #define IS_ZERO(X) (ereal_cmp (X, dconst0) == 0)
 844
 845 static void java_perform_atof   PARAMS ((PTR));
 846
 847 static void
 848 java_perform_atof (av)
 849      PTR av;
 850 {
 851   struct jpa_args *a = (struct jpa_args *)av;
 852   YYSTYPE *java_lval = a->java_lval;
 853   int number_beginning = a->number_beginning;
 854   REAL_VALUE_TYPE value;
 855   tree type = (a->fflag ? FLOAT_TYPE_NODE : DOUBLE_TYPE_NODE);
 856
 857   SET_REAL_VALUE_ATOF (value,
 858                        REAL_VALUE_ATOF (a->literal_token, TYPE_MODE (type)));
 859
 860   if (REAL_VALUE_ISINF (value) || REAL_VALUE_ISNAN (value))
 861     {
 862       JAVA_FLOAT_RANGE_ERROR ((a->fflag ? "float" : "double"));
 863       value = DCONST0;
 864     }
 865   else if (IS_ZERO (value))
 866     {
 867       /* We check to see if the value is really 0 or if we've found an
 868          underflow.  We do this in the most primitive imaginable way.  */
 869       int really_zero = 1;
 870       char *p = a->literal_token;
 871       if (*p == '-')
 872         ++p;
 873       while (*p && *p != 'e' && *p != 'E')
 874         {
 875           if (*p != '0' && *p != '.')
 876             {
 877               really_zero = 0;
 878               break;
 879             }
 880           ++p;
 881         }
 882       if (! really_zero)
 883         {
 884           int i = ctxp->c_line->current;
 885           ctxp->c_line->current = number_beginning;
 886           java_lex_error ("Floating point literal underflow", 0);
 887           ctxp->c_line->current = i;
 888         }
 889     }
 890
 891   SET_LVAL_NODE_TYPE (build_real (type, value), type);
 892 }
 893 #endif
 894
 895 static int yylex                PARAMS ((YYSTYPE *));
 896
     /* Return the next token read from the current input (ctxp->c_line),
        skipping white space, line terminators and comments.  JAVA_LVAL is
        the semantic value slot; literals, identifiers and a number of
        keywords and operators fill it through the SET_LVAL_* and BUILD_*
        macros, which are defined elsewhere.  The function returns 0 at
        end of input and also after reporting an invalid input character.
        It is named yylex when JC1_LITE is defined and java_lex
        otherwise.  */
 897 static int
 898 #ifdef JC1_LITE
 899 yylex (java_lval)
 900 #else
 901 java_lex (java_lval)
 902 #endif
 903      YYSTYPE *java_lval;
 904 {
 905   int c;
 906   unicode_t first_unicode;
 907   int ascii_index, all_ascii;
 908   char *string;
 909
 910   /* Translation of the Unicode escape in the raw stream of Unicode
 911      characters. Takes care of line terminator.  */
 912  step1:
 913   /* Skip white spaces: SP, TAB and FF or ULT */
 914   for (c = java_get_unicode ();
 915        c == '\n' || JAVA_WHITE_SPACE_P (c); c = java_get_unicode ())
 916     if (c == '\n')
 917       {
 918         ctxp->elc.line = ctxp->c_line->lineno;
 919         ctxp->elc.col  = ctxp->c_line->char_col-2;
 920       }
 921
       /* The column computed above can be negative; clamp it to 0.  */
 922   ctxp->elc.col = (ctxp->elc.col < 0 ? 0 : ctxp->elc.col);
 923
 924   if (c == 0x1a)                /* CTRL-Z */
 925     {
 926       if ((c = java_get_unicode ()) == UEOF)
 927         return 0;               /* Ok here */
 928       else
 929         java_unget_unicode ();  /* Caught later, at the end of the function */
 930     }
 931   /* Handle EOF here */
 932   if (c == UEOF)        /* Should probably do something here... */
 933     return 0;
 934
 935   /* Take care of eventual comments.  */
 936   if (c == '/')
 937     {
 938       switch (c = java_get_unicode ())
 939         {
 940         case '/':
               /* `//' comment: discard everything up to the line terminator,
                  then start scanning again from step1.  */
 941           for (;;)
 942             {
 943               c = java_get_unicode ();
 944               if (c == UEOF)
 945                 {
 946                   /* It is ok to end a `//' comment with EOF, unless
 947                      we're being pedantic.  */
 948                   if (pedantic)
 949                     java_lex_error ("Comment not terminated at end of input",
 950                                     0);
 951                   return 0;
 952                 }
 953               if (c == '\n')    /* ULT */
 954                 goto step1;
 955             }
 956           break;
 957
 958         case '*':
               /* `/*' comment.  A second `*' may introduce a documentation
                  comment, which java_parse_doc_section gets a chance to
                  handle; otherwise java_parse_end_comment skips to the end
                  of the comment.  Either way scanning restarts at step1.  */
 959           if ((c = java_get_unicode ()) == '*')
 960             {
 961               if ((c = java_get_unicode ()) == '/')
 962                 goto step1;     /* Empty documentation comment  */
 963               else if (java_parse_doc_section (c))
 964                 goto step1;
 965             }
 966
 967           java_parse_end_comment ((c = java_get_unicode ()));
 968           goto step1;
 969           break;
 970         default:
               /* Not a comment: push the character back and let `/' be
                  handled as an operator further down.  */
 971           java_unget_unicode ();
 972           c = '/';
 973           break;
 974         }
 975     }
 976
       /* Record where the token starts, keeping the previous column.  */
 977   ctxp->elc.line = ctxp->c_line->lineno;
 978   ctxp->elc.prev_col = ctxp->elc.col;
 979   ctxp->elc.col = ctxp->c_line->char_col - JAVA_COLUMN_DELTA (-1);
 980   if (ctxp->elc.col < 0)
 981     abort ();
 982
 983   /* Numeric literals */
 984   if (JAVA_ASCII_DIGIT (c) || (c == '.'))
 985     {
 986       /* This section of code is borrowed from gcc/c-lex.c  */
 987 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
 988       int parts[TOTAL_PARTS];
 989       HOST_WIDE_INT high, low;
 990       /* End borrowed section  */
           /* NOTE(review): LITERAL_TOKEN has room for 256 chars, but none of
              the stores through LITERAL_INDEX in this function check that
              bound -- confirm whether the length of a numeric literal is
              limited elsewhere.  */
 991       char literal_token [256];
 992       int  literal_index = 0, radix = 10, long_suffix = 0, overflow = 0, bytes;
 993       int  found_hex_digits = 0;
 994       int  i;
 995 #ifndef JC1_LITE
 996       int  number_beginning = ctxp->c_line->current;
 997       tree value;
 998 #endif
 999
1000       /* We might have a . separator instead of a FP like .[0-9]* */
1001       if (c == '.')
1002         {
1003           unicode_t peep = java_sneak_unicode ();
1004
1005           if (!JAVA_ASCII_DIGIT (peep))
1006             {
1007               JAVA_LEX_SEP('.');
1008               BUILD_OPERATOR (DOT_TK);
1009             }
1010         }
1011
1012       for (i = 0; i < TOTAL_PARTS; i++)
1013         parts [i] = 0;
1014
           /* A leading `0' selects the radix: `0x'/`0X' is hexadecimal, `0'
              followed by a digit is octal, `0.' starts a FP literal, and
              anything else is a plain zero literal handled right here.  */
1015       if (c == '0')
1016         {
1017           c = java_get_unicode ();
1018           if (c == 'x' || c == 'X')
1019             {
1020               radix = 16;
1021               c = java_get_unicode ();
1022             }
1023           else if (JAVA_ASCII_DIGIT (c))
1024             radix = 8;
1025           else if (c == '.')
1026             {
1027               /* Push the '.' back and prepare for a FP parsing... */
1028               java_unget_unicode ();
1029               c = '0';
1030             }
1031           else
1032             {
1033               /* We have a zero literal: 0, 0{f,F}, 0{d,D} */
1034               JAVA_LEX_LIT ("0", 10);
1035               switch (c)
1036                 {
1037                 case 'L': case 'l':
1038                   SET_LVAL_NODE (long_zero_node);
1039                   return (INT_LIT_TK);
1040                 case 'f': case 'F':
1041                   SET_LVAL_NODE (float_zero_node);
1042                   return (FP_LIT_TK);
1043                 case 'd': case 'D':
1044                   SET_LVAL_NODE (double_zero_node);
1045                   return (FP_LIT_TK);
1046                 default:
1047                   java_unget_unicode ();
1048                   SET_LVAL_NODE (integer_zero_node);
1049                   return (INT_LIT_TK);
1050                 }
1051             }
1052         }
1053       /* Parse the first part of the literal, until we find something
1054          which is not a number.  */
1055       while ((radix == 10 && JAVA_ASCII_DIGIT (c)) ||
1056              (radix == 16 && JAVA_ASCII_HEXDIGIT (c)) ||
1057              (radix == 8  && JAVA_ASCII_OCTDIGIT (c)))
1058         {
1059           /* We store in a string (in case it turns out to be a FP) and in
1060              PARTS if we have to process a integer literal.  */
1061           int numeric = hex_value (c);
1062           int count;
1063
1064           /* Remember when we find a valid hexadecimal digit */
1065           if (radix == 16)
1066             found_hex_digits = 1;
1067
1068           literal_token [literal_index++] = c;
1069           /* This section of code is borrowed from gcc/c-lex.c  */
               /* Multiply the number accumulated so far by RADIX and add the
                  new digit.  PARTS holds it in pieces of HOST_BITS_PER_CHAR
                  bits each, least significant piece first; the excess of each
                  piece is carried into the next one.  */
1070           for (count = 0; count < TOTAL_PARTS; count++)
1071             {
1072               parts[count] *= radix;
1073               if (count)
1074                 {
1075                   parts[count]   += (parts[count-1] >> HOST_BITS_PER_CHAR);
1076                   parts[count-1] &= (1 << HOST_BITS_PER_CHAR) - 1;
1077                 }
1078               else
1079                 parts[0] += numeric;
1080             }
1081           if (parts [TOTAL_PARTS-1] != 0)
1082             overflow = 1;
1083           /* End borrowed section.  */
1084           c = java_get_unicode ();
1085         }
1086
1087       /* If we have something from the FP char set but not a digit, parse
1088          a FP literal.  */
1089       if (JAVA_ASCII_FPCHAR (c) && !JAVA_ASCII_DIGIT (c))
1090         {
               /* STAGE tracks how far into the literal we are: 0 is the
                  integer part, 1 is after the `.', 2 is after the exponent
                  letter, 3 is after the exponent sign, and 4 means a type
                  suffix has been seen.  */
1091           int stage = 0;
1092           int seen_digit = (literal_index ? 1 : 0);
1093           int seen_exponent = 0;
1094           int fflag = 0;        /* 1 for {f,F}, 0 for {d,D}. FP literal are
1095                                    double unless specified. */
1096
1097           /* It is ok if the radix is 8 because this just means we've
1098              seen a leading `0'.  However, radix==16 is invalid.  */
1099           if (radix == 16)
1100             java_lex_error ("Can't express non-decimal FP literal", 0);
1101           radix = 10;
1102
1103           for (;;)
1104             {
1105               if (c == '.')
1106                 {
1107                   if (stage < 1)
1108                     {
1109                       stage = 1;
1110                       literal_token [literal_index++ ] = c;
1111                       c = java_get_unicode ();
1112                     }
1113                   else
1114                     java_lex_error ("Invalid character in FP literal", 0);
1115                 }
1116
1117               if (c == 'e' || c == 'E')
1118                 {
1119                   if (stage < 2)
1120                     {
1121                       /* {E,e} must have seen at least one digit */
1122                       if (!seen_digit)
1123                         java_lex_error ("Invalid FP literal", 0);
1124                       seen_digit = 0;
1125                       seen_exponent = 1;
1126                       stage = 2;
1127                       literal_token [literal_index++] = c;
1128                       c = java_get_unicode ();
1129                     }
1130                   else
1131                     java_lex_error ("Invalid character in FP literal", 0);
1132                 }
1133               if ( c == 'f' || c == 'F' || c == 'd' || c == 'D')
1134                 {
1135                   fflag = ((c == 'd') || (c == 'D')) ? 0 : 1;
1136                   stage = 4;    /* So we fall through */
1137                 }
1138
1139               if ((c=='-' || c =='+') && stage == 2)
1140                 {
1141                   stage = 3;
1142                   literal_token [literal_index++] = c;
1143                   c = java_get_unicode ();
1144                 }
1145
                   /* Keep accumulating while C is acceptable for the current
                      stage; otherwise the literal is complete.  */
1146               if ((stage == 0 && JAVA_ASCII_FPCHAR (c)) ||
1147                   (stage == 1 && JAVA_ASCII_FPCHAR (c) && !(c == '.')) ||
1148                   (stage == 2 && (JAVA_ASCII_DIGIT (c) || JAVA_FP_PM (c))) ||
1149                   (stage == 3 && JAVA_ASCII_DIGIT (c)))
1150                 {
1151                   if (JAVA_ASCII_DIGIT (c))
1152                     seen_digit = 1;
1153                   literal_token [literal_index++ ] = c;
1154                   c = java_get_unicode ();
1155                 }
1156               else
1157                 {
1158 #ifndef JC1_LITE
1159                   struct jpa_args a;
1160 #endif
1161                   if (stage != 4) /* Don't push back fF/dD */
1162                     java_unget_unicode ();
1163
1164                   /* An exponent (if any) must have seen a digit.  */
1165                   if (seen_exponent && !seen_digit)
1166                     java_lex_error ("Invalid FP literal", 0);
1167
1168                   literal_token [literal_index] = '\0';
1169                   JAVA_LEX_LIT (literal_token, radix);
1170
1171 #ifndef JC1_LITE
                       /* The text is converted by java_perform_atof, run
                          through do_float_handler; a zero result from the
                          handler leads to the range error below.  */
1172                   a.literal_token = literal_token;
1173                   a.fflag = fflag;
1174                   a.java_lval = java_lval;
1175                   a.number_beginning = number_beginning;
1176                   if (do_float_handler (java_perform_atof, (PTR) &a))
1177                     return FP_LIT_TK;
1178
1179                   JAVA_FLOAT_RANGE_ERROR ((fflag ? "float" : "double"));
1180 #else
1181                   return FP_LIT_TK;
1182 #endif
1183                 }
1184             }
1185         } /* JAVA_ASCII_FPCHAR (c) */
1186
1187       if (radix == 16 && ! found_hex_digits)
1188         java_lex_error
1189           ("0x must be followed by at least one hexadecimal digit", 0);
1190
1191       /* Here we get back to converting the integral literal.  */
1192       if (c == 'L' || c == 'l')
1193         long_suffix = 1;
1194       else if (radix == 16 && JAVA_ASCII_LETTER (c))
1195         java_lex_error ("Digit out of range in hexadecimal literal", 0);
1196       else if (radix == 8  && JAVA_ASCII_DIGIT (c))
1197         java_lex_error ("Digit out of range in octal literal", 0);
1198       else if (radix == 16 && !literal_index)
1199         java_lex_error ("No digit specified for hexadecimal literal", 0);
1200       else
1201         java_unget_unicode ();
1202
1203 #ifdef JAVA_LEX_DEBUG
1204       literal_token [literal_index] = '\0'; /* So JAVA_LEX_LIT is safe. */
1205       JAVA_LEX_LIT (literal_token, radix);
1206 #endif
1207       /* This section of code is borrowed from gcc/c-lex.c  */
1208       if (!overflow)
1209         {
1210           bytes = GET_TYPE_PRECISION (long_type_node);
1211           for (i = bytes; i < TOTAL_PARTS; i++)
1212             if (parts [i])
1213               {
1214                 overflow = 1;
1215                 break;
1216               }
1217         }
           /* Reassemble the pieces of PARTS into the LOW and HIGH words.  */
1218       high = low = 0;
1219       for (i = 0; i < HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR; i++)
1220         {
1221           high |= ((HOST_WIDE_INT) parts[i + (HOST_BITS_PER_WIDE_INT
1222                                               / HOST_BITS_PER_CHAR)]
1223                    << (i * HOST_BITS_PER_CHAR));
1224           low |= (HOST_WIDE_INT) parts[i] << (i * HOST_BITS_PER_CHAR);
1225         }
1226       /* End borrowed section.  */
1227
1228       /* Range checking */
1229       if (long_suffix)
1230         {
1231           /* 9223372036854775808L is valid if operand of a '-'. Otherwise
1232              9223372036854775807L is the biggest `long' literal that can be
1233              expressed using a 10 radix. For other radixes, everything that
1234              fits within 64 bits is OK. */
1235           int hb = (high >> 31);
1236           if (overflow || (hb && low && radix == 10)
1237               || (hb && high & 0x7fffffff && radix == 10))
1238             JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `long' literal");
1239         }
1240       else
1241         {
1242           /* 2147483648 is valid if operand of a '-'. Otherwise,
1243              2147483647 is the biggest `int' literal that can be
1244              expressed using a 10 radix. For other radixes, everything
1245              that fits within 32 bits is OK.  As all literals are
1246              signed, we sign extend here. */
1247           int hb = (low >> 31) & 0x1;
1248           if (overflow || high || (hb && low & 0x7fffffff && radix == 10))
1249             JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `int' literal");
1250           high = -hb;
1251         }
1252 #ifndef JC1_LITE
           /* The radix-10 flag set here is what error_if_numeric_overflow
              tests later on.  */
1253       value = build_int_2 (low, high);
1254       JAVA_RADIX10_FLAG (value) = radix == 10;
1255       SET_LVAL_NODE_TYPE (value, long_suffix ? long_type_node : int_type_node);
1256 #else
1257       SET_LVAL_NODE_TYPE (build_int_2 (low, high),
1258                           long_suffix ? long_type_node : int_type_node);
1259 #endif
1260       return INT_LIT_TK;
1261     }
1262
1263   /* Character literals */
1264   if (c == '\'')
1265     {
1266       int char_lit;
1267       if ((c = java_get_unicode ()) == '\\')
1268         char_lit = java_parse_escape_sequence ();
1269       else
1270         {
1271           if (c == '\n' || c == '\'')
1272             java_lex_error ("Invalid character literal", 0);
1273           char_lit = c;
1274         }
1275
           /* The character following the literal's value must be the
              closing quote.  */
1276       c = java_get_unicode ();
1277
1278       if ((c == '\n') || (c == UEOF))
1279         java_lex_error ("Character literal not terminated at end of line", 0);
1280       if (c != '\'')
1281         java_lex_error ("Syntax error in character literal", 0);
1282
1283       if (char_lit == JAVA_CHAR_ERROR)
1284         char_lit = 0;           /* We silently convert it to zero */
1285
1286       JAVA_LEX_CHAR_LIT (char_lit);
1287       SET_LVAL_NODE_TYPE (build_int_2 (char_lit, 0), char_type_node);
1288       return CHAR_LIT_TK;
1289     }
1290
1291   /* String literals */
1292   if (c == '"')
1293     {
1294       int no_error;
           /* This `string' shadows the function-level variable of the same
              name.  */
1295       char *string;
1296
           /* Accumulate the contents, UTF-8 encoded, on temporary_obstack
              until the closing quote, a line terminator or end of input.  */
1297       for (no_error = 1, c = java_get_unicode ();
1298            c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
1299         {
1300           if (c == '\\')
1301             c = java_parse_escape_sequence ();
1302           if (c == JAVA_CHAR_ERROR)
1303             {
1304               no_error = 0;
1305               c = 0;            /* We silently convert it to zero.  */
1306             }
1307           java_unicode_2_utf8 (c);
1308         }
1309       if (c == '\n' || c == UEOF) /* ULT */
1310         {
1311           lineno--;             /* Refer to the line the terminator was seen */
1312           java_lex_error ("String not terminated at end of line", 0);
1313           lineno++;
1314         }
1315
1316       obstack_1grow (&temporary_obstack, '\0');
1317       string = obstack_finish (&temporary_obstack);
1318 #ifndef JC1_LITE
           /* A bad escape sequence or a missing closing quote yields
              error_mark_node; STRING_LIT_TK is returned in every case.  */
1319       if (!no_error || (c != '"'))
1320         java_lval->node = error_mark_node; /* Requires further testing FIXME */
1321       else
1322         java_lval->node = build_string (strlen (string), string);
1323 #endif
1324       obstack_free (&temporary_obstack, string);
1325       return STRING_LIT_TK;
1326     }
1327
1328   /* Separator */
       /* BUILD_OPERATOR and BUILD_OPERATOR2 are defined elsewhere.  The cases
          below that end with one of them have neither `break' nor `return'
          of their own, so the macros presumably return from this function
          -- TODO confirm against their definition.  */
1329   switch (c)
1330     {
1331     case '(':
1332       JAVA_LEX_SEP (c);
1333       BUILD_OPERATOR (OP_TK);
1334     case ')':
1335       JAVA_LEX_SEP (c);
1336       return CP_TK;
1337     case '{':
1338       JAVA_LEX_SEP (c);
1339       if (ctxp->ccb_indent == 1)
1340         ctxp->first_ccb_indent1 = lineno;
1341       ctxp->ccb_indent++;
1342       BUILD_OPERATOR (OCB_TK);
1343     case '}':
1344       JAVA_LEX_SEP (c);
1345       ctxp->ccb_indent--;
1346       if (ctxp->ccb_indent == 1)
1347         ctxp->last_ccb_indent1 = lineno;
1348       BUILD_OPERATOR (CCB_TK);
1349     case '[':
1350       JAVA_LEX_SEP (c);
1351       BUILD_OPERATOR (OSB_TK);
1352     case ']':
1353       JAVA_LEX_SEP (c);
1354       return CSB_TK;
1355     case ';':
1356       JAVA_LEX_SEP (c);
1357       return SC_TK;
1358     case ',':
1359       JAVA_LEX_SEP (c);
1360       return C_TK;
1361     case '.':
1362       JAVA_LEX_SEP (c);
1363       BUILD_OPERATOR (DOT_TK);
1364       /*      return DOT_TK; */
1365     }
1366
1367   /* Operators */
       /* Multi-character operators are recognized by reading ahead one
          character at a time; a character that does not extend the operator
          is pushed back with java_unget_unicode.  */
1368   switch (c)
1369     {
1370     case '=':
1371       if ((c = java_get_unicode ()) == '=')
1372         {
1373           BUILD_OPERATOR (EQ_TK);
1374         }
1375       else
1376         {
1377           /* Equals is used in two different locations. In the
1378              variable_declarator: rule, it has to be seen as '=' as opposed
1379              to being seen as an ordinary assignment operator in
1380              assignment_operators: rule.  */
1381           java_unget_unicode ();
1382           BUILD_OPERATOR (ASSIGN_TK);
1383         }
1384
1385     case '>':
1386       switch ((c = java_get_unicode ()))
1387         {
1388         case '=':
1389           BUILD_OPERATOR (GTE_TK);
1390         case '>':
1391           switch ((c = java_get_unicode ()))
1392             {
1393             case '>':
1394               if ((c = java_get_unicode ()) == '=')
1395                 {
1396                   BUILD_OPERATOR2 (ZRS_ASSIGN_TK);
1397                 }
1398               else
1399                 {
1400                   java_unget_unicode ();
1401                   BUILD_OPERATOR (ZRS_TK);
1402                 }
1403             case '=':
1404               BUILD_OPERATOR2 (SRS_ASSIGN_TK);
1405             default:
1406               java_unget_unicode ();
1407               BUILD_OPERATOR (SRS_TK);
1408             }
1409         default:
1410           java_unget_unicode ();
1411           BUILD_OPERATOR (GT_TK);
1412         }
1413
1414     case '<':
1415       switch ((c = java_get_unicode ()))
1416         {
1417         case '=':
1418           BUILD_OPERATOR (LTE_TK);
1419         case '<':
1420           if ((c = java_get_unicode ()) == '=')
1421             {
1422               BUILD_OPERATOR2 (LS_ASSIGN_TK);
1423             }
1424           else
1425             {
1426               java_unget_unicode ();
1427               BUILD_OPERATOR (LS_TK);
1428             }
1429         default:
1430           java_unget_unicode ();
1431           BUILD_OPERATOR (LT_TK);
1432         }
1433
1434     case '&':
1435       switch ((c = java_get_unicode ()))
1436         {
1437         case '&':
1438           BUILD_OPERATOR (BOOL_AND_TK);
1439         case '=':
1440           BUILD_OPERATOR2 (AND_ASSIGN_TK);
1441         default:
1442           java_unget_unicode ();
1443           BUILD_OPERATOR (AND_TK);
1444         }
1445
1446     case '|':
1447       switch ((c = java_get_unicode ()))
1448         {
1449         case '|':
1450           BUILD_OPERATOR (BOOL_OR_TK);
1451         case '=':
1452           BUILD_OPERATOR2 (OR_ASSIGN_TK);
1453         default:
1454           java_unget_unicode ();
1455           BUILD_OPERATOR (OR_TK);
1456         }
1457
1458     case '+':
1459       switch ((c = java_get_unicode ()))
1460         {
1461         case '+':
1462           BUILD_OPERATOR (INCR_TK);
1463         case '=':
1464           BUILD_OPERATOR2 (PLUS_ASSIGN_TK);
1465         default:
1466           java_unget_unicode ();
1467           BUILD_OPERATOR (PLUS_TK);
1468         }
1469
1470     case '-':
1471       switch ((c = java_get_unicode ()))
1472         {
1473         case '-':
1474           BUILD_OPERATOR (DECR_TK);
1475         case '=':
1476           BUILD_OPERATOR2 (MINUS_ASSIGN_TK);
1477         default:
1478           java_unget_unicode ();
1479           BUILD_OPERATOR (MINUS_TK);
1480         }
1481
1482     case '*':
1483       if ((c = java_get_unicode ()) == '=')
1484         {
1485           BUILD_OPERATOR2 (MULT_ASSIGN_TK);
1486         }
1487       else
1488         {
1489           java_unget_unicode ();
1490           BUILD_OPERATOR (MULT_TK);
1491         }
1492
1493     case '/':
1494       if ((c = java_get_unicode ()) == '=')
1495         {
1496           BUILD_OPERATOR2 (DIV_ASSIGN_TK);
1497         }
1498       else
1499         {
1500           java_unget_unicode ();
1501           BUILD_OPERATOR (DIV_TK);
1502         }
1503
1504     case '^':
1505       if ((c = java_get_unicode ()) == '=')
1506         {
1507           BUILD_OPERATOR2 (XOR_ASSIGN_TK);
1508         }
1509       else
1510         {
1511           java_unget_unicode ();
1512           BUILD_OPERATOR (XOR_TK);
1513         }
1514
1515     case '%':
1516       if ((c = java_get_unicode ()) == '=')
1517         {
1518           BUILD_OPERATOR2 (REM_ASSIGN_TK);
1519         }
1520       else
1521         {
1522           java_unget_unicode ();
1523           BUILD_OPERATOR (REM_TK);
1524         }
1525
1526     case '!':
1527       if ((c = java_get_unicode()) == '=')
1528         {
1529           BUILD_OPERATOR (NEQ_TK);
1530         }
1531       else
1532         {
1533           java_unget_unicode ();
1534           BUILD_OPERATOR (NEG_TK);
1535         }
1536
1537     case '?':
1538       JAVA_LEX_OP ("?");
1539       BUILD_OPERATOR (REL_QM_TK);
1540     case ':':
1541       JAVA_LEX_OP (":");
1542       BUILD_OPERATOR (REL_CL_TK);
1543     case '~':
1544       BUILD_OPERATOR (NOT_TK);
1545     }
1546
1547   /* Keyword, boolean literal or null literal */
       /* Collect every character accepted by JAVA_PART_CHAR_P on
          temporary_obstack (UTF-8 encoded), remembering the first character
          and whether all of them were ASCII.  */
1548   for (first_unicode = c, all_ascii = 1, ascii_index = 0;
1549        JAVA_PART_CHAR_P (c); c = java_get_unicode ())
1550     {
1551       java_unicode_2_utf8 (c);
1552       if (all_ascii && c >= 128)
1553         all_ascii = 0;
1554       ascii_index++;
1555     }
1556
1557   obstack_1grow (&temporary_obstack, '\0');
1558   string = obstack_finish (&temporary_obstack);
       /* Give back the character that ended the scan above.  */
1559   java_unget_unicode ();
1560
1561   /* If we have something all ascii, we consider a keyword, a boolean
1562      literal, a null literal or an all ASCII identifier.  Otherwise,
1563      this is an identifier (possibly not respecting formation rule).  */
1564   if (all_ascii)
1565     {
1566       const struct java_keyword *kw;
1567       if ((kw=java_keyword (string, ascii_index)))
1568         {
1569           JAVA_LEX_KW (string);
1570           switch (kw->token)
1571             {
                 /* All modifiers are returned as the single MODIFIER_TK
                    token; the specific one is passed to SET_MODIFIER_CTX.  */
1572             case PUBLIC_TK:       case PROTECTED_TK: case STATIC_TK:
1573             case ABSTRACT_TK:     case FINAL_TK:     case NATIVE_TK:
1574             case SYNCHRONIZED_TK: case TRANSIENT_TK: case VOLATILE_TK:
1575             case PRIVATE_TK:      case STRICT_TK:
1576               SET_MODIFIER_CTX (kw->token);
1577               return MODIFIER_TK;
                 /* Primitive type names: the type node is handed to
                    SET_LVAL_NODE and a type-class token is returned.  */
1578             case FLOAT_TK:
1579               SET_LVAL_NODE (float_type_node);
1580               return FP_TK;
1581             case DOUBLE_TK:
1582               SET_LVAL_NODE (double_type_node);
1583               return FP_TK;
1584             case BOOLEAN_TK:
1585               SET_LVAL_NODE (boolean_type_node);
1586               return BOOLEAN_TK;
1587             case BYTE_TK:
1588               SET_LVAL_NODE (byte_type_node);
1589               return INTEGRAL_TK;
1590             case SHORT_TK:
1591               SET_LVAL_NODE (short_type_node);
1592               return INTEGRAL_TK;
1593             case INT_TK:
1594               SET_LVAL_NODE (int_type_node);
1595               return INTEGRAL_TK;
1596             case LONG_TK:
1597               SET_LVAL_NODE (long_type_node);
1598               return INTEGRAL_TK;
1599             case CHAR_TK:
1600               SET_LVAL_NODE (char_type_node);
1601               return INTEGRAL_TK;
1602
1603               /* Keyword based literals */
1604             case TRUE_TK:
1605             case FALSE_TK:
1606               SET_LVAL_NODE ((kw->token == TRUE_TK ?
1607                               boolean_true_node : boolean_false_node));
1608               return BOOL_LIT_TK;
1609             case NULL_TK:
1610               SET_LVAL_NODE (null_pointer_node);
1611               return NULL_TK;
1612
1613               /* Some keywords for which we want to retain information on
1614                  the location where they were found */
1615             case CASE_TK:
1616             case DEFAULT_TK:
1617             case SUPER_TK:
1618             case THIS_TK:
1619             case RETURN_TK:
1620             case BREAK_TK:
1621             case CONTINUE_TK:
1622             case TRY_TK:
1623             case CATCH_TK:
1624             case THROW_TK:
1625             case INSTANCEOF_TK:
1626               BUILD_OPERATOR (kw->token);
1627
1628             default:
1629               return kw->token;
1630             }
1631         }
1632     }
1633
1634   /* We may have an ID here */
1635   if (JAVA_START_CHAR_P (first_unicode))
1636     {
1637       JAVA_LEX_ID (string);
1638       java_lval->node = BUILD_ID_WFL (GET_IDENTIFIER (string));
1639       return ID_TK;
1640     }
1641
1642   /* Everything else is an invalid character in the input */
1643   {
1644     char lex_error_buffer [128];
1645     sprintf (lex_error_buffer, "Invalid character `%s' in input",
1646              java_sprint_unicode (ctxp->c_line, ctxp->c_line->current));
1647     java_lex_error (lex_error_buffer, 1);
1648   }
1649   return 0;
1650 }
1651
1652 #ifndef JC1_LITE
1653 /* This is called by the parser to see if an error should be generated
1654    due to numeric overflow.  This function only handles the particular
1655    case of the largest negative value, and is only called in the case
1656    where this value is not preceded by `-'.  */
1657 static void
1658 error_if_numeric_overflow (value)
1659      tree value;
1660 {
1661   if (TREE_CODE (value) == INTEGER_CST && JAVA_RADIX10_FLAG (value))
1662     {
1663       unsigned HOST_WIDE_INT lo, hi;
1664
1665       lo = TREE_INT_CST_LOW (value);
1666       hi = TREE_INT_CST_HIGH (value);
1667       if (TREE_TYPE (value) == long_type_node)
1668         {
1669           int hb = (hi >> 31);
1670           if (hb && !(hi & 0x7fffffff))
1671             java_lex_error ("Numeric overflow for `long' literal", 0);
1672         }
1673       else
1674         {
1675           int hb = (lo >> 31) & 0x1;
1676           if (hb && !(lo & 0x7fffffff))
1677             java_lex_error ("Numeric overflow for `int' literal", 0);
1678         }
1679     }
1680 }
1681 #endif /* JC1_LITE */
1682
1683 static void
1684 java_unicode_2_utf8 (unicode)
1685     unicode_t unicode;
1686 {
1687   if (RANGE (unicode, 0x01, 0x7f))
1688     obstack_1grow (&temporary_obstack, (char)unicode);
1689   else if (RANGE (unicode, 0x80, 0x7ff) || unicode == 0)
1690     {
1691       obstack_1grow (&temporary_obstack,
1692                      (unsigned char)(0xc0 | ((0x7c0 & unicode) >> 6)));
1693       obstack_1grow (&temporary_obstack,
1694                      (unsigned char)(0x80 | (unicode & 0x3f)));
1695     }
1696   else                          /* Range 0x800-0xffff */
1697     {
1698       obstack_1grow (&temporary_obstack,
1699                      (unsigned char)(0xe0 | (unicode & 0xf000) >> 12));
1700       obstack_1grow (&temporary_obstack,
1701                      (unsigned char)(0x80 | (unicode & 0x0fc0) >> 6));
1702       obstack_1grow (&temporary_obstack,
1703                      (unsigned char)(0x80 | (unicode & 0x003f)));
1704     }
1705 }
1706
1707 #ifndef JC1_LITE
1708 static tree
1709 build_wfl_node (node)
1710      tree node;
1711 {
1712   node = build_expr_wfl (node, ctxp->filename, ctxp->elc.line, ctxp->elc.col);
1713   /* Prevent java_complete_lhs from short-circuiting node (if constant). */
1714   TREE_TYPE (node) = NULL_TREE;
1715   return node;
1716 }
1717 #endif
1718
1719 static void
1720 java_lex_error (msg, forward)
1721      const char *msg ATTRIBUTE_UNUSED;
1722      int forward ATTRIBUTE_UNUSED;
1723 {
1724 #ifndef JC1_LITE
1725   ctxp->elc.line = ctxp->c_line->lineno;
1726   ctxp->elc.col = ctxp->c_line->char_col-1+forward;
1727
1728   /* Might be caught in the middle of some error report */
1729   ctxp->java_error_flag = 0;
1730   java_error (NULL);
1731   java_error (msg);
1732 #endif
1733 }
1734
1735 #ifndef JC1_LITE
/* Return 1 if C, a character just read from FP, ends a line, and 0
   otherwise.  Both `\n' and `\r' end a line.  After a `\r' the next
   character is read from FP: a `\n' is consumed so that CR LF counts
   as one terminator, and any other character is pushed back.  */
static int
java_is_eol (fp, c)
  FILE *fp;
  int c;
{
  int lookahead;

  if (c == '\n')
    return 1;
  if (c != '\r')
    return 0;

  /* Lone CR, or the first half of CR LF.  */
  lookahead = getc (fp);
  if (lookahead != '\n' && lookahead != EOF)
    ungetc (lookahead, fp);
  return 1;
}
1755 #endif
1756
1757 char *
1758 java_get_line_col (filename, line, col)
1759      const char *filename ATTRIBUTE_UNUSED;
1760      int line ATTRIBUTE_UNUSED, col ATTRIBUTE_UNUSED;
1761 {
1762 #ifdef JC1_LITE
1763   return 0;
1764 #else
1765   /* Dumb implementation. Doesn't try to cache or optimize things. */
1766   /* First line of the file is line 1, first column is 1 */
1767
1768   /* COL == -1 means, at the CR/LF in LINE */
1769   /* COL == -2 means, at the first non space char in LINE */
1770
1771   FILE *fp;
1772   int c, ccol, cline = 1;
1773   int current_line_col = 0;
1774   int first_non_space = 0;
1775   char *base;
1776
1777   if (!(fp = fopen (filename, "r")))
1778     fatal_io_error ("can't open %s", filename);
1779
1780   while (cline != line)
1781     {
1782       c = getc (fp);
1783       if (c == EOF)
1784         {
1785           static const char msg[] = "<<file too short - unexpected EOF>>";
1786           obstack_grow (&temporary_obstack, msg, sizeof(msg)-1);
1787           goto have_line;
1788         }
1789       if (java_is_eol (fp, c))
1790         cline++;
1791     }
1792
1793   /* Gather the chars of the current line in a buffer */
1794   for (;;)
1795     {
1796       c = getc (fp);
1797       if (c < 0 || java_is_eol (fp, c))
1798         break;
1799       if (!first_non_space && !JAVA_WHITE_SPACE_P (c))
1800         first_non_space = current_line_col;
1801       obstack_1grow (&temporary_obstack, c);
1802       current_line_col++;
1803     }
1804  have_line:
1805
1806   obstack_1grow (&temporary_obstack, '\n');
1807
1808   if (col == -1)
1809     {
1810       col = current_line_col;
1811       first_non_space = 0;
1812     }
1813   else if (col == -2)
1814     col = first_non_space;
1815   else
1816     first_non_space = 0;
1817
1818   /* Place the '^' a the right position */
1819   base = obstack_base (&temporary_obstack);
1820   for (ccol = 1; ccol <= col+3; ccol++)
1821     {
1822       /* Compute \t when reaching first_non_space */
1823       char c = (first_non_space ?
1824                 (base [ccol-1] == '\t' ? '\t' : ' ') : ' ');
1825       obstack_1grow (&temporary_obstack, c);
1826     }
1827   obstack_grow0 (&temporary_obstack, "^", 1);
1828
1829   fclose (fp);
1830   return obstack_finish (&temporary_obstack);
1831 #endif
1832 }
1833
1834 #ifndef JC1_LITE
/* Compare the LENGTH bytes of UTF-8 text at STR with the
   zero-terminated string NAME.  Characters are taken from STR one at
   a time with UTF8_GET (defined elsewhere) and matched against the
   successive chars of NAME.  On the first mismatch the difference
   between the two is returned.  If all of NAME matched, the result is
   0 when STR was used up exactly and 1 otherwise.  */
static int
utf8_cmp (str, length, name)
     const unsigned char *str;
     int length;
     const char *name;
{
  const unsigned char *limit = str + length;
  const char *expected = name;

  while (*expected)
    {
      int ch = UTF8_GET (str, limit);
      int delta = ch - *expected;

      if (delta != 0)
	return delta;
      expected++;
    }

  if (str == limit)
    return 0;
  return 1;
}
1853
/* All C++ keywords, including the GNU `__' spellings, in ascending
   byte order.  Entries are grouped by initial below; when adding one,
   keep the whole table sorted.  */

static const char *const cxx_keywords[] =
{
  "_Complex",

  "__alignof", "__alignof__", "__asm", "__asm__",
  "__attribute", "__attribute__",
  "__builtin_va_arg",
  "__complex", "__complex__", "__const", "__const__",
  "__extension__",
  "__imag", "__imag__", "__inline", "__inline__",
  "__label__",
  "__null",
  "__real", "__real__", "__restrict", "__restrict__",
  "__signed", "__signed__",
  "__typeof", "__typeof__",
  "__volatile", "__volatile__",

  "and", "and_eq", "asm", "auto",
  "bitand", "bitor", "bool", "break",
  "case", "catch", "char", "class", "compl",
  "const", "const_cast", "continue",
  "default", "delete", "do", "double", "dynamic_cast",
  "else", "enum", "explicit", "export", "extern",
  "false", "float", "for", "friend",
  "goto",
  "if", "inline", "int",
  "long",
  "mutable",
  "namespace", "new", "not", "not_eq",
  "operator", "or", "or_eq",
  "private", "protected", "public",
  "register", "reinterpret_cast", "return",
  "short", "signed", "sizeof", "static", "static_cast",
  "struct", "switch",
  "template", "this", "throw", "true", "try",
  "typedef", "typeid", "typename", "typeof",
  "union", "unsigned", "using",
  "virtual", "void", "volatile",
  "wchar_t", "while",
  "xor", "xor_eq"
};
1963
1964 /* Return 1 if the LENGTH bytes at NAME spell one of the entries of
        cxx_keywords, optionally followed by any number of `$'
        characters; return 0 otherwise.  The table is searched by
        bisection, comparing with utf8_cmp, so it relies on cxx_keywords
        being sorted.
        NOTE(review): the reason a trailing run of `$' still counts as a
        keyword is not visible here; presumably such names are produced
        elsewhere by appending `$' to identifiers that clash with C++
        keywords -- confirm against the callers.  */
1965
1966 int
1967 cxx_keyword_p (name, length)
1968      const char *name;
1969      int length;
1970 {
       /* FIRST is the lowest candidate index, LAST is one past the
          highest.  OLD remembers the previous midpoint so the loop can
          stop once the midpoint no longer moves.  The initializer of MID
          is redundant: the for statement computes the same value again.  */
1971   int last = ARRAY_SIZE (cxx_keywords);
1972   int first = 0;
1973   int mid = (last + first) / 2;
1974   int old = -1;
1975
1976   for (mid = (last + first) / 2;
1977        mid != old;
1978        old = mid, mid = (last + first) / 2)
1979     {
           /* Compare at most as many bytes of NAME as the keyword is long,
              so that a keyword followed by extra characters can still
              compare equal here.  */
1980       int kwl = strlen (cxx_keywords[mid]);
1981       int min_length = kwl > length ? length : kwl;
1982       int r = utf8_cmp (name, min_length, cxx_keywords[mid]);
1983
1984       if (r == 0)
1985         {
1986           int i;
1987           /* We've found a match if all the remaining characters are
1988              `$'.  */
1989           for (i = min_length; i < length && name[i] == '$'; ++i)
1990             ;
1991           if (i == length)
1992             return 1;
               /* NAME starts with this keyword but continues with something
                  other than `$': treat it as sorting after the keyword.  */
1993           r = 1;
1994         }
1995
           /* Narrow the range: a negative R means NAME sorts before entry
              MID, anything else means it sorts at or after it.  */
1996       if (r < 0)
1997         last = mid;
1998       else
1999         first = mid;
2000     }
2001   return 0;
2002 }
2003 #endif /* JC1_LITE */