1 /* Language lexer for the GNU compiler for the Java(TM) language.
2 Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005
3 Free Software Foundation, Inc.
4 Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to
20 the Free Software Foundation, 51 Franklin Street, Fifth Floor,
21 Boston, MA 02110-1301, USA.
23 Java and all Java-based marks are trademarks or registered trademarks
24 of Sun Microsystems, Inc. in the United States and other countries.
25 The Free Software Foundation is independent of Sun Microsystems, Inc. */
27 /* This file defines java_lex (yylex), which reads a Java ASCII source
28 file possibly containing Unicode escape sequences or UTF-8 encoded
29 characters and returns a token for everything found except comments,
30 white space and line terminators. When necessary, it also fills in
31 the java_lval (yylval) union. It is implemented to be called by a
32 re-entrant parser generated by Bison.
34 The lexical analysis conforms to the Java grammar described in "The
35 Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
36 Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
40 #include "chartables.h"
45 /* Function declarations. */
46 static char *java_sprint_unicode (int);
47 static void java_unicode_2_utf8 (unicode_t
);
48 static void java_lex_error (const char *, int);
50 static int do_java_lex (YYSTYPE
*);
51 static int java_lex (YYSTYPE
*);
52 static int java_is_eol (FILE *, int);
53 static tree
build_wfl_node (tree
);
55 static int java_parse_escape_sequence (void);
56 static int java_start_char_p (unicode_t
);
57 static int java_part_char_p (unicode_t
);
58 static int java_space_char_p (unicode_t
);
59 static void java_parse_doc_section (int);
60 static void java_parse_end_comment (int);
61 static int java_read_char (java_lexer
*);
62 static int java_get_unicode (void);
63 static int java_peek_unicode (void);
64 static void java_next_unicode (void);
65 static int java_read_unicode (java_lexer
*, int *);
67 static int utf8_cmp (const unsigned char *, int, const char *);
70 java_lexer
*java_new_lexer (FILE *, const char *);
72 static void error_if_numeric_overflow (tree
);
76 /* This is nonzero if we have initialized `need_byteswap'. */
77 static int byteswap_init
= 0;
79 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
80 big-endian order -- not native endian order. We handle this by
81 doing a conversion once at startup and seeing what happens. This
82 flag holds the results of this determination. */
83 static int need_byteswap
= 0;
87 java_init_lex (FILE *finput
, const char *encoding
)
90 int java_lang_imported
= 0;
93 java_lang_id
= get_identifier ("java.lang");
95 inst_id
= get_identifier ("inst$");
97 wpv_id
= get_identifier ("write_parm_value$");
99 if (!java_lang_imported
)
101 tree node
= build_tree_list (build_unknown_wfl (java_lang_id
),
103 read_import_dir (TREE_PURPOSE (node
));
104 TREE_CHAIN (node
) = ctxp
->import_demand_list
;
105 ctxp
->import_demand_list
= node
;
106 java_lang_imported
= 1;
112 #ifdef USE_MAPPED_LOCATION
113 wfl_operator
= build_expr_wfl (NULL_TREE
, input_location
);
115 wfl_operator
= build_expr_wfl (NULL_TREE
, ctxp
->filename
, 0, 0);
120 label_id
= get_identifier ("$L");
122 wfl_append
= build_unknown_wfl (get_identifier ("append"));
123 if (!wfl_string_buffer
)
125 build_unknown_wfl (get_identifier (flag_emit_class_files
126 ? "java.lang.StringBuffer"
127 : "gnu.gcj.runtime.StringBuffer"));
129 wfl_to_string
= build_unknown_wfl (get_identifier ("toString"));
131 CPC_INITIALIZER_LIST (ctxp
) = CPC_STATIC_INITIALIZER_LIST (ctxp
) =
132 CPC_INSTANCE_INITIALIZER_LIST (ctxp
) = NULL_TREE
;
134 memset (ctxp
->modifier_ctx
, 0, sizeof (ctxp
->modifier_ctx
));
135 ctxp
->current_parsed_class
= NULL
;
136 ctxp
->package
= NULL_TREE
;
140 ctxp
->save_location
= input_location
;
142 ctxp
->java_error_flag
= 0;
143 ctxp
->lexer
= java_new_lexer (finput
, encoding
);
147 java_sprint_unicode (int c
)
149 static char buffer
[10];
/* Format C as text.  Values below ' ' or at/above 127 are written to
   BUFFER as a backslash-u escape followed by at least four lowercase,
   zero-padded hex digits.  BUFFER is static, so its contents are
   overwritten by the next call.
   NOTE(review): the branch for the remaining (printable) values and the
   return statement are not visible in this view; presumably BUFFER is
   what gets returned -- confirm.  */
150 if (c
< ' ' || c
>= 127)
151 sprintf (buffer
, "\\u%04x", c
);
160 /* Create a new lexer object. */
163 java_new_lexer (FILE *finput
, const char *encoding
)
165 java_lexer
*lex
= XNEW (java_lexer
);
168 lex
->finput
= finput
;
170 lex
->unget_value
= 0;
171 lex
->next_unicode
= 0;
172 lex
->avail_unicode
= 0;
173 lex
->next_columns
= 1;
174 lex
->encoding
= encoding
;
175 lex
->position
.line
= 1;
176 lex
->position
.col
= 1;
178 #ifdef USE_MAPPED_LOCATION
180 = linemap_line_start (&line_table
, 1, 120);
187 lex
->handle
= iconv_open ("UCS-2", encoding
);
188 if (lex
->handle
!= (iconv_t
) -1)
194 lex
->read_anything
= 0;
195 lex
->use_fallback
= 0;
197 /* Work around broken iconv() implementations by doing checking at
198 runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
199 then all UCS-2 encoders will be broken. Perhaps not a valid
207 handle
= iconv_open ("UCS-2", "UTF-8");
208 if (handle
!= (iconv_t
) -1)
215 /* This is the UTF-8 encoding of \ufeff. */
222 outp
= (char *) &result
;
225 r
= iconv (handle
, (ICONV_CONST
char **) &inp
, &inc
,
227 iconv_close (handle
);
228 /* Conversion must be complete for us to use the result. */
229 if (r
!= (size_t) -1 && inc
== 0 && outc
== 0)
230 need_byteswap
= (result
!= 0xfeff);
234 lex
->byte_swap
= need_byteswap
;
237 #endif /* HAVE_ICONV */
239 /* If iconv failed, use the internal decoder if the default
240 encoding was requested. This code is used on platforms where
241 iconv exists but is insufficient for our needs. For
242 instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2.
244 On Solaris the default encoding, as returned by nl_langinfo(),
245 is `646' (aka ASCII), but the Solaris iconv_open() doesn't
246 understand that. We work around that by pretending
247 `646' to be the same as UTF-8. */
248 if (strcmp (encoding
, DEFAULT_ENCODING
) && strcmp (encoding
, "646"))
253 lex
->use_fallback
= 1;
254 lex
->encoding
= "UTF-8";
256 #endif /* HAVE_ICONV */
260 fatal_error ("unknown encoding: %qs\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation. If you aren't trying\nto use a particular encoding for your input file, try the\n%<--encoding=UTF-8%> option", encoding
);
266 java_destroy_lexer (java_lexer
*lex
)
/* The iconv conversion descriptor held in LEX->handle is closed only
   when LEX is not marked as using the fallback decoder (use_fallback
   is zero).
   NOTE(review): any further cleanup of LEX (for example releasing the
   lexer object itself) is not visible in this view -- confirm.  */
269 if (! lex
->use_fallback
)
270 iconv_close (lex
->handle
);
276 java_read_char (java_lexer
*lex
)
279 if (! lex
->use_fallback
)
281 size_t ir
, inbytesleft
, in_save
, out_count
, out_save
;
285 /* If there is data which has already been converted, use it. */
286 if (lex
->out_first
== -1 || lex
->out_first
>= lex
->out_last
)
293 /* See if we need to read more data. If FIRST == 0 then
294 the previous conversion attempt ended in the middle of
295 a character at the end of the buffer. Otherwise we
296 only have to read if the buffer is empty. */
297 if (lex
->first
== 0 || lex
->first
>= lex
->last
)
301 if (lex
->first
>= lex
->last
)
306 if (feof (lex
->finput
))
308 r
= fread (&lex
->buffer
[lex
->last
], 1,
309 sizeof (lex
->buffer
) - lex
->last
,
314 inbytesleft
= lex
->last
- lex
->first
;
315 out_count
= sizeof (lex
->out_buffer
) - lex
->out_last
;
317 if (inbytesleft
== 0)
319 /* We've tried to read and there is nothing left. */
323 in_save
= inbytesleft
;
324 out_save
= out_count
;
325 inp
= &lex
->buffer
[lex
->first
];
326 outp
= (char *) &lex
->out_buffer
[lex
->out_last
];
327 ir
= iconv (lex
->handle
, (ICONV_CONST
char **) &inp
,
328 &inbytesleft
, &outp
, &out_count
);
330 /* If we haven't read any bytes, then look to see if we
332 if (! lex
->read_anything
&& out_save
- out_count
>= 2)
334 unicode_t uc
= * (unicode_t
*) &lex
->out_buffer
[0];
340 else if (uc
== 0xfffe)
345 lex
->read_anything
= 1;
351 for (i
= 0; i
< out_save
- out_count
; i
+= 2)
353 char t
= lex
->out_buffer
[lex
->out_last
+ i
];
354 lex
->out_buffer
[lex
->out_last
+ i
]
355 = lex
->out_buffer
[lex
->out_last
+ i
+ 1];
356 lex
->out_buffer
[lex
->out_last
+ i
+ 1] = t
;
360 lex
->first
+= in_save
- inbytesleft
;
361 lex
->out_last
+= out_save
- out_count
;
363 /* If we converted anything at all, move along. */
364 if (out_count
!= out_save
)
367 if (ir
== (size_t) -1)
371 /* This is ok. This means that the end of our buffer
372 is in the middle of a character sequence. We just
373 move the valid part of the buffer to the beginning
375 memmove (&lex
->buffer
[0], &lex
->buffer
[lex
->first
],
376 lex
->last
- lex
->first
);
377 lex
->last
-= lex
->first
;
382 /* A more serious error. */
385 "Unrecognized character for encoding '%s'",
387 java_lex_error (buffer
, 0);
394 if (lex
->out_first
== -1 || lex
->out_first
>= lex
->out_last
)
396 /* Don't have any data. */
401 result
= * ((unicode_t
*) &lex
->out_buffer
[lex
->out_first
]);
406 #endif /* HAVE_ICONV */
409 c
= getc (lex
->finput
);
414 return (unicode_t
) c
;
417 if ((c
& 0xe0) == 0xc0)
419 c1
= getc (lex
->finput
);
420 if ((c1
& 0xc0) == 0x80)
422 unicode_t r
= (unicode_t
)(((c
& 0x1f) << 6) + (c1
& 0x3f));
423 /* Check for valid 2-byte characters. We explicitly
424 allow \0 because this encoding is common in the
426 if (r
== 0 || (r
>= 0x80 && r
<= 0x7ff))
430 else if ((c
& 0xf0) == 0xe0)
432 c1
= getc (lex
->finput
);
433 if ((c1
& 0xc0) == 0x80)
435 c2
= getc (lex
->finput
);
436 if ((c2
& 0xc0) == 0x80)
438 unicode_t r
= (unicode_t
)(((c
& 0xf) << 12) +
441 /* Check for valid 3-byte characters.
442 Don't allow surrogate, \ufffe or \uffff. */
443 if (IN_RANGE (r
, 0x800, 0xffff)
444 && ! IN_RANGE (r
, 0xd800, 0xdfff)
445 && r
!= 0xfffe && r
!= 0xffff)
451 /* We simply don't support invalid characters. We also
452 don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
453 cannot be valid Java characters. */
454 java_lex_error ("malformed UTF-8 character", 0);
458 /* We only get here on error. */
463 java_read_unicode (java_lexer
*lex
, int *unicode_escape_p
)
467 if (lex
->unget_value
)
469 c
= lex
->unget_value
;
470 lex
->unget_value
= 0;
473 c
= java_read_char (lex
);
475 *unicode_escape_p
= 0;
484 if ((lex
->bs_count
) % 2 == 1)
486 /* Odd number of \ seen. */
487 c
= java_read_char (lex
);
490 unicode_t unicode
= 0;
493 /* Recognize any number of `u's in \u. */
494 while ((c
= java_read_char (lex
)) == 'u')
502 java_lex_error ("prematurely terminated \\u sequence", 0);
507 unicode
|= (unicode_t
)(hex_value (c
) << shift
);
510 java_lex_error ("non-hex digit in \\u sequence", 0);
514 c
= java_read_char (lex
);
520 lex
->unget_value
= c
;
523 *unicode_escape_p
= 1;
526 lex
->unget_value
= c
;
528 return (unicode_t
) '\\';
531 /* Get the next Unicode character (post-Unicode-escape-handling).
532 Move the current position to just after the returned character.
    This is a peek (java_peek_unicode) followed by an advance
    (java_next_unicode).  */
535 java_get_unicode (void)
537 int next
= java_peek_unicode ();
538 java_next_unicode ();
542 /* Return the next Unicode character (post-Unicode-escape-handling).
543 Do not move the current position, which remains just before
544 the returned character. */
547 java_peek_unicode (void)
549 int unicode_escape_p
;
550 java_lexer
*lex
= ctxp
->lexer
;
553 if (lex
->avail_unicode
)
554 return lex
->next_unicode
;
556 next
= java_read_unicode (lex
, &unicode_escape_p
);
560 /* We have to read ahead to see if we got \r\n.
561 In that case we return a single line terminator. */
563 next
= java_read_unicode (lex
, &dummy
);
564 if (next
!= '\n' && next
!= UEOF
)
565 lex
->unget_value
= next
;
566 /* In either case we must return a newline. */
570 lex
->next_unicode
= next
;
571 lex
->avail_unicode
= 1;
575 lex
->next_columns
= 0;
581 lex
->next_columns
= 1 - lex
->position
.col
;
583 else if (next
== '\t')
585 int cur_col
= lex
->position
.col
;
586 lex
->next_columns
= ((cur_col
+ 7) & ~7) + 1 - cur_col
;
591 lex
->next_columns
= 1;
593 if (unicode_escape_p
)
594 lex
->next_columns
= 6;
598 /* Move forward one Unicode character (post-Unicode-escape-handling).
599 Only allowed after java_peek_unicode. The combination java_peek_unicode
600 followed by java_next_unicode is equivalent to java_get_unicode. */
602 static void java_next_unicode (void)
604 struct java_lexer
*lex
= ctxp
->lexer
;
/* Apply the column delta that java_peek_unicode stored in next_columns
   for the pending character.  */
605 lex
->position
.col
+= lex
->next_columns
;
/* Consuming a newline also advances the lexer's line counter and
   refreshes the global source position from it.  */
606 if (lex
->next_unicode
== '\n')
608 lex
->position
.line
++;
/* NOTE(review): the assignment target of linemap_line_start and the
   matching #else/#endif lines are not visible in this view.  */
610 #ifdef USE_MAPPED_LOCATION
612 = linemap_line_start (&line_table
, lex
->position
.line
, 120);
614 input_line
= lex
->position
.line
;
/* The peeked character has now been consumed, so the next call to
   java_peek_unicode must read a fresh one.  */
618 lex
->avail_unicode
= 0;
622 /* The inverse of java_next_unicode.
623 Not currently used, but could be if it would be cleaner or faster.
624 java_peek_unicode == java_get_unicode + java_unget_unicode.
625 java_get_unicode == java_peek_unicode + java_next_unicode.
627 static void java_unget_unicode ()
629 struct java_lexer
*lex
= ctxp
->lexer
;
630 if (lex
->avail_unicode
)
631 fatal_error ("internal error - bad unget");
632 lex
->avail_unicode
= 1;
633 lex
->position
.col
-= lex
->next_columns
;
637 /* Parse the end of a C style comment.
638 * C is the first character following the '/' and '*'. */
640 java_parse_end_comment (int c
)
642 for ( ;; c
= java_get_unicode ())
647 java_lex_error ("Comment not terminated at end of input", 0);
650 switch (c
= java_peek_unicode ())
653 java_lex_error ("Comment not terminated at end of input", 0);
656 java_next_unicode ();
658 case '*': /* Reparse only '*'. */
665 /* Parse the documentation section. Keywords must be at the beginning
666 of a documentation comment line (ignoring white space and any `*'
667 character). Parsed keyword(s): @DEPRECATED. */
670 java_parse_doc_section (int c
)
674 /* We reset this here, because only the most recent doc comment
675 applies to the following declaration. */
676 ctxp
->deprecated
= 0;
678 /* We loop over all the lines of the comment. We'll eventually exit
679 if we hit EOF prematurely, or when we see the comment
683 /* These first steps need only be done if we're still looking
684 for the deprecated tag. If we've already seen it, we might
685 as well skip looking for it again. */
686 if (! ctxp
->deprecated
)
688 /* Skip whitespace and '*'s. We must also check for the end
689 of the comment here. */
690 while (JAVA_WHITE_SPACE_P (c
) || c
== '*')
692 last_was_star
= (c
== '*');
693 c
= java_get_unicode ();
694 if (last_was_star
&& c
== '/')
696 /* We just saw the comment terminator. */
706 const char *deprecated
= "@deprecated";
709 for (i
= 0; deprecated
[i
]; ++i
)
711 if (c
!= deprecated
[i
])
713 /* We write the code in this way, with the
714 update at the end, so that after the loop
715 we're left with the next character in C. */
716 c
= java_get_unicode ();
722 /* @deprecated must be followed by a space or newline.
723 We also allow a '*' in case it appears just before
724 the end of a comment. In this position only we also
725 must allow any Unicode space character. */
726 if (c
== ' ' || c
== '\n' || c
== '*' || java_space_char_p (c
))
729 ctxp
->deprecated
= 1;
734 /* We've examined the relevant content from this line. Now we
735 skip the remaining characters and start over with the next
736 line. We also check for end of comment here. */
737 while (c
!= '\n' && c
!= UEOF
)
739 last_was_star
= (c
== '*');
740 c
= java_get_unicode ();
741 if (last_was_star
&& c
== '/')
747 /* We have to advance past the \n. */
748 c
= java_get_unicode ();
754 java_lex_error ("Comment not terminated at end of input", 0);
757 /* Return true if C is a valid start character for a Java identifier.
758 This is only called if C >= 128 -- smaller values are handled
759 inline. However, this function handles all values anyway. */
761 java_start_char_p (unicode_t c
)
763 unsigned int hi
= c
/ 256;
764 const char *const page
= type_table
[hi
];
765 unsigned long val
= (unsigned long) page
;
/* TYPE_TABLE is indexed by the high byte of C.  When the entry has bits
   set outside LETTER_MASK it is used as a pointer to a per-page array
   of flags, indexed by the low byte of C.
   NOTE(review): the declaration of FLAGS and the else branch are not
   visible in this view; presumably the entry itself then serves as the
   flags value for the whole page -- confirm.  */
768 if ((val
& ~ LETTER_MASK
) != 0)
769 flags
= page
[c
& 255];
/* Only the LETTER_START bit of the flags decides the result.  */
773 return flags
& LETTER_START
;
776 /* Return true if C is a valid part character for a Java identifier.
777 This is only called if C >= 128 -- smaller values are handled
778 inline. However, this function handles all values anyway. */
780 java_part_char_p (unicode_t c
)
782 unsigned int hi
= c
/ 256;
783 const char *const page
= type_table
[hi
];
784 unsigned long val
= (unsigned long) page
;
/* TYPE_TABLE is indexed by the high byte of C.  When the entry has bits
   set outside LETTER_MASK it is used as a pointer to a per-page array
   of flags, indexed by the low byte of C.
   NOTE(review): the declaration of FLAGS and the else branch are not
   visible in this view; presumably the entry itself then serves as the
   flags value for the whole page -- confirm.  */
787 if ((val
& ~ LETTER_MASK
) != 0)
788 flags
= page
[c
& 255];
/* Only the LETTER_PART bit of the flags decides the result.  */
792 return flags
& LETTER_PART
;
795 /* Return true if C is whitespace. */
797 java_space_char_p (unicode_t c
)
799 unsigned int hi
= c
/ 256;
800 const char *const page
= type_table
[hi
];
801 unsigned long val
= (unsigned long) page
;
/* TYPE_TABLE is indexed by the high byte of C.  When the entry has bits
   set outside LETTER_MASK it is used as a pointer to a per-page array
   of flags, indexed by the low byte of C.
   NOTE(review): the declaration of FLAGS and the else branch are not
   visible in this view; presumably the entry itself then serves as the
   flags value for the whole page -- confirm.  */
804 if ((val
& ~ LETTER_MASK
) != 0)
805 flags
= page
[c
& 255];
/* Only the LETTER_SPACE bit of the flags decides the result.  */
809 return flags
& LETTER_SPACE
;
813 java_parse_escape_sequence (void)
817 switch (c
= java_get_unicode ())
820 return (unicode_t
)0x8;
822 return (unicode_t
)0x9;
824 return (unicode_t
)0xa;
826 return (unicode_t
)0xc;
828 return (unicode_t
)0xd;
830 return (unicode_t
)0x22;
832 return (unicode_t
)0x27;
834 return (unicode_t
)0x5c;
835 case '0': case '1': case '2': case '3': case '4':
836 case '5': case '6': case '7':
839 unicode_t char_lit
= 0;
843 /* According to the grammar, `\477' has a well-defined
844 meaning -- it is `\47' followed by `7'. */
850 char_lit
= 8 * char_lit
+ c
- '0';
853 c
= java_peek_unicode ();
854 if (! RANGE (c
, '0', '7'))
856 java_next_unicode ();
862 java_lex_error ("Invalid character in escape sequence", -1);
863 return JAVA_CHAR_ERROR
;
868 #define IS_ZERO(X) REAL_VALUES_EQUAL (X, dconst0)
870 /* Subroutine of java_lex: converts floating-point literals to tree
871 nodes. LITERAL_TOKEN is the input literal, JAVA_LVAL is where to
872 store the result. FFLAG indicates whether the literal was tagged
873 with an 'f', indicating it is of type 'float'; NUMBER_BEGINNING
874 is the column at which the literal starts, used to position errors. */
876 static void java_perform_atof (YYSTYPE
*, char *, int, int);
879 java_perform_atof (YYSTYPE
*java_lval
, char *literal_token
, int fflag
,
880 int number_beginning
)
882 REAL_VALUE_TYPE value
;
883 tree type
= (fflag
? FLOAT_TYPE_NODE
: DOUBLE_TYPE_NODE
);
885 SET_REAL_VALUE_ATOF (value
,
886 REAL_VALUE_ATOF (literal_token
, TYPE_MODE (type
)));
888 if (REAL_VALUE_ISINF (value
) || REAL_VALUE_ISNAN (value
))
890 JAVA_FLOAT_RANGE_ERROR (fflag
? "float" : "double");
893 else if (IS_ZERO (value
))
895 /* We check to see if the value is really 0 or if we've found an
896 underflow. We do this in the most primitive imaginable way. */
898 char *p
= literal_token
;
901 while (*p
&& *p
!= 'e' && *p
!= 'E')
903 if (*p
!= '0' && *p
!= '.')
912 int save_col
= ctxp
->lexer
->position
.col
;
913 ctxp
->lexer
->position
.col
= number_beginning
;
914 java_lex_error ("Floating point literal underflow", 0);
915 ctxp
->lexer
->position
.col
= save_col
;
919 SET_LVAL_NODE (build_real (type
, value
));
923 static int yylex (YYSTYPE
*);
927 yylex (YYSTYPE
*java_lval
)
929 do_java_lex (YYSTYPE
*java_lval
)
935 /* Translation of the Unicode escape in the raw stream of Unicode
936 characters. Takes care of line terminator. */
938 /* Skip white spaces: SP, TAB and FF or ULT. */
941 c
= java_peek_unicode ();
942 if (c
!= '\n' && ! JAVA_WHITE_SPACE_P (c
))
944 java_next_unicode ();
947 /* Handle EOF here. */
948 if (c
== UEOF
) /* Should probably do something here... */
952 #ifdef USE_MAPPED_LOCATION
953 LINEMAP_POSITION_FOR_COLUMN (input_location
, &line_table
,
954 ctxp
->lexer
->position
.col
);
956 ctxp
->lexer
->token_start
= ctxp
->lexer
->position
;
960 /* Numeric literals. */
961 if (JAVA_ASCII_DIGIT (c
) || (c
== '.'))
963 /* This section of code is borrowed from gcc/c-lex.c. */
964 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
965 int parts
[TOTAL_PARTS
];
966 HOST_WIDE_INT high
, low
;
967 /* End borrowed section. */
969 #define MAX_TOKEN_LEN 256
970 char literal_token
[MAX_TOKEN_LEN
+ 1];
971 int literal_index
= 0, radix
= 10, long_suffix
= 0, overflow
= 0, bytes
;
972 int found_hex_digits
= 0, found_non_octal_digits
= -1;
975 int number_beginning
= ctxp
->lexer
->position
.col
;
979 for (i
= 0; i
< TOTAL_PARTS
; i
++)
984 java_next_unicode ();
985 c
= java_peek_unicode ();
986 if (c
== 'x' || c
== 'X')
989 java_next_unicode ();
990 c
= java_peek_unicode ();
992 else if (JAVA_ASCII_DIGIT (c
))
994 literal_token
[literal_index
++] = '0';
997 else if (c
== '.' || c
== 'e' || c
=='E')
999 literal_token
[literal_index
++] = '0';
1000 /* Handle C during floating-point parsing. */
1004 /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}. */
1008 java_next_unicode ();
1009 SET_LVAL_NODE (long_zero_node
);
1010 return (INT_LIT_TK
);
1012 java_next_unicode ();
1013 SET_LVAL_NODE (float_zero_node
);
1016 java_next_unicode ();
1017 SET_LVAL_NODE (double_zero_node
);
1020 SET_LVAL_NODE (integer_zero_node
);
1021 return (INT_LIT_TK
);
1026 /* Terminate LITERAL_TOKEN in case we bail out on large tokens. */
1027 literal_token
[MAX_TOKEN_LEN
] = '\0';
1029 /* Parse the first part of the literal, until we find something
1030 which is not a number. */
1031 while ((radix
== 16 ? JAVA_ASCII_HEXDIGIT (c
) : JAVA_ASCII_DIGIT (c
))
1032 && literal_index
< MAX_TOKEN_LEN
)
1034 /* We store the digit in a string (in case this turns out to be an FP
1035 literal) and in PARTS (in case we have to process an integer literal). */
1036 int numeric
= hex_value (c
);
1039 /* Remember when we find a valid hexadecimal digit. */
1041 found_hex_digits
= 1;
1042 /* Remember when we find an invalid octal digit. */
1043 else if (radix
== 8 && numeric
>= 8 && found_non_octal_digits
< 0)
1044 found_non_octal_digits
= literal_index
;
1046 literal_token
[literal_index
++] = c
;
1047 /* This section of code is borrowed from gcc/c-lex.c. */
1048 for (count
= 0; count
< TOTAL_PARTS
; count
++)
1050 parts
[count
] *= radix
;
1053 parts
[count
] += (parts
[count
-1] >> HOST_BITS_PER_CHAR
);
1054 parts
[count
-1] &= (1 << HOST_BITS_PER_CHAR
) - 1;
1057 parts
[0] += numeric
;
1059 if (parts
[TOTAL_PARTS
-1] != 0)
1061 /* End borrowed section. */
1062 java_next_unicode ();
1063 c
= java_peek_unicode ();
1066 /* If we have something from the FP char set but not a digit, parse
1068 if (JAVA_ASCII_FPCHAR (c
) && !JAVA_ASCII_DIGIT (c
))
1070 /* stage==0: seen digits only
1071 * stage==1: seen '.'
1072 * stage==2: seen 'e' or 'E'.
1073 * stage==3: seen '+' or '-' after 'e' or 'E'.
1074 * stage==4: seen type suffix ('f'/'F'/'d'/'D')
1077 int seen_digit
= (literal_index
? 1 : 0);
1078 int seen_exponent
= 0;
1079 int fflag
= 0; /* 1 for {f,F}, 0 for {d,D}. FP literal are
1080 double unless specified. */
1082 /* It is ok if the radix is 8 because this just means we've
1083 seen a leading `0'. However, radix==16 is invalid. */
1085 java_lex_error ("Can't express non-decimal FP literal", 0);
1088 for (; literal_index
< MAX_TOKEN_LEN
;)
1095 literal_token
[literal_index
++ ] = c
;
1096 java_next_unicode ();
1097 c
= java_peek_unicode ();
1098 if (literal_index
== 1 && !JAVA_ASCII_DIGIT (c
))
1099 BUILD_OPERATOR (DOT_TK
);
1102 java_lex_error ("Invalid character in FP literal", 0);
1105 if ((c
== 'e' || c
== 'E') && literal_index
< MAX_TOKEN_LEN
)
1109 /* {E,e} must have seen at least a digit. */
1112 ("Invalid FP literal, mantissa must have digit", 0);
1116 literal_token
[literal_index
++] = c
;
1117 java_next_unicode ();
1118 c
= java_peek_unicode ();
1121 java_lex_error ("Invalid character in FP literal", 0);
1123 if ( c
== 'f' || c
== 'F' || c
== 'd' || c
== 'D')
1125 fflag
= ((c
== 'd') || (c
== 'D')) ? 0 : 1;
1126 stage
= 4; /* So we fall through. */
1129 if ((c
=='-' || c
=='+') && stage
== 2
1130 && literal_index
< MAX_TOKEN_LEN
)
1133 literal_token
[literal_index
++] = c
;
1134 java_next_unicode ();
1135 c
= java_peek_unicode ();
1138 if (((stage
== 0 && JAVA_ASCII_FPCHAR (c
))
1139 || (stage
== 1 && JAVA_ASCII_FPCHAR (c
) && !(c
== '.'))
1140 || (stage
== 2 && (JAVA_ASCII_DIGIT (c
) || JAVA_FP_PM (c
)))
1141 || (stage
== 3 && JAVA_ASCII_DIGIT (c
)))
1142 && literal_index
< MAX_TOKEN_LEN
)
1144 if (JAVA_ASCII_DIGIT (c
))
1148 literal_token
[literal_index
++ ] = c
;
1149 java_next_unicode ();
1150 c
= java_peek_unicode ();
1152 else if (literal_index
< MAX_TOKEN_LEN
)
1154 if (stage
== 4) /* Don't push back fF/dD. */
1155 java_next_unicode ();
1157 /* An exponent (if any) must have seen a digit. */
1158 if (seen_exponent
&& !seen_digit
)
1160 ("Invalid FP literal, exponent must have digit", 0);
1162 literal_token
[literal_index
] = '\0';
1165 java_perform_atof (java_lval
, literal_token
,
1166 fflag
, number_beginning
);
1171 } /* JAVA_ASCII_FPCHAR (c) */
1173 /* Here we get back to converting the integral literal. */
1174 if (radix
== 16 && ! found_hex_digits
)
1176 ("0x must be followed by at least one hexadecimal digit", 0);
1177 else if (radix
== 8 && found_non_octal_digits
>= 0)
1179 int back
= literal_index
- found_non_octal_digits
;
1180 ctxp
->lexer
->position
.col
-= back
;
1181 java_lex_error ("Octal literal contains digit out of range", 0);
1182 ctxp
->lexer
->position
.col
+= back
;
1184 else if (c
== 'L' || c
== 'l')
1186 java_next_unicode ();
1190 /* This section of code is borrowed from gcc/c-lex.c. */
1193 bytes
= GET_TYPE_PRECISION (long_type_node
);
1194 for (i
= bytes
; i
< TOTAL_PARTS
; i
++)
1202 for (i
= 0; i
< HOST_BITS_PER_WIDE_INT
/ HOST_BITS_PER_CHAR
; i
++)
1204 high
|= ((HOST_WIDE_INT
) parts
[i
+ (HOST_BITS_PER_WIDE_INT
1205 / HOST_BITS_PER_CHAR
)]
1206 << (i
* HOST_BITS_PER_CHAR
));
1207 low
|= (HOST_WIDE_INT
) parts
[i
] << (i
* HOST_BITS_PER_CHAR
);
1209 /* End borrowed section. */
1212 /* Range checking. */
1213 /* Temporarily set type to unsigned. */
1214 value
= build_int_cst_wide (long_suffix
1215 ? unsigned_long_type_node
1216 : unsigned_int_type_node
, low
, high
);
1217 SET_LVAL_NODE (value
);
1219 /* For base 10 numbers, only values up to the highest value
1220 (plus one) can be written. For instance, only ints up to
1221 2147483648 can be written. The special case of the largest
1222 negative value is handled elsewhere. For other bases, any
1223 number can be represented. */
1224 if (overflow
|| (radix
== 10
1225 && tree_int_cst_lt (long_suffix
1231 JAVA_RANGE_ERROR ("Numeric overflow for 'long' literal");
1233 JAVA_RANGE_ERROR ("Numeric overflow for 'int' literal");
1236 /* Sign extend the value. */
1237 value
= build_int_cst_wide (long_suffix
? long_type_node
: int_type_node
,
1239 value
= force_fit_type (value
, 0, false, false);
1243 value
= copy_node (value
);
1244 JAVA_NOT_RADIX10_FLAG (value
) = 1;
1247 SET_LVAL_NODE (value
);
1252 /* We may have an ID here. */
1253 if (JAVA_START_CHAR_P (c
))
1255 int ascii_index
= 0, all_ascii
= 1;
1257 /* Keyword, boolean literal or null literal. */
1258 while (c
!= UEOF
&& JAVA_PART_CHAR_P (c
))
1260 java_unicode_2_utf8 (c
);
1263 java_next_unicode ();
1265 c
= java_peek_unicode ();
1268 obstack_1grow (&temporary_obstack
, '\0');
1269 string
= obstack_finish (&temporary_obstack
);
1271 /* If the token is all ASCII, it may be a keyword, a boolean
1272 literal, a null literal or an all-ASCII identifier. Otherwise,
1273 it is an identifier (possibly not respecting the formation rules). */
1276 const struct java_keyword
*kw
;
1277 if ((kw
=java_keyword (string
, ascii_index
)))
1281 case PUBLIC_TK
: case PROTECTED_TK
: case STATIC_TK
:
1282 case ABSTRACT_TK
: case FINAL_TK
: case NATIVE_TK
:
1283 case SYNCHRONIZED_TK
: case TRANSIENT_TK
: case VOLATILE_TK
:
1284 case PRIVATE_TK
: case STRICT_TK
:
1285 SET_MODIFIER_CTX (kw
->token
);
1288 SET_LVAL_NODE (float_type_node
);
1291 SET_LVAL_NODE (double_type_node
);
1294 SET_LVAL_NODE (boolean_type_node
);
1297 SET_LVAL_NODE (byte_type_node
);
1300 SET_LVAL_NODE (short_type_node
);
1303 SET_LVAL_NODE (int_type_node
);
1306 SET_LVAL_NODE (long_type_node
);
1309 SET_LVAL_NODE (char_type_node
);
1312 /* Keyword based literals. */
1315 SET_LVAL_NODE ((kw
->token
== TRUE_TK
?
1316 boolean_true_node
: boolean_false_node
));
1319 SET_LVAL_NODE (null_pointer_node
);
1325 BUILD_OPERATOR (kw
->token
);
1331 /* For some keywords we want to retain information on the location
1332 where they were found. */
1344 BUILD_OPERATOR (kw
->token
);
1352 java_lval
->node
= BUILD_ID_WFL (GET_IDENTIFIER (string
));
1356 java_next_unicode ();
1358 /* Character literals. */
1363 if ((c
= java_get_unicode ()) == '\\')
1364 char_lit
= java_parse_escape_sequence ();
1367 if (c
== '\n' || c
== '\'')
1368 java_lex_error ("Invalid character literal", 0);
1372 c
= java_get_unicode ();
1374 if ((c
== '\n') || (c
== UEOF
))
1375 java_lex_error ("Character literal not terminated at end of line", 0);
1377 java_lex_error ("Syntax error in character literal", 0);
1379 if (char_lit
== JAVA_CHAR_ERROR
)
1380 char_lit
= 0; /* We silently convert it to zero. */
1382 SET_LVAL_NODE (build_int_cst (char_type_node
, char_lit
));
1386 /* String literals. */
1394 c
= java_peek_unicode ();
1395 if (c
== '\n' || c
== UEOF
) /* ULT. */
1397 java_lex_error ("String not terminated at end of line", 0);
1400 java_next_unicode ();
1404 c
= java_parse_escape_sequence ();
1405 if (c
== JAVA_CHAR_ERROR
)
1408 c
= 0; /* We silently convert it to zero. */
1410 java_unicode_2_utf8 (c
);
1413 obstack_1grow (&temporary_obstack
, '\0');
1414 string
= obstack_finish (&temporary_obstack
);
1416 if (!no_error
|| (c
!= '"'))
1417 java_lval
->node
= error_mark_node
; /* FIXME: Requires further
1420 java_lval
->node
= build_string (strlen (string
), string
);
1422 obstack_free (&temporary_obstack
, string
);
1423 return STRING_LIT_TK
;
1429 /* Check for comment. */
1430 switch (c
= java_peek_unicode ())
1433 java_next_unicode ();
1436 c
= java_get_unicode ();
1439 /* It is ok to end a `//' comment with EOF, unless
1440 we're being pedantic. */
1442 java_lex_error ("Comment not terminated at end of input",
1446 if (c
== '\n') /* ULT */
1452 java_next_unicode ();
1453 if ((c
= java_get_unicode ()) == '*')
1455 c
= java_get_unicode ();
1458 /* Empty documentation comment. We have to reset
1459 the deprecation marker as only the most recent
1460 doc comment applies. */
1461 ctxp
->deprecated
= 0;
1464 java_parse_doc_section (c
);
1467 java_parse_end_comment ((c
= java_get_unicode ()));
1472 java_next_unicode ();
1473 BUILD_OPERATOR2 (DIV_ASSIGN_TK
);
1476 BUILD_OPERATOR (DIV_TK
);
1480 BUILD_OPERATOR (OP_TK
);
1485 java_lval
->operator.token
= OCB_TK
;
1486 java_lval
->operator.location
= BUILD_LOCATION();
1487 #ifdef USE_MAPPED_LOCATION
1488 if (ctxp
->ccb_indent
== 1)
1489 ctxp
->first_ccb_indent1
= input_location
;
1491 if (ctxp
->ccb_indent
== 1)
1492 ctxp
->first_ccb_indent1
= input_line
;
1500 java_lval
->operator.token
= CCB_TK
;
1501 java_lval
->operator.location
= BUILD_LOCATION();
1502 #ifdef USE_MAPPED_LOCATION
1503 if (ctxp
->ccb_indent
== 1)
1504 ctxp
->last_ccb_indent1
= input_location
;
1506 if (ctxp
->ccb_indent
== 1)
1507 ctxp
->last_ccb_indent1
= input_line
;
1512 BUILD_OPERATOR (OSB_TK
);
1520 BUILD_OPERATOR (DOT_TK
);
1524 c
= java_peek_unicode ();
1527 java_next_unicode ();
1528 BUILD_OPERATOR (EQ_TK
);
1532 /* Equals is used in two different locations. In the
1533 variable_declarator: rule, it has to be seen as '=' as opposed
1534 to being seen as an ordinary assignment operator in
1535 assignment_operators: rule. */
1536 BUILD_OPERATOR (ASSIGN_TK
);
1540 switch ((c
= java_peek_unicode ()))
1543 java_next_unicode ();
1544 BUILD_OPERATOR (GTE_TK
);
1546 java_next_unicode ();
1547 switch ((c
= java_peek_unicode ()))
1550 java_next_unicode ();
1551 c
= java_peek_unicode ();
1554 java_next_unicode ();
1555 BUILD_OPERATOR2 (ZRS_ASSIGN_TK
);
1559 BUILD_OPERATOR (ZRS_TK
);
1562 java_next_unicode ();
1563 BUILD_OPERATOR2 (SRS_ASSIGN_TK
);
1565 BUILD_OPERATOR (SRS_TK
);
1568 BUILD_OPERATOR (GT_TK
);
1572 switch ((c
= java_peek_unicode ()))
1575 java_next_unicode ();
1576 BUILD_OPERATOR (LTE_TK
);
1578 java_next_unicode ();
1579 if ((c
= java_peek_unicode ()) == '=')
1581 java_next_unicode ();
1582 BUILD_OPERATOR2 (LS_ASSIGN_TK
);
1586 BUILD_OPERATOR (LS_TK
);
1589 BUILD_OPERATOR (LT_TK
);
1593 switch ((c
= java_peek_unicode ()))
1596 java_next_unicode ();
1597 BUILD_OPERATOR (BOOL_AND_TK
);
1599 java_next_unicode ();
1600 BUILD_OPERATOR2 (AND_ASSIGN_TK
);
1602 BUILD_OPERATOR (AND_TK
);
1606 switch ((c
= java_peek_unicode ()))
1609 java_next_unicode ();
1610 BUILD_OPERATOR (BOOL_OR_TK
);
1612 java_next_unicode ();
1613 BUILD_OPERATOR2 (OR_ASSIGN_TK
);
1615 BUILD_OPERATOR (OR_TK
);
1619 switch ((c
= java_peek_unicode ()))
1622 java_next_unicode ();
1623 BUILD_OPERATOR (INCR_TK
);
1625 java_next_unicode ();
1626 BUILD_OPERATOR2 (PLUS_ASSIGN_TK
);
1628 BUILD_OPERATOR (PLUS_TK
);
1632 switch ((c
= java_peek_unicode ()))
1635 java_next_unicode ();
1636 BUILD_OPERATOR (DECR_TK
);
1638 java_next_unicode ();
1639 BUILD_OPERATOR2 (MINUS_ASSIGN_TK
);
1641 BUILD_OPERATOR (MINUS_TK
);
1645 if ((c
= java_peek_unicode ()) == '=')
1647 java_next_unicode ();
1648 BUILD_OPERATOR2 (MULT_ASSIGN_TK
);
1652 BUILD_OPERATOR (MULT_TK
);
1656 if ((c
= java_peek_unicode ()) == '=')
1658 java_next_unicode ();
1659 BUILD_OPERATOR2 (XOR_ASSIGN_TK
);
1663 BUILD_OPERATOR (XOR_TK
);
1667 if ((c
= java_peek_unicode ()) == '=')
1669 java_next_unicode ();
1670 BUILD_OPERATOR2 (REM_ASSIGN_TK
);
1674 BUILD_OPERATOR (REM_TK
);
1678 if ((c
= java_peek_unicode()) == '=')
1680 java_next_unicode ();
1681 BUILD_OPERATOR (NEQ_TK
);
1685 BUILD_OPERATOR (NEG_TK
);
1689 BUILD_OPERATOR (REL_QM_TK
);
1691 BUILD_OPERATOR (REL_CL_TK
);
1693 BUILD_OPERATOR (NOT_TK
);
1696 if (c
== 0x1a) /* CTRL-Z. */
1698 if ((c
= java_peek_unicode ()) == UEOF
)
1699 return 0; /* Ok here. */
1702 /* Everything else is an invalid character in the input. */
1704 char lex_error_buffer
[128];
1705 sprintf (lex_error_buffer
, "Invalid character '%s' in input",
1706 java_sprint_unicode (c
));
1707 java_lex_error (lex_error_buffer
, -1);
1714 /* The exported interface to the lexer. */
/* Runs do_java_lex on JAVA_LVAL and keeps its result in R.  The call is
   bracketed by timevar_push and timevar_pop on TV_LEX.  */
1716 java_lex (YYSTYPE
*java_lval
)
1720 timevar_push (TV_LEX
);
1721 r
= do_java_lex (java_lval
);
1722 timevar_pop (TV_LEX
);
1726 /* This is called by the parser to see if an error should be generated
1727 due to numeric overflow. This function only handles the particular
1728 case of the largest negative value, and is only called in the case
1729 where this value is not preceded by `-'. */
/* VALUE is only considered when it is an INTEGER_CST whose
   JAVA_NOT_RADIX10_FLAG is clear and for which tree_int_cst_sgn
   reports a negative sign.  Errors are reported through java_lex_error
   with a FORWARD argument of 0.  */
1731 error_if_numeric_overflow (tree value
)
1733 if (TREE_CODE (value
) == INTEGER_CST
1734 && !JAVA_NOT_RADIX10_FLAG (value
)
1735 && tree_int_cst_sgn (value
) < 0)
/* The wording of the message depends on whether the type of VALUE is
   long_type_node ('long') or not ('int').  */
1737 if (TREE_TYPE (value
) == long_type_node
)
1738 java_lex_error ("Numeric overflow for 'long' literal", 0);
1740 java_lex_error ("Numeric overflow for 'int' literal", 0);
1744 #endif /* JC1_LITE */
/* Append an encoding of UNICODE to temporary_obstack, one byte at a
   time with obstack_1grow:
     0x01-0x7f          one byte, the value itself;
     0x80-0x7ff and 0   two bytes, 0xc0 | bits 10..6 then 0x80 | bits 5..0
                        (so the value 0 is emitted as 0xc0 0x80, never as
                        a zero byte);
     anything else      three bytes, 0xe0 | bits 15..12, 0x80 | bits 11..6,
                        0x80 | bits 5..0.  */
1747 java_unicode_2_utf8 (unicode_t unicode
)
1749 if (RANGE (unicode
, 0x01, 0x7f))
1750 obstack_1grow (&temporary_obstack
, (char)unicode
);
1751 else if (RANGE (unicode
, 0x80, 0x7ff) || unicode
== 0)
1753 obstack_1grow (&temporary_obstack
,
1754 (unsigned char)(0xc0 | ((0x7c0 & unicode
) >> 6)));
1755 obstack_1grow (&temporary_obstack
,
1756 (unsigned char)(0x80 | (unicode
& 0x3f)));
1758 else /* Range 0x800-0xffff. */
/* In the expressions below `>>' binds tighter than `|', so the masked
   bits are shifted first and the marker bits are OR-ed in afterwards.  */
1760 obstack_1grow (&temporary_obstack
,
1761 (unsigned char)(0xe0 | (unicode
& 0xf000) >> 12));
1762 obstack_1grow (&temporary_obstack
,
1763 (unsigned char)(0x80 | (unicode
& 0x0fc0) >> 6));
1764 obstack_1grow (&temporary_obstack
,
1765 (unsigned char)(0x80 | (unicode
& 0x003f)));
/* Replace NODE by the result of build_expr_wfl applied to it, then
   clear the TREE_TYPE of the result.  Two variants of the
   build_expr_wfl call exist: with USE_MAPPED_LOCATION the location is
   input_location; the other variant passes ctxp->filename together
   with the line and column of ctxp->lexer->token_start.  */
1771 build_wfl_node (tree node
)
1773 #ifdef USE_MAPPED_LOCATION
1774 node
= build_expr_wfl (node
, input_location
);
1776 node
= build_expr_wfl (node
, ctxp
->filename
,
1777 ctxp
->lexer
->token_start
.line
,
1778 ctxp
->lexer
->token_start
.col
);
1780 /* Prevent java_complete_lhs from short-circuiting node (if constant). */
1781 TREE_TYPE (node
) = NULL_TREE
;
/* Report the lexical error MSG.  FORWARD scales
   ctxp->lexer->next_columns and is added to the lexer's current column
   to obtain COL, the column the error is attributed to.  Both
   parameters are marked ATTRIBUTE_UNUSED.  */
1787 java_lex_error (const char *msg ATTRIBUTE_UNUSED
, int forward ATTRIBUTE_UNUSED
)
1790 int col
= (ctxp
->lexer
->position
.col
1791 + forward
* ctxp
->lexer
->next_columns
);
/* Mapped-location variant: input_location is saved, moved to COL with
   LINEMAP_POSITION_FOR_COLUMN, and restored at the end.
   NOTE(review): USE_MAPPED_LOCATION is tested with `#if' here but with
   `#ifdef' in build_wfl_node -- confirm the difference is intended.  */
1792 #if USE_MAPPED_LOCATION
1793 source_location save_location
= input_location
;
1794 LINEMAP_POSITION_FOR_COLUMN (input_location
, &line_table
, col
);
1796 /* Might be caught in the middle of some error report. */
1797 ctxp
->java_error_flag
= 0;
1800 input_location
= save_location
;
/* Other variant: the lexer's token_start is saved, pointed at the
   current line and COL, and restored at the end.  */
1802 java_lc save
= ctxp
->lexer
->token_start
;
1803 ctxp
->lexer
->token_start
.line
= ctxp
->lexer
->position
.line
;
1804 ctxp
->lexer
->token_start
.col
= col
;
1806 /* Might be caught in the middle of some error report. */
1807 ctxp
->java_error_flag
= 0;
1810 ctxp
->lexer
->token_start
= save
;
/* End-of-line test for character C, taking the stream FP it came from.
   java_get_line_col uses the result as a truth value.  */
1817 java_is_eol (FILE *fp
, int c
)
/* NOTE(review): NEXT is presumably a character looked at after C on FP;
   this test singles out the case where it is neither '\n' nor EOF --
   confirm against the full body.  */
1824 if (next
!= '\n' && next
!= EOF
)
/* Build, in temporary_obstack, the text of line LINE of FILENAME
   followed by a newline and a second line made of padding and a `^'
   marker placed according to COL.  Returns the string obtained from
   obstack_finish.  All three parameters are marked ATTRIBUTE_UNUSED.  */
1836 java_get_line_col (const char *filename ATTRIBUTE_UNUSED
,
1837 int line ATTRIBUTE_UNUSED
, int col ATTRIBUTE_UNUSED
)
1842 /* Dumb implementation. Doesn't try to cache or optimize things. */
1843 /* First line of the file is line 1, first column is 1. */
1845 /* COL == -1 means, at the CR/LF in LINE. */
1846 /* COL == -2 means, at the first non space char in LINE. */
1849 int c
, ccol
, cline
= 1;
1850 int current_line_col
= 0;
1851 int first_non_space
= 0;
/* The file is reopened on every call; failure to open it is fatal.  */
1854 if (!(fp
= fopen (filename
, "r")))
1855 fatal_error ("can't open %s: %m", filename
);
/* Advance to the requested line, counting from CLINE == 1.  */
1857 while (cline
!= line
)
/* MSG stands in for the line text when the file is too short; the
   terminating NUL of MSG is not copied (sizeof - 1).  */
1862 static const char msg
[] = "<<file too short - unexpected EOF>>";
1863 obstack_grow (&temporary_obstack
, msg
, sizeof(msg
)-1);
1866 if (java_is_eol (fp
, c
))
1870 /* Gather the chars of the current line in a buffer. */
/* A negative C or an end of line marks the end of the line text.  */
1874 if (c
< 0 || java_is_eol (fp
, c
))
/* Remember the column of the first character that is not white space.
   Zero doubles as "not found yet".  */
1876 if (!first_non_space
&& !JAVA_WHITE_SPACE_P (c
))
1877 first_non_space
= current_line_col
;
1878 obstack_1grow (&temporary_obstack
, c
);
1883 obstack_1grow (&temporary_obstack
, '\n');
/* Resolve COL to an actual column.  Presumably the three cases below
   correspond to COL == -1, COL == -2 and any other value, as described
   above -- confirm against the full body.  */
1887 col
= current_line_col
;
1888 first_non_space
= 0;
1891 col
= first_non_space
;
1893 first_non_space
= 0;
1895 /* Place the '^' at the right position. */
1896 base
= obstack_base (&temporary_obstack
);
/* COL + 2 padding characters are emitted before the marker.  */
1897 for (col
+= 2, ccol
= 0; ccol
< col
; ccol
++)
1899 /* Compute \t when reaching first_non_space. */
/* This `char c' shadows the `int c' declared at the top of the
   function.  When first_non_space is nonzero, a tab in the gathered
   line is repeated as a tab in the padding; everything else becomes a
   space.  */
1900 char c
= (first_non_space
?
1901 (base
[ccol
] == '\t' ? '\t' : ' ') : ' ');
1902 obstack_1grow (&temporary_obstack
, c
);
/* obstack_grow0 appends the marker followed by a terminating NUL.  */
1904 obstack_grow0 (&temporary_obstack
, "^", 1);
1907 return obstack_finish (&temporary_obstack
);
/* Compare the LENGTH bytes starting at STR, read as characters through
   UTF8_GET, with the NUL-terminated string NAME.  The result is zero
   when every character of NAME was matched and STR was consumed
   exactly; otherwise it is nonzero.  */
1913 utf8_cmp (const unsigned char *str
, int length
, const char *name
)
/* LIMIT is one past the last byte of STR.  */
1915 const unsigned char *limit
= str
+ length
;
1918 for (i
= 0; name
[i
]; ++i
)
/* UTF8_GET yields the next character of STR, bounded by LIMIT.  */
1920 int ch
= UTF8_GET (str
, limit
);
/* The difference between the decoded character and NAME[i] is what gets
   returned from inside the loop (presumably only when they differ).  */
1922 return ch
- name
[i
];
/* NAME is exhausted: equal only if STR is exhausted too.  */
1925 return str
== limit
? 0 : 1;
1928 /* A sorted list of all C++ keywords. */
/* cxx_keyword_p probes this table at the midpoint between its FIRST and
   LAST indices, so the entries have to stay sorted.  */
1930 static const char *const cxx_keywords
[] =
2038 /* Return true if NAME is a C++ keyword. */
/* NAME is LENGTH bytes long.  The search probes cxx_keywords at MID,
   the midpoint between FIRST and LAST.  */
2041 cxx_keyword_p (const char *name
, int length
)
/* LAST starts at the number of entries in the table.  */
2043 int last
= ARRAY_SIZE (cxx_keywords
);
2045 int mid
= (last
+ first
) / 2;
/* OLD remembers the previous midpoint at each step.  */
2048 for (mid
= (last
+ first
) / 2;
2050 old
= mid
, mid
= (last
+ first
) / 2)
/* Only the first MIN_LENGTH bytes of NAME, the shorter of the keyword
   length and LENGTH, are handed to utf8_cmp.  */
2052 int kwl
= strlen (cxx_keywords
[mid
]);
2053 int min_length
= kwl
> length
? length
: kwl
;
2054 int r
= utf8_cmp ((const unsigned char *) name
, min_length
, cxx_keywords
[mid
]);
2059 /* We've found a match if all the remaining characters are `$'. */
2060 for (i
= min_length
; i
< length
&& name
[i
] == '$'; ++i
)
2074 #endif /* JC1_LITE */