1 /* Language lexer for the GNU compiler for the Java(TM) language.
2 Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003
3 Free Software Foundation, Inc.
4 Contributed by Alexandre Petit-Bianco (apbianco@cygnus.com)
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to
20 the Free Software Foundation, 59 Temple Place - Suite 330,
21 Boston, MA 02111-1307, USA.
23 Java and all Java-based marks are trademarks or registered trademarks
24 of Sun Microsystems, Inc. in the United States and other countries.
25 The Free Software Foundation is independent of Sun Microsystems, Inc. */
27 /* It defines java_lex (yylex) that reads a Java ASCII source file
28 possibly containing Unicode escape sequences or UTF-8 encoded
29 characters and returns a token for everything found but comments,
30 white spaces and line terminators. When necessary, it also fills
31 the java_lval (yylval) union. It's implemented to be called by a
32 re-entrant parser generated by Bison.
34 The lexical analysis conforms to the Java grammar described in "The
35 Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
36 Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
40 #include "chartables.h"
42 /* Function declarations. */
43 static char *java_sprint_unicode (struct java_line
*, int);
44 static void java_unicode_2_utf8 (unicode_t
);
45 static void java_lex_error (const char *, int);
47 static int java_is_eol (FILE *, int);
48 static tree
build_wfl_node (tree
);
50 static void java_store_unicode (struct java_line
*, unicode_t
, int);
51 static int java_parse_escape_sequence (void);
52 static int java_start_char_p (unicode_t
);
53 static int java_part_char_p (unicode_t
);
54 static int java_parse_doc_section (int);
55 static void java_parse_end_comment (int);
56 static int java_get_unicode (void);
57 static int java_read_unicode (java_lexer
*, int *);
58 static int java_read_unicode_collapsing_terminators (java_lexer
*, int *);
59 static void java_store_unicode (struct java_line
*, unicode_t
, int);
60 static int java_read_char (java_lexer
*);
61 static void java_allocate_new_line (void);
62 static void java_unget_unicode (void);
63 static unicode_t
java_sneak_unicode (void);
65 static int utf8_cmp (const unsigned char *, int, const char *);
68 java_lexer
*java_new_lexer (FILE *, const char *);
70 static void error_if_numeric_overflow (tree
);
74 /* This is nonzero if we have initialized `need_byteswap'. */
75 static int byteswap_init
= 0;
77 /* Some versions of iconv() (e.g., glibc 2.1.3) will return UCS-2 in
78 big-endian order -- not native endian order. We handle this by
79 doing a conversion once at startup and seeing what happens. This
80 flag holds the results of this determination. */
81 static int need_byteswap
= 0;
85 java_init_lex (FILE *finput
, const char *encoding
)
88 int java_lang_imported
= 0;
91 java_lang_id
= get_identifier ("java.lang");
93 inst_id
= get_identifier ("inst$");
95 wpv_id
= get_identifier ("write_parm_value$");
97 if (!java_lang_imported
)
99 tree node
= build_tree_list
100 (build_expr_wfl (java_lang_id
, NULL
, 0, 0), NULL_TREE
);
101 read_import_dir (TREE_PURPOSE (node
));
102 TREE_CHAIN (node
) = ctxp
->import_demand_list
;
103 ctxp
->import_demand_list
= node
;
104 java_lang_imported
= 1;
108 wfl_operator
= build_expr_wfl (NULL_TREE
, ctxp
->filename
, 0, 0);
110 label_id
= get_identifier ("$L");
112 wfl_append
= build_expr_wfl (get_identifier ("append"), NULL
, 0, 0);
113 if (!wfl_string_buffer
)
115 build_expr_wfl (get_identifier (flag_emit_class_files
116 ? "java.lang.StringBuffer"
117 : "gnu.gcj.runtime.StringBuffer"),
120 wfl_to_string
= build_expr_wfl (get_identifier ("toString"), NULL
, 0, 0);
122 CPC_INITIALIZER_LIST (ctxp
) = CPC_STATIC_INITIALIZER_LIST (ctxp
) =
123 CPC_INSTANCE_INITIALIZER_LIST (ctxp
) = NULL_TREE
;
125 memset (ctxp
->modifier_ctx
, 0, sizeof (ctxp
->modifier_ctx
));
126 current_jcf
= ggc_alloc_cleared (sizeof (JCF
));
127 ctxp
->current_parsed_class
= NULL
;
128 ctxp
->package
= NULL_TREE
;
131 ctxp
->filename
= input_filename
;
132 ctxp
->lineno
= lineno
= 0;
135 ctxp
->java_error_flag
= 0;
136 ctxp
->lexer
= java_new_lexer (finput
, encoding
);
140 java_sprint_unicode (struct java_line
*line
, int i
)
142 static char buffer
[10];
143 if (line
->unicode_escape_p
[i
] || line
->line
[i
] > 128)
144 sprintf (buffer
, "\\u%04x", line
->line
[i
]);
147 buffer
[0] = line
->line
[i
];
154 java_sneak_unicode (void)
156 return (ctxp
->c_line
->line
[ctxp
->c_line
->current
]);
160 java_unget_unicode (void)
162 if (!ctxp
->c_line
->current
)
163 /* Can't unget unicode. */
166 ctxp
->c_line
->current
--;
167 ctxp
->c_line
->char_col
-= JAVA_COLUMN_DELTA (0);
171 java_allocate_new_line (void)
173 unicode_t ahead
= (ctxp
->c_line
? ctxp
->c_line
->ahead
[0] : '\0');
174 char ahead_escape_p
= (ctxp
->c_line
?
175 ctxp
->c_line
->unicode_escape_ahead_p
: 0);
177 if (ctxp
->c_line
&& !ctxp
->c_line
->white_space_only
)
181 free (ctxp
->p_line
->unicode_escape_p
);
182 free (ctxp
->p_line
->line
);
185 ctxp
->p_line
= ctxp
->c_line
;
186 ctxp
->c_line
= NULL
; /* Reallocated. */
191 ctxp
->c_line
= xmalloc (sizeof (struct java_line
));
192 ctxp
->c_line
->max
= JAVA_LINE_MAX
;
193 ctxp
->c_line
->line
= xmalloc (sizeof (unicode_t
)*ctxp
->c_line
->max
);
194 ctxp
->c_line
->unicode_escape_p
=
195 xmalloc (sizeof (char)*ctxp
->c_line
->max
);
196 ctxp
->c_line
->white_space_only
= 0;
199 ctxp
->c_line
->line
[0] = ctxp
->c_line
->size
= 0;
200 ctxp
->c_line
->char_col
= ctxp
->c_line
->current
= 0;
203 ctxp
->c_line
->line
[ctxp
->c_line
->size
] = ahead
;
204 ctxp
->c_line
->unicode_escape_p
[ctxp
->c_line
->size
] = ahead_escape_p
;
205 ctxp
->c_line
->size
++;
207 ctxp
->c_line
->ahead
[0] = 0;
208 ctxp
->c_line
->unicode_escape_ahead_p
= 0;
209 ctxp
->c_line
->lineno
= ++lineno
;
210 ctxp
->c_line
->white_space_only
= 1;
213 /* Create a new lexer object. */
216 java_new_lexer (FILE *finput
, const char *encoding
)
218 java_lexer
*lex
= xmalloc (sizeof (java_lexer
));
221 lex
->finput
= finput
;
223 lex
->unget_value
= 0;
227 lex
->handle
= iconv_open ("UCS-2", encoding
);
228 if (lex
->handle
!= (iconv_t
) -1)
234 lex
->read_anything
= 0;
235 lex
->use_fallback
= 0;
237 /* Work around broken iconv() implementations by doing checking at
238 runtime. We assume that if the UTF-8 => UCS-2 encoder is broken,
239 then all UCS-2 encoders will be broken. Perhaps not a valid
247 handle
= iconv_open ("UCS-2", "UTF-8");
248 if (handle
!= (iconv_t
) -1)
255 /* This is the UTF-8 encoding of \ufeff. */
262 outp
= (char *) &result
;
265 r
= iconv (handle
, (ICONV_CONST
char **) &inp
, &inc
,
267 iconv_close (handle
);
268 /* Conversion must be complete for us to use the result. */
269 if (r
!= (size_t) -1 && inc
== 0 && outc
== 0)
270 need_byteswap
= (result
!= 0xfeff);
274 lex
->byte_swap
= need_byteswap
;
277 #endif /* HAVE_ICONV */
279 /* If iconv failed, use the internal decoder if the default
280 encoding was requested. This code is used on platforms where
281 iconv exists but is insufficient for our needs. For
282 instance, on Solaris 2.5 iconv cannot handle UTF-8 or UCS-2.
284 On Solaris the default encoding, as returned by nl_langinfo(),
285 is `646' (aka ASCII), but the Solaris iconv_open() doesn't
286 understand that. We work around that by pretending
287 `646' to be the same as UTF-8. */
288 if (strcmp (encoding
, DEFAULT_ENCODING
) && strcmp (encoding
, "646"))
292 lex
->use_fallback
= 1;
293 #endif /* HAVE_ICONV */
297 fatal_error ("unknown encoding: `%s'\nThis might mean that your locale's encoding is not supported\nby your system's iconv(3) implementation. If you aren't trying\nto use a particular encoding for your input file, try the\n`--encoding=UTF-8' option", encoding
);
303 java_destroy_lexer (java_lexer
*lex
)
306 if (! lex
->use_fallback
)
307 iconv_close (lex
->handle
);
313 java_read_char (java_lexer
*lex
)
315 if (lex
->unget_value
)
317 unicode_t r
= lex
->unget_value
;
318 lex
->unget_value
= 0;
323 if (! lex
->use_fallback
)
325 size_t ir
, inbytesleft
, in_save
, out_count
, out_save
;
329 /* If there is data which has already been converted, use it. */
330 if (lex
->out_first
== -1 || lex
->out_first
>= lex
->out_last
)
337 /* See if we need to read more data. If FIRST == 0 then
338 the previous conversion attempt ended in the middle of
339 a character at the end of the buffer. Otherwise we
340 only have to read if the buffer is empty. */
341 if (lex
->first
== 0 || lex
->first
>= lex
->last
)
345 if (lex
->first
>= lex
->last
)
350 if (feof (lex
->finput
))
352 r
= fread (&lex
->buffer
[lex
->last
], 1,
353 sizeof (lex
->buffer
) - lex
->last
,
358 inbytesleft
= lex
->last
- lex
->first
;
359 out_count
= sizeof (lex
->out_buffer
) - lex
->out_last
;
361 if (inbytesleft
== 0)
363 /* We've tried to read and there is nothing left. */
367 in_save
= inbytesleft
;
368 out_save
= out_count
;
369 inp
= &lex
->buffer
[lex
->first
];
370 outp
= &lex
->out_buffer
[lex
->out_last
];
371 ir
= iconv (lex
->handle
, (ICONV_CONST
char **) &inp
,
372 &inbytesleft
, &outp
, &out_count
);
374 /* If we haven't read any bytes, then look to see if we
376 if (! lex
->read_anything
&& out_save
- out_count
>= 2)
378 unicode_t uc
= * (unicode_t
*) &lex
->out_buffer
[0];
384 else if (uc
== 0xfffe)
389 lex
->read_anything
= 1;
395 for (i
= 0; i
< out_save
- out_count
; i
+= 2)
397 char t
= lex
->out_buffer
[lex
->out_last
+ i
];
398 lex
->out_buffer
[lex
->out_last
+ i
]
399 = lex
->out_buffer
[lex
->out_last
+ i
+ 1];
400 lex
->out_buffer
[lex
->out_last
+ i
+ 1] = t
;
404 lex
->first
+= in_save
- inbytesleft
;
405 lex
->out_last
+= out_save
- out_count
;
407 /* If we converted anything at all, move along. */
408 if (out_count
!= out_save
)
411 if (ir
== (size_t) -1)
415 /* This is ok. This means that the end of our buffer
416 is in the middle of a character sequence. We just
417 move the valid part of the buffer to the beginning
419 memmove (&lex
->buffer
[0], &lex
->buffer
[lex
->first
],
420 lex
->last
- lex
->first
);
421 lex
->last
-= lex
->first
;
426 /* A more serious error. */
427 java_lex_error ("unrecognized character in input stream",
435 if (lex
->out_first
== -1 || lex
->out_first
>= lex
->out_last
)
437 /* Don't have any data. */
442 result
= * ((unicode_t
*) &lex
->out_buffer
[lex
->out_first
]);
447 #endif /* HAVE_ICONV */
450 c
= getc (lex
->finput
);
455 return (unicode_t
) c
;
458 if ((c
& 0xe0) == 0xc0)
460 c1
= getc (lex
->finput
);
461 if ((c1
& 0xc0) == 0x80)
463 unicode_t r
= (unicode_t
)(((c
& 0x1f) << 6) + (c1
& 0x3f));
464 /* Check for valid 2-byte characters. We explicitly
465 allow \0 because this encoding is common in the
467 if (r
== 0 || (r
>= 0x80 && r
<= 0x7ff))
471 else if ((c
& 0xf0) == 0xe0)
473 c1
= getc (lex
->finput
);
474 if ((c1
& 0xc0) == 0x80)
476 c2
= getc (lex
->finput
);
477 if ((c2
& 0xc0) == 0x80)
479 unicode_t r
= (unicode_t
)(((c
& 0xf) << 12) +
482 /* Check for valid 3-byte characters.
483 Don't allow surrogate, \ufffe or \uffff. */
484 if (IN_RANGE (r
, 0x800, 0xffff)
485 && ! IN_RANGE (r
, 0xd800, 0xdfff)
486 && r
!= 0xfffe && r
!= 0xffff)
492 /* We simply don't support invalid characters. We also
493 don't support 4-, 5-, or 6-byte UTF-8 sequences, as these
494 cannot be valid Java characters. */
495 java_lex_error ("malformed UTF-8 character", 0);
499 /* We only get here on error. */
504 java_store_unicode (struct java_line
*l
, unicode_t c
, int unicode_escape_p
)
506 if (l
->size
== l
->max
)
508 l
->max
+= JAVA_LINE_MAX
;
509 l
->line
= xrealloc (l
->line
, sizeof (unicode_t
)*l
->max
);
510 l
->unicode_escape_p
= xrealloc (l
->unicode_escape_p
,
511 sizeof (char)*l
->max
);
513 l
->line
[l
->size
] = c
;
514 l
->unicode_escape_p
[l
->size
++] = unicode_escape_p
;
518 java_read_unicode (java_lexer
*lex
, int *unicode_escape_p
)
522 c
= java_read_char (lex
);
523 *unicode_escape_p
= 0;
532 if ((lex
->bs_count
) % 2 == 1)
534 /* Odd number of \ seen. */
535 c
= java_read_char (lex
);
538 unicode_t unicode
= 0;
541 /* Recognize any number of `u's in \u. */
542 while ((c
= java_read_char (lex
)) == 'u')
550 java_lex_error ("prematurely terminated \\u sequence", 0);
555 unicode
|= (unicode_t
)(hex_value (c
) << shift
);
558 java_lex_error ("non-hex digit in \\u sequence", 0);
562 c
= java_read_char (lex
);
568 lex
->unget_value
= c
;
571 *unicode_escape_p
= 1;
574 lex
->unget_value
= c
;
576 return (unicode_t
) '\\';
580 java_read_unicode_collapsing_terminators (java_lexer
*lex
,
581 int *unicode_escape_p
)
583 int c
= java_read_unicode (lex
, unicode_escape_p
);
587 /* We have to read ahead to see if we got \r\n. In that case we
588 return a single line terminator. */
590 c
= java_read_unicode (lex
, &dummy
);
591 if (c
!= '\n' && c
!= UEOF
)
592 lex
->unget_value
= c
;
593 /* In either case we must return a newline. */
601 java_get_unicode (void)
603 /* It's time to read a line when... */
604 if (!ctxp
->c_line
|| ctxp
->c_line
->current
== ctxp
->c_line
->size
)
609 if (ctxp
->lexer
->hit_eof
)
612 java_allocate_new_line ();
613 if (ctxp
->c_line
->line
[0] != '\n')
617 int unicode_escape_p
;
618 c
= java_read_unicode_collapsing_terminators (ctxp
->lexer
,
623 java_store_unicode (ctxp
->c_line
, c
, unicode_escape_p
);
624 if (ctxp
->c_line
->white_space_only
625 && !JAVA_WHITE_SPACE_P (c
)
627 ctxp
->c_line
->white_space_only
= 0;
629 if ((c
== '\n') || (c
== UEOF
))
633 if (c
== UEOF
&& ! found_chars
)
635 ctxp
->lexer
->hit_eof
= 1;
640 ctxp
->c_line
->char_col
+= JAVA_COLUMN_DELTA (0);
641 JAVA_LEX_CHAR (ctxp
->c_line
->line
[ctxp
->c_line
->current
]);
642 return ctxp
->c_line
->line
[ctxp
->c_line
->current
++];
645 /* Parse the end of a C style comment.
646 * C is the first character following the '/' and '*'. */
648 java_parse_end_comment (int c
)
650 for ( ;; c
= java_get_unicode ())
655 java_lex_error ("Comment not terminated at end of input", 0);
658 switch (c
= java_get_unicode ())
661 java_lex_error ("Comment not terminated at end of input", 0);
665 case '*': /* Reparse only '*'. */
666 java_unget_unicode ();
672 /* Parse the documentation section. Keywords must be at the beginning
673 of a documentation comment line (ignoring white space and any `*'
674 character). Parsed keyword(s): @DEPRECATED. */
677 java_parse_doc_section (int c
)
679 int valid_tag
= 0, seen_star
= 0;
681 while (JAVA_WHITE_SPACE_P (c
) || (c
== '*') || c
== '\n')
693 c
= java_get_unicode();
697 java_lex_error ("Comment not terminated at end of input", 0);
699 if (seen_star
&& (c
== '/'))
700 return 1; /* Goto step1 in caller. */
702 /* We're parsing `@deprecated'. */
703 if (valid_tag
&& (c
== '@'))
708 while (tag_index
< 10 && c
!= UEOF
&& c
!= ' ' && c
!= '\n')
710 c
= java_get_unicode ();
711 tag
[tag_index
++] = c
;
715 java_lex_error ("Comment not terminated at end of input", 0);
716 tag
[tag_index
] = '\0';
718 if (!strcmp (tag
, "deprecated"))
719 ctxp
->deprecated
= 1;
721 java_unget_unicode ();
725 /* Return true if C is a valid start character for a Java identifier.
726 This is only called if C >= 128 -- smaller values are handled
727 inline. However, this function handles all values anyway. */
729 java_start_char_p (unicode_t c
)
731 unsigned int hi
= c
/ 256;
732 const char *const page
= type_table
[hi
];
733 unsigned long val
= (unsigned long) page
;
736 if ((val
& ~ (LETTER_PART
| LETTER_START
)) != 0)
737 flags
= page
[c
& 255];
741 return flags
& LETTER_START
;
744 /* Return true if C is a valid part character for a Java identifier.
745 This is only called if C >= 128 -- smaller values are handled
746 inline. However, this function handles all values anyway. */
748 java_part_char_p (unicode_t c
)
750 unsigned int hi
= c
/ 256;
751 const char *const page
= type_table
[hi
];
752 unsigned long val
= (unsigned long) page
;
755 if ((val
& ~ (LETTER_PART
| LETTER_START
)) != 0)
756 flags
= page
[c
& 255];
760 return flags
& LETTER_PART
;
764 java_parse_escape_sequence (void)
769 switch (c
= java_get_unicode ())
772 return (unicode_t
)0x8;
774 return (unicode_t
)0x9;
776 return (unicode_t
)0xa;
778 return (unicode_t
)0xc;
780 return (unicode_t
)0xd;
782 return (unicode_t
)0x22;
784 return (unicode_t
)0x27;
786 return (unicode_t
)0x5c;
787 case '0': case '1': case '2': case '3': case '4':
788 case '5': case '6': case '7':
791 int octal_escape_index
= 0;
795 for (; octal_escape_index
< max
&& RANGE (c
, '0', '7');
796 c
= java_get_unicode ())
798 if (octal_escape_index
== 0 && c
> '3')
800 /* According to the grammar, `\477' has a well-defined
801 meaning -- it is `\47' followed by `7'. */
804 octal_escape
[octal_escape_index
++] = c
;
807 java_unget_unicode ();
809 for (char_lit
=0, i
= 0, shift
= 3*(octal_escape_index
-1);
810 i
< octal_escape_index
; i
++, shift
-= 3)
811 char_lit
|= (octal_escape
[i
] - '0') << shift
;
816 java_lex_error ("Invalid character in escape sequence", 0);
817 return JAVA_CHAR_ERROR
;
822 #define IS_ZERO(X) REAL_VALUES_EQUAL (X, dconst0)
824 /* Subroutine of java_lex: converts floating-point literals to tree
825 nodes. LITERAL_TOKEN is the input literal, JAVA_LVAL is where to
826 store the result. FFLAG indicates whether the literal was tagged
827 with an 'f', indicating it is of type 'float'; NUMBER_BEGINNING
828 is the line number on which to report any error. */
830 static void java_perform_atof (YYSTYPE
*, char *, int, int);
833 java_perform_atof (YYSTYPE
*java_lval
, char *literal_token
, int fflag
,
834 int number_beginning
)
836 REAL_VALUE_TYPE value
;
837 tree type
= (fflag
? FLOAT_TYPE_NODE
: DOUBLE_TYPE_NODE
);
839 SET_REAL_VALUE_ATOF (value
,
840 REAL_VALUE_ATOF (literal_token
, TYPE_MODE (type
)));
842 if (REAL_VALUE_ISINF (value
) || REAL_VALUE_ISNAN (value
))
844 JAVA_FLOAT_RANGE_ERROR (fflag
? "float" : "double");
847 else if (IS_ZERO (value
))
849 /* We check to see if the value is really 0 or if we've found an
850 underflow. We do this in the most primitive imaginable way. */
852 char *p
= literal_token
;
855 while (*p
&& *p
!= 'e' && *p
!= 'E')
857 if (*p
!= '0' && *p
!= '.')
866 int i
= ctxp
->c_line
->current
;
867 ctxp
->c_line
->current
= number_beginning
;
868 java_lex_error ("Floating point literal underflow", 0);
869 ctxp
->c_line
->current
= i
;
873 SET_LVAL_NODE_TYPE (build_real (type
, value
), type
);
877 static int yylex (YYSTYPE
*);
881 yylex (YYSTYPE
*java_lval
)
883 java_lex (YYSTYPE
*java_lval
)
887 unicode_t first_unicode
;
888 int ascii_index
, all_ascii
;
891 /* Translation of the Unicode escape in the raw stream of Unicode
892 characters. Takes care of line terminator. */
894 /* Skip white spaces: SP, TAB and FF or ULT. */
895 for (c
= java_get_unicode ();
896 c
== '\n' || JAVA_WHITE_SPACE_P (c
); c
= java_get_unicode ())
899 ctxp
->elc
.line
= ctxp
->c_line
->lineno
;
900 ctxp
->elc
.col
= ctxp
->c_line
->char_col
-2;
903 ctxp
->elc
.col
= (ctxp
->elc
.col
< 0 ? 0 : ctxp
->elc
.col
);
905 if (c
== 0x1a) /* CTRL-Z. */
907 if ((c
= java_get_unicode ()) == UEOF
)
908 return 0; /* Ok here. */
910 java_unget_unicode (); /* Caught later, at the end of the
913 /* Handle EOF here. */
914 if (c
== UEOF
) /* Should probably do something here... */
917 /* Take care of possible comments. */
920 switch (c
= java_get_unicode ())
925 c
= java_get_unicode ();
928 /* It is ok to end a `//' comment with EOF, unless
929 we're being pedantic. */
931 java_lex_error ("Comment not terminated at end of input",
935 if (c
== '\n') /* ULT */
941 if ((c
= java_get_unicode ()) == '*')
943 if ((c
= java_get_unicode ()) == '/')
944 goto step1
; /* Empty documentation comment. */
945 else if (java_parse_doc_section (c
))
949 java_parse_end_comment ((c
= java_get_unicode ()));
953 java_unget_unicode ();
959 ctxp
->elc
.line
= ctxp
->c_line
->lineno
;
960 ctxp
->elc
.prev_col
= ctxp
->elc
.col
;
961 ctxp
->elc
.col
= ctxp
->c_line
->char_col
- JAVA_COLUMN_DELTA (-1);
962 if (ctxp
->elc
.col
< 0)
965 /* Numeric literals. */
966 if (JAVA_ASCII_DIGIT (c
) || (c
== '.'))
968 /* This section of code is borrowed from gcc/c-lex.c. */
969 #define TOTAL_PARTS ((HOST_BITS_PER_WIDE_INT / HOST_BITS_PER_CHAR) * 2 + 2)
970 int parts
[TOTAL_PARTS
];
971 HOST_WIDE_INT high
, low
;
972 /* End borrowed section. */
973 char literal_token
[256];
974 int literal_index
= 0, radix
= 10, long_suffix
= 0, overflow
= 0, bytes
;
975 int found_hex_digits
= 0, found_non_octal_digits
= 0;
978 int number_beginning
= ctxp
->c_line
->current
;
982 /* We might have a . separator instead of an FP like .[0-9]*. */
985 unicode_t peep
= java_sneak_unicode ();
987 if (!JAVA_ASCII_DIGIT (peep
))
990 BUILD_OPERATOR (DOT_TK
);
994 for (i
= 0; i
< TOTAL_PARTS
; i
++)
999 c
= java_get_unicode ();
1000 if (c
== 'x' || c
== 'X')
1003 c
= java_get_unicode ();
1005 else if (JAVA_ASCII_DIGIT (c
))
1007 else if (c
== '.' || c
== 'e' || c
=='E')
1009 /* Push the '.', 'e', or 'E' back and prepare for a FP
1011 java_unget_unicode ();
1016 /* We have a zero literal: 0, 0{l,L}, 0{f,F}, 0{d,D}. */
1017 JAVA_LEX_LIT ("0", 10);
1021 SET_LVAL_NODE (long_zero_node
);
1022 return (INT_LIT_TK
);
1024 SET_LVAL_NODE (float_zero_node
);
1027 SET_LVAL_NODE (double_zero_node
);
1030 java_unget_unicode ();
1031 SET_LVAL_NODE (integer_zero_node
);
1032 return (INT_LIT_TK
);
1036 /* Parse the first part of the literal, until we find something
1037 which is not a number. */
1038 while ((radix
== 16 && JAVA_ASCII_HEXDIGIT (c
)) ||
1039 JAVA_ASCII_DIGIT (c
))
1041 /* We store in a string (in case it turns out to be an FP) and in
1042 PARTS if we have to process an integer literal. */
1043 int numeric
= hex_value (c
);
1046 /* Remember when we find a valid hexadecimal digit. */
1048 found_hex_digits
= 1;
1049 /* Remember when we find an invalid octal digit. */
1050 else if (radix
== 8 && !JAVA_ASCII_OCTDIGIT (c
))
1051 found_non_octal_digits
= 1;
1053 literal_token
[literal_index
++] = c
;
1054 /* This section of code is borrowed from gcc/c-lex.c. */
1055 for (count
= 0; count
< TOTAL_PARTS
; count
++)
1057 parts
[count
] *= radix
;
1060 parts
[count
] += (parts
[count
-1] >> HOST_BITS_PER_CHAR
);
1061 parts
[count
-1] &= (1 << HOST_BITS_PER_CHAR
) - 1;
1064 parts
[0] += numeric
;
1066 if (parts
[TOTAL_PARTS
-1] != 0)
1068 /* End borrowed section. */
1069 c
= java_get_unicode ();
1072 /* If we have something from the FP char set but not a digit, parse
1074 if (JAVA_ASCII_FPCHAR (c
) && !JAVA_ASCII_DIGIT (c
))
1077 int seen_digit
= (literal_index
? 1 : 0);
1078 int seen_exponent
= 0;
1079 int fflag
= 0; /* 1 for {f,F}, 0 for {d,D}. FP literals are
1080 double unless specified. */
1082 /* It is ok if the radix is 8 because this just means we've
1083 seen a leading `0'. However, radix==16 is invalid. */
1085 java_lex_error ("Can't express non-decimal FP literal", 0);
1095 literal_token
[literal_index
++ ] = c
;
1096 c
= java_get_unicode ();
1099 java_lex_error ("Invalid character in FP literal", 0);
1102 if (c
== 'e' || c
== 'E')
1106 /* {E,e} must have seen at least a digit. */
1109 ("Invalid FP literal, mantissa must have digit", 0);
1113 literal_token
[literal_index
++] = c
;
1114 c
= java_get_unicode ();
1117 java_lex_error ("Invalid character in FP literal", 0);
1119 if ( c
== 'f' || c
== 'F' || c
== 'd' || c
== 'D')
1121 fflag
= ((c
== 'd') || (c
== 'D')) ? 0 : 1;
1122 stage
= 4; /* So we fall through. */
1125 if ((c
=='-' || c
=='+') && stage
== 2)
1128 literal_token
[literal_index
++] = c
;
1129 c
= java_get_unicode ();
1132 if ((stage
== 0 && JAVA_ASCII_FPCHAR (c
)) ||
1133 (stage
== 1 && JAVA_ASCII_FPCHAR (c
) && !(c
== '.')) ||
1134 (stage
== 2 && (JAVA_ASCII_DIGIT (c
) || JAVA_FP_PM (c
))) ||
1135 (stage
== 3 && JAVA_ASCII_DIGIT (c
)))
1137 if (JAVA_ASCII_DIGIT (c
))
1141 literal_token
[literal_index
++ ] = c
;
1142 c
= java_get_unicode ();
1146 if (stage
!= 4) /* Don't push back fF/dD. */
1147 java_unget_unicode ();
1149 /* An exponent (if any) must have seen a digit. */
1150 if (seen_exponent
&& !seen_digit
)
1152 ("Invalid FP literal, exponent must have digit", 0);
1154 literal_token
[literal_index
] = '\0';
1155 JAVA_LEX_LIT (literal_token
, radix
);
1158 java_perform_atof (java_lval
, literal_token
,
1159 fflag
, number_beginning
);
1164 } /* JAVA_ASCII_FPCHAR (c) */
1166 /* Here we get back to converting the integral literal. */
1167 if (radix
== 16 && ! found_hex_digits
)
1169 ("0x must be followed by at least one hexadecimal digit", 0);
1170 else if (radix
== 8 && found_non_octal_digits
)
1171 java_lex_error ("Octal literal contains digit out of range", 0);
1172 else if (c
== 'L' || c
== 'l')
1175 java_unget_unicode ();
1177 #ifdef JAVA_LEX_DEBUG
1178 literal_token
[literal_index
] = '\0'; /* So JAVA_LEX_LIT is safe. */
1179 JAVA_LEX_LIT (literal_token
, radix
);
1181 /* This section of code is borrowed from gcc/c-lex.c. */
1184 bytes
= GET_TYPE_PRECISION (long_type_node
);
1185 for (i
= bytes
; i
< TOTAL_PARTS
; i
++)
1193 for (i
= 0; i
< HOST_BITS_PER_WIDE_INT
/ HOST_BITS_PER_CHAR
; i
++)
1195 high
|= ((HOST_WIDE_INT
) parts
[i
+ (HOST_BITS_PER_WIDE_INT
1196 / HOST_BITS_PER_CHAR
)]
1197 << (i
* HOST_BITS_PER_CHAR
));
1198 low
|= (HOST_WIDE_INT
) parts
[i
] << (i
* HOST_BITS_PER_CHAR
);
1200 /* End borrowed section. */
1203 /* Range checking. */
1204 value
= build_int_2 (low
, high
);
1205 /* Temporarily set type to unsigned. */
1206 SET_LVAL_NODE_TYPE (value
, (long_suffix
1207 ? unsigned_long_type_node
1208 : unsigned_int_type_node
));
1210 /* For base 10 numbers, only values up to the highest value
1211 (plus one) can be written. For instance, only ints up to
1212 2147483648 can be written. The special case of the largest
1213 negative value is handled elsewhere. For other bases, any
1214 number can be represented. */
1215 if (overflow
|| (radix
== 10
1216 && tree_int_cst_lt (long_suffix
1222 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `long' literal");
1224 JAVA_INTEGRAL_RANGE_ERROR ("Numeric overflow for `int' literal");
1227 /* Sign extend the value. */
1228 SET_LVAL_NODE_TYPE (value
, (long_suffix
? long_type_node
: int_type_node
));
1229 force_fit_type (value
, 0);
1230 JAVA_RADIX10_FLAG (value
) = radix
== 10;
1232 SET_LVAL_NODE_TYPE (build_int_2 (low
, high
),
1233 long_suffix
? long_type_node
: int_type_node
);
1238 /* Character literals. */
1242 if ((c
= java_get_unicode ()) == '\\')
1243 char_lit
= java_parse_escape_sequence ();
1246 if (c
== '\n' || c
== '\'')
1247 java_lex_error ("Invalid character literal", 0);
1251 c
= java_get_unicode ();
1253 if ((c
== '\n') || (c
== UEOF
))
1254 java_lex_error ("Character literal not terminated at end of line", 0);
1256 java_lex_error ("Syntax error in character literal", 0);
1258 if (char_lit
== JAVA_CHAR_ERROR
)
1259 char_lit
= 0; /* We silently convert it to zero. */
1261 JAVA_LEX_CHAR_LIT (char_lit
);
1262 SET_LVAL_NODE_TYPE (build_int_2 (char_lit
, 0), char_type_node
);
1266 /* String literals. */
1272 for (no_error
= 1, c
= java_get_unicode ();
1273 c
!= UEOF
&& c
!= '"' && c
!= '\n'; c
= java_get_unicode ())
1276 c
= java_parse_escape_sequence ();
1277 if (c
== JAVA_CHAR_ERROR
)
1280 c
= 0; /* We silently convert it to zero. */
1282 java_unicode_2_utf8 (c
);
1284 if (c
== '\n' || c
== UEOF
) /* ULT. */
1286 lineno
--; /* Refer to the line where the terminator was seen. */
1287 java_lex_error ("String not terminated at end of line", 0);
1291 obstack_1grow (&temporary_obstack
, '\0');
1292 string
= obstack_finish (&temporary_obstack
);
1294 if (!no_error
|| (c
!= '"'))
1295 java_lval
->node
= error_mark_node
; /* FIXME: Requires further
1298 java_lval
->node
= build_string (strlen (string
), string
);
1300 obstack_free (&temporary_obstack
, string
);
1301 return STRING_LIT_TK
;
1309 BUILD_OPERATOR (OP_TK
);
1315 if (ctxp
->ccb_indent
== 1)
1316 ctxp
->first_ccb_indent1
= lineno
;
1318 BUILD_OPERATOR (OCB_TK
);
1322 if (ctxp
->ccb_indent
== 1)
1323 ctxp
->last_ccb_indent1
= lineno
;
1324 BUILD_OPERATOR (CCB_TK
);
1327 BUILD_OPERATOR (OSB_TK
);
1339 BUILD_OPERATOR (DOT_TK
);
1340 /* return DOT_TK; */
1347 if ((c
= java_get_unicode ()) == '=')
1349 BUILD_OPERATOR (EQ_TK
);
1353 /* Equals is used in two different locations. In the
1354 variable_declarator: rule, it has to be seen as '=' as opposed
1355 to being seen as an ordinary assignment operator in
1356 assignment_operators: rule. */
1357 java_unget_unicode ();
1358 BUILD_OPERATOR (ASSIGN_TK
);
1362 switch ((c
= java_get_unicode ()))
1365 BUILD_OPERATOR (GTE_TK
);
1367 switch ((c
= java_get_unicode ()))
1370 if ((c
= java_get_unicode ()) == '=')
1372 BUILD_OPERATOR2 (ZRS_ASSIGN_TK
);
1376 java_unget_unicode ();
1377 BUILD_OPERATOR (ZRS_TK
);
1380 BUILD_OPERATOR2 (SRS_ASSIGN_TK
);
1382 java_unget_unicode ();
1383 BUILD_OPERATOR (SRS_TK
);
1386 java_unget_unicode ();
1387 BUILD_OPERATOR (GT_TK
);
1391 switch ((c
= java_get_unicode ()))
1394 BUILD_OPERATOR (LTE_TK
);
1396 if ((c
= java_get_unicode ()) == '=')
1398 BUILD_OPERATOR2 (LS_ASSIGN_TK
);
1402 java_unget_unicode ();
1403 BUILD_OPERATOR (LS_TK
);
1406 java_unget_unicode ();
1407 BUILD_OPERATOR (LT_TK
);
1411 switch ((c
= java_get_unicode ()))
1414 BUILD_OPERATOR (BOOL_AND_TK
);
1416 BUILD_OPERATOR2 (AND_ASSIGN_TK
);
1418 java_unget_unicode ();
1419 BUILD_OPERATOR (AND_TK
);
1423 switch ((c
= java_get_unicode ()))
1426 BUILD_OPERATOR (BOOL_OR_TK
);
1428 BUILD_OPERATOR2 (OR_ASSIGN_TK
);
1430 java_unget_unicode ();
1431 BUILD_OPERATOR (OR_TK
);
1435 switch ((c
= java_get_unicode ()))
1438 BUILD_OPERATOR (INCR_TK
);
1440 BUILD_OPERATOR2 (PLUS_ASSIGN_TK
);
1442 java_unget_unicode ();
1443 BUILD_OPERATOR (PLUS_TK
);
1447 switch ((c
= java_get_unicode ()))
1450 BUILD_OPERATOR (DECR_TK
);
1452 BUILD_OPERATOR2 (MINUS_ASSIGN_TK
);
1454 java_unget_unicode ();
1455 BUILD_OPERATOR (MINUS_TK
);
1459 if ((c
= java_get_unicode ()) == '=')
1461 BUILD_OPERATOR2 (MULT_ASSIGN_TK
);
1465 java_unget_unicode ();
1466 BUILD_OPERATOR (MULT_TK
);
1470 if ((c
= java_get_unicode ()) == '=')
1472 BUILD_OPERATOR2 (DIV_ASSIGN_TK
);
1476 java_unget_unicode ();
1477 BUILD_OPERATOR (DIV_TK
);
1481 if ((c
= java_get_unicode ()) == '=')
1483 BUILD_OPERATOR2 (XOR_ASSIGN_TK
);
1487 java_unget_unicode ();
1488 BUILD_OPERATOR (XOR_TK
);
1492 if ((c
= java_get_unicode ()) == '=')
1494 BUILD_OPERATOR2 (REM_ASSIGN_TK
);
1498 java_unget_unicode ();
1499 BUILD_OPERATOR (REM_TK
);
1503 if ((c
= java_get_unicode()) == '=')
1505 BUILD_OPERATOR (NEQ_TK
);
1509 java_unget_unicode ();
1510 BUILD_OPERATOR (NEG_TK
);
1515 BUILD_OPERATOR (REL_QM_TK
);
1518 BUILD_OPERATOR (REL_CL_TK
);
1520 BUILD_OPERATOR (NOT_TK
);
1523 /* Keyword, boolean literal or null literal. */
1524 for (first_unicode
= c
, all_ascii
= 1, ascii_index
= 0;
1525 c
!= UEOF
&& JAVA_PART_CHAR_P (c
); c
= java_get_unicode ())
1527 java_unicode_2_utf8 (c
);
1528 if (all_ascii
&& c
>= 128)
1533 obstack_1grow (&temporary_obstack
, '\0');
1534 string
= obstack_finish (&temporary_obstack
);
1536 java_unget_unicode ();
1538 /* If the token is all ASCII, it may be a keyword, a boolean
1539 literal, a null literal or an all-ASCII identifier. Otherwise,
1540 it is an identifier (possibly not respecting the formation rules). */
1543 const struct java_keyword
*kw
;
1544 if ((kw
=java_keyword (string
, ascii_index
)))
1546 JAVA_LEX_KW (string
);
1549 case PUBLIC_TK
: case PROTECTED_TK
: case STATIC_TK
:
1550 case ABSTRACT_TK
: case FINAL_TK
: case NATIVE_TK
:
1551 case SYNCHRONIZED_TK
: case TRANSIENT_TK
: case VOLATILE_TK
:
1552 case PRIVATE_TK
: case STRICT_TK
:
1553 SET_MODIFIER_CTX (kw
->token
);
1556 SET_LVAL_NODE (float_type_node
);
1559 SET_LVAL_NODE (double_type_node
);
1562 SET_LVAL_NODE (boolean_type_node
);
1565 SET_LVAL_NODE (byte_type_node
);
1568 SET_LVAL_NODE (short_type_node
);
1571 SET_LVAL_NODE (int_type_node
);
1574 SET_LVAL_NODE (long_type_node
);
1577 SET_LVAL_NODE (char_type_node
);
1580 /* Keyword based literals. */
1583 SET_LVAL_NODE ((kw
->token
== TRUE_TK
?
1584 boolean_true_node
: boolean_false_node
));
1587 SET_LVAL_NODE (null_pointer_node
);
1593 BUILD_OPERATOR (kw
->token
);
1599 /* For some keywords we want to retain information on the location
1600 where they were found. */
1612 BUILD_OPERATOR (kw
->token
);
1620 /* We may have an ID here. */
1621 if (JAVA_START_CHAR_P (first_unicode
))
1623 JAVA_LEX_ID (string
);
1624 java_lval
->node
= BUILD_ID_WFL (GET_IDENTIFIER (string
));
1628 /* Everything else is an invalid character in the input. */
1630 char lex_error_buffer
[128];
1631 sprintf (lex_error_buffer
, "Invalid character `%s' in input",
1632 java_sprint_unicode (ctxp
->c_line
, ctxp
->c_line
->current
));
1633 java_lex_error (lex_error_buffer
, 1);
1639 /* This is called by the parser to see if an error should be generated
1640 due to numeric overflow. This function only handles the particular
1641 case of the largest negative value, and is only called in the case
1642 where this value is not preceded by `-'. */
/* VALUE is the tree of the literal under test.  Errors are raised
   through java_lex_error with a FORWARD argument of 0, i.e. without
   shifting the recorded error column.  */
1644 error_if_numeric_overflow (tree value
)
/* Only an INTEGER_CST that carries JAVA_RADIX10_FLAG and whose sign, as
   reported by tree_int_cst_sgn, is negative is diagnosed.
   NOTE(review): JAVA_RADIX10_FLAG presumably marks literals written in
   base 10 -- confirm against its definition.  */
1646 if (TREE_CODE (value
) == INTEGER_CST
1647 && JAVA_RADIX10_FLAG (value
)
1648 && tree_int_cst_sgn (value
) < 0)
/* The wording of the message depends on the literal's type: a VALUE
   typed long_type_node gets the `long' message.
   NOTE(review): the line joining the two calls below is missing from
   this listing; presumably the `int' message is the alternative branch
   -- confirm.  */
1650 if (TREE_TYPE (value
) == long_type_node
)
1651 java_lex_error ("Numeric overflow for `long' literal", 0);
1653 java_lex_error ("Numeric overflow for `int' literal", 0);
1656 #endif /* JC1_LITE */
1659 java_unicode_2_utf8 (unicode_t unicode
)
1661 if (RANGE (unicode
, 0x01, 0x7f))
1662 obstack_1grow (&temporary_obstack
, (char)unicode
);
1663 else if (RANGE (unicode
, 0x80, 0x7ff) || unicode
== 0)
1665 obstack_1grow (&temporary_obstack
,
1666 (unsigned char)(0xc0 | ((0x7c0 & unicode
) >> 6)));
1667 obstack_1grow (&temporary_obstack
,
1668 (unsigned char)(0x80 | (unicode
& 0x3f)));
1670 else /* Range 0x800-0xffff. */
1672 obstack_1grow (&temporary_obstack
,
1673 (unsigned char)(0xe0 | (unicode
& 0xf000) >> 12));
1674 obstack_1grow (&temporary_obstack
,
1675 (unsigned char)(0x80 | (unicode
& 0x0fc0) >> 6));
1676 obstack_1grow (&temporary_obstack
,
1677 (unsigned char)(0x80 | (unicode
& 0x003f)));
/* Pass NODE through build_expr_wfl together with the file name held in
   ctxp->filename and the line and column held in ctxp->elc; the result
   replaces NODE.  The TREE_TYPE of that result is then cleared.
   NOTE(review): the return type and the return statement fall on lines
   missing from this listing; presumably the new node is handed back to
   the caller -- confirm.  */
1683 build_wfl_node (tree node
)
1685 node
= build_expr_wfl (node
, ctxp
->filename
, ctxp
->elc
.line
, ctxp
->elc
.col
);
1686 /* Prevent java_complete_lhs from short-circuiting node (if constant). */
1687 TREE_TYPE (node
) = NULL_TREE
;
/* Record where a lexical error was detected.  The error location kept
   in ctxp->elc takes its line from the current input line
   (ctxp->c_line->lineno) and its column from that line's char_col minus
   one, plus FORWARD.  ctxp->java_error_flag is then reset to 0.

   MSG is not referenced by any line visible in this listing, and both
   parameters are tagged ATTRIBUTE_UNUSED.
   NOTE(review): presumably MSG is reported by statements on lines
   missing here, and the body is conditionally compiled -- confirm
   against the full file.  */
1693 java_lex_error (const char *msg ATTRIBUTE_UNUSED
, int forward ATTRIBUTE_UNUSED
)
/* The line number is copied as is.  */
1696 ctxp
->elc
.line
= ctxp
->c_line
->lineno
;
/* FORWARD lets the caller shift the reported column to the right of
   char_col - 1.  */
1697 ctxp
->elc
.col
= ctxp
->c_line
->char_col
-1+forward
;
1699 /* Might be caught in the middle of some error report. */
1700 ctxp
->java_error_flag
= 0;
/* End-of-line test for C, a character obtained from FP.  The caller
   java_get_line_col uses the result as a truth value.
   NOTE(review): only one statement of the body is visible in this
   listing, so the exact set of characters treated as end-of-line cannot
   be confirmed from here.  */
1708 java_is_eol (FILE *fp
, int c
)
/* This guard holds when NEXT is neither a line feed nor EOF.
   NOTE(review): presumably NEXT is a character read ahead from FP and
   is pushed back when the guard holds -- confirm.  */
1715 if (next
!= '\n' && next
!= EOF
)
/* Build, on temporary_obstack, a two-line excerpt for a diagnostic and
   return the finished obstack object: the text gathered for line LINE
   of FILENAME followed by '\n', then COL + 3 padding characters and a
   '^' marker, NUL-terminated by obstack_grow0.

   FILENAME is opened with fopen in "r" mode; failure to open goes
   through fatal_io_error.  All three parameters are tagged
   ATTRIBUTE_UNUSED.
   NOTE(review): presumably there is a build configuration in which the
   body is compiled out -- confirm.
   NOTE(review): no fclose of FP is visible in this listing -- confirm
   the stream is closed before returning.  */
1727 java_get_line_col (const char *filename ATTRIBUTE_UNUSED
,
1728 int line ATTRIBUTE_UNUSED
, int col ATTRIBUTE_UNUSED
)
1733 /* Dumb implementation. Doesn't try to cache or optimize things. */
1734 /* First line of the file is line 1, first column is 1. */
1736 /* COL == -1 means, at the CR/LF in LINE. */
1737 /* COL == -2 means, at the first non space char in LINE. */
/* CLINE is the line currently being scanned, starting at 1.
   CURRENT_LINE_COL and FIRST_NON_SPACE both start at 0.  */
1740 int c
, ccol
, cline
= 1;
1741 int current_line_col
= 0;
1742 int first_non_space
= 0;
1745 if (!(fp
= fopen (filename
, "r")))
1746 fatal_io_error ("can't open %s", filename
);
/* Skip ahead until the requested line is reached.
   NOTE(review): the statements that read C and advance CLINE are on
   lines missing from this listing.  */
1748 while (cline
!= line
)
/* Placeholder text appended in place of the source line; sizeof - 1
   leaves out the literal's terminating NUL.
   NOTE(review): presumably used when EOF is hit before LINE -- confirm.  */
1753 static const char msg
[] = "<<file too short - unexpected EOF>>";
1754 obstack_grow (&temporary_obstack
, msg
, sizeof(msg
)-1);
1757 if (java_is_eol (fp
, c
))
1761 /* Gather the chars of the current line in a buffer. */
/* A negative C or an end-of-line ends the gathering.  */
1765 if (c
< 0 || java_is_eol (fp
, c
))
/* Remember the column of the first character that is not white space.
   NOTE(review): 0 doubles as "not found yet", so if the first
   non-space character sits at column 0 a later character can overwrite
   the value -- confirm whether that matters.  */
1767 if (!first_non_space
&& !JAVA_WHITE_SPACE_P (c
))
1768 first_non_space
= current_line_col
;
1769 obstack_1grow (&temporary_obstack
, c
);
/* Terminate the excerpt's first line.  */
1774 obstack_1grow (&temporary_obstack
, '\n');
/* Resolve COL and decide whether tabs will be mirrored below.
   NOTE(review): the tests selecting among these assignments are on
   lines missing from this listing; presumably they correspond to the
   COL == -1, COL == -2 and ordinary cases described above -- confirm.  */
1778 col
= current_line_col
;
1779 first_non_space
= 0;
1782 col
= first_non_space
;
1784 first_non_space
= 0;
1786 /* Place the '^' at the right position. */
/* BASE points at the start of the object being grown, i.e. at the
   characters gathered so far.  */
1787 base
= obstack_base (&temporary_obstack
);
1788 for (ccol
= 1; ccol
<= col
+3; ccol
++)
1790 /* Compute \t when reaching first_non_space. */
/* When FIRST_NON_SPACE is nonzero, a tab found in the gathered text at
   index CCOL - 1 is repeated in the padding; every other position gets
   a space.  This char C is a separate block-local variable from the
   int C declared at the top.  */
1791 char c
= (first_non_space
?
1792 (base
[ccol
-1] == '\t' ? '\t' : ' ') : ' ');
1793 obstack_1grow (&temporary_obstack
, c
);
/* Append the marker plus a terminating NUL.  */
1795 obstack_grow0 (&temporary_obstack
, "^", 1);
1798 return obstack_finish (&temporary_obstack
);
/* Compare STR, LENGTH bytes of encoded text, against the NUL-terminated
   string NAME.  NAME is walked one char at a time while one character
   per step is fetched from STR with UTF8_GET, bounded by LIMIT.

   The early return yields the difference between the fetched character
   and the current char of NAME.  Once NAME is exhausted the result is 0
   if STR has reached LIMIT and 1 otherwise.
   NOTE(review): the test guarding the early return is on a line missing
   from this listing; presumably it fires on the first mismatch --
   confirm.  */
1804 utf8_cmp (const unsigned char *str
, int length
, const char *name
)
/* One past the last byte of STR.  */
1806 const unsigned char *limit
= str
+ length
;
1809 for (i
= 0; name
[i
]; ++i
)
/* NOTE(review): UTF8_GET presumably advances STR past the character it
   decodes; the final STR == LIMIT test relies on that -- confirm.  */
1811 int ch
= UTF8_GET (str
, limit
);
1813 return ch
- name
[i
];
/* All of NAME matched: equal only if STR was consumed entirely.  */
1816 return str
== limit
? 0 : 1;
1819 /* A sorted list of all C++ keywords. */
/* cxx_keyword_p probes this table by repeatedly taking the midpoint of
   a range of indices.
   NOTE(review): presumably that is a binary search, so the entries must
   stay sorted -- confirm.  The initializer is not visible in this
   listing.  */
1821 static const char *const cxx_keywords
[] =
1929 /* Return true if NAME is a C++ keyword. */
/* NAME is LENGTH bytes long.  The cxx_keywords table is probed at the
   midpoint of a range of indices; each probe compares the keyword
   against a prefix of NAME with utf8_cmp.  Per the comment further
   down, a NAME that is a keyword followed only by `$' characters also
   counts as a match.
   NOTE(review): the declarations of FIRST, OLD and I, the loop
   condition and the return statements are on lines missing from this
   listing.  */
1932 cxx_keyword_p (const char *name
, int length
)
/* LAST starts at the number of entries in the table.  */
1934 int last
= ARRAY_SIZE (cxx_keywords
);
1936 int mid
= (last
+ first
) / 2;
/* Each iteration remembers the previous midpoint in OLD and takes a new
   midpoint between FIRST and LAST.  */
1939 for (mid
= (last
+ first
) / 2;
1941 old
= mid
, mid
= (last
+ first
) / 2)
/* Compare over the shorter of the keyword's length and LENGTH.
   NOTE(review): NAME is a const char * while utf8_cmp declares its
   first parameter as const unsigned char * -- confirm that the pointer
   signedness mismatch is intended.  */
1943 int kwl
= strlen (cxx_keywords
[mid
]);
1944 int min_length
= kwl
> length
? length
: kwl
;
1945 int r
= utf8_cmp (name
, min_length
, cxx_keywords
[mid
]);
1950 /* We've found a match if all the remaining characters are `$'. */
/* Advance I past any run of `$' that follows the compared prefix.  */
1951 for (i
= min_length
; i
< length
&& name
[i
] == '$'; ++i
)
1965 #endif /* JC1_LITE */