1 ------------------------------------------------------------------------------
3 -- GNAT COMPILER COMPONENTS --
9 -- Copyright (C) 1992-2023, Free Software Foundation, Inc. --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 3, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
17 -- for more details. You should have received a copy of the GNU General --
18 -- Public License distributed with GNAT; see file COPYING3. If not, go to --
19 -- http://www.gnu.org/licenses for a complete copy of the license. --
21 -- GNAT was originally developed by the GNAT team at New York University. --
22 -- Extensive contributions were provided by Ada Core Technologies Inc. --
24 ------------------------------------------------------------------------------
26 with Namet
; use Namet
;
27 with Types
; use Types
;
28 with Uintp
; use Uintp
;
29 with Urealp
; use Urealp
;
33 -- The scanner maintains a current state in the global variables defined
34 -- in this package. The call to the Scan routine advances this state to
35 -- the next token. The state is initialized by the call to one of the
36 -- initialization routines in Sinput.
38 -- The following type is used to identify token types returned by Scan.
39 -- The class column in this table indicates the token classes which
40 -- apply to the token, as defined by subsequent subtype declarations.
44 -- Token name Token type Class(es)
46 Tok_Integer_Literal
, -- numeric lit Literal, Lit_Or_Name
48 Tok_Real_Literal
, -- numeric lit Literal, Lit_Or_Name
50 Tok_String_Literal
, -- string lit Literal, Lit_Or_Name
52 Tok_Char_Literal
, -- char lit Name, Literal, Lit_Or_Name
54 Tok_Operator_Symbol
, -- op symbol Name, Literal, Lit_Or_Name, Desig
56 Tok_Identifier
, -- identifier Name, Lit_Or_Name, Desig
58 Tok_At_Sign
, -- @ AI12-0125-3 : target name
60 Tok_Double_Asterisk
, -- **
62 Tok_Ampersand
, -- & Binary_Addop
63 Tok_Minus
, -- - Binary_Addop, Unary_Addop
64 Tok_Plus
, -- + Binary_Addop, Unary_Addop
66 Tok_Asterisk
, -- * Mulop
77 -- Note: Tok_Raise is in no categories now, it used to be Cterm, Eterm,
78 -- After_SM, but now that Ada 2012 has added raise expressions, the
79 -- raise token can appear anywhere. Note in particular that Tok_Raise
80 -- being in Eterm stopped the parser from recognizing "return raise
81 -- exception-name". This degrades error recovery slightly, and perhaps
82 -- we could do better, but not worth the effort.
84 -- Ada 2022 introduces square brackets as delimiters for array and
85 -- container aggregates.
87 -- The left delimiter token of interpolated strings, and tokens { and }
88 -- of interpolated expressions are currently placed in no category since
89 -- they don't fit well in the existing categories.
91 Tok_Left_Interpolated_String
, -- f"
92 Tok_Left_Curly_Bracket
, -- {
94 Tok_Right_Curly_Bracket
, -- }
97 Tok_Apostrophe
, -- ' Namext
99 Tok_Left_Bracket
, -- [ Namext
100 Tok_Left_Paren
, -- ( Namext, Consk
102 Tok_Delta
, -- DELTA Atkwd, Sterm, Consk
103 Tok_Digits
, -- DIGITS Atkwd, Sterm, Consk
104 Tok_Range
, -- RANGE Atkwd, Sterm, Consk
106 Tok_Right_Paren
, -- ) Sterm
107 Tok_Right_Bracket
, -- ] Sterm
108 Tok_Comma
, -- , Sterm
110 Tok_And
, -- AND Logop, Sterm
111 Tok_Or
, -- OR Logop, Sterm
112 Tok_Xor
, -- XOR Logop, Sterm
114 Tok_Less
, -- < Relop, Sterm
115 Tok_Equal
, -- = Relop, Sterm
116 Tok_Greater
, -- > Relop, Sterm
117 Tok_Not_Equal
, -- /= Relop, Sterm
118 Tok_Greater_Equal
, -- >= Relop, Sterm
119 Tok_Less_Equal
, -- <= Relop, Sterm
121 Tok_In
, -- IN Relop, Sterm
122 Tok_Not
, -- NOT Relop, Sterm
124 Tok_Box
, -- <> Relop, Sterm
125 Tok_Colon_Equal
, -- := Eterm, Sterm
126 Tok_Colon
, -- : Eterm, Sterm
127 Tok_Greater_Greater
, -- >> Eterm, Sterm
129 Tok_Abstract
, -- ABSTRACT Eterm, Sterm
130 Tok_Access
, -- ACCESS Eterm, Sterm
131 Tok_Aliased
, -- ALIASED Eterm, Sterm
132 Tok_All
, -- ALL Eterm, Sterm
133 Tok_Array
, -- ARRAY Eterm, Sterm
134 Tok_At
, -- AT Eterm, Sterm
135 Tok_Body
, -- BODY Eterm, Sterm
136 Tok_Constant
, -- CONSTANT Eterm, Sterm
137 Tok_Do
, -- DO Eterm, Sterm
138 Tok_Is
, -- IS Eterm, Sterm
139 Tok_Interface
, -- INTERFACE Eterm, Sterm
140 Tok_Limited
, -- LIMITED Eterm, Sterm
141 Tok_Of
, -- OF Eterm, Sterm
142 Tok_Out
, -- OUT Eterm, Sterm
143 Tok_Record
, -- RECORD Eterm, Sterm
144 Tok_Renames
, -- RENAMES Eterm, Sterm
145 Tok_Reverse
, -- REVERSE Eterm, Sterm
146 Tok_Some
, -- SOME Eterm, Sterm
147 Tok_Tagged
, -- TAGGED Eterm, Sterm
148 Tok_Then
, -- THEN Eterm, Sterm
150 Tok_Less_Less
, -- << Eterm, Sterm, After_SM
152 Tok_Abort
, -- ABORT Eterm, Sterm, After_SM
153 Tok_Accept
, -- ACCEPT Eterm, Sterm, After_SM
154 Tok_Case
, -- CASE Eterm, Sterm, After_SM
155 Tok_Delay
, -- DELAY Eterm, Sterm, After_SM
156 Tok_Else
, -- ELSE Eterm, Sterm, After_SM
157 Tok_Elsif
, -- ELSIF Eterm, Sterm, After_SM
158 Tok_End
, -- END Eterm, Sterm, After_SM
159 Tok_Exception
, -- EXCEPTION Eterm, Sterm, After_SM
160 Tok_Exit
, -- EXIT Eterm, Sterm, After_SM
161 Tok_Goto
, -- GOTO Eterm, Sterm, After_SM
162 Tok_If
, -- IF Eterm, Sterm, After_SM
163 Tok_Pragma
, -- PRAGMA Eterm, Sterm, After_SM
164 Tok_Requeue
, -- REQUEUE Eterm, Sterm, After_SM
165 Tok_Return
, -- RETURN Eterm, Sterm, After_SM
166 Tok_Select
, -- SELECT Eterm, Sterm, After_SM
167 Tok_Terminate
, -- TERMINATE Eterm, Sterm, After_SM
168 Tok_Until
, -- UNTIL Eterm, Sterm, After_SM
169 Tok_When
, -- WHEN Eterm, Sterm, After_SM
171 Tok_Begin
, -- BEGIN Eterm, Sterm, After_SM, Labeled_Stmt
172 Tok_Declare
, -- DECLARE Eterm, Sterm, After_SM, Labeled_Stmt
173 Tok_For
, -- FOR Eterm, Sterm, After_SM, Labeled_Stmt
174 Tok_Loop
, -- LOOP Eterm, Sterm, After_SM, Labeled_Stmt
175 Tok_While
, -- WHILE Eterm, Sterm, After_SM, Labeled_Stmt
177 Tok_Entry
, -- ENTRY Eterm, Sterm, Declk, Deckn, After_SM
178 Tok_Protected
, -- PROTECTED Eterm, Sterm, Declk, Deckn, After_SM
179 Tok_Task
, -- TASK Eterm, Sterm, Declk, Deckn, After_SM
180 Tok_Type
, -- TYPE Eterm, Sterm, Declk, Deckn, After_SM
181 Tok_Subtype
, -- SUBTYPE Eterm, Sterm, Declk, Deckn, After_SM
182 Tok_Overriding
, -- OVERRIDING Eterm, Sterm, Declk, Deckn, After_SM
183 Tok_Synchronized
, -- SYNCHRONIZED Eterm, Sterm, Declk, Deckn, After_SM
184 Tok_Use
, -- USE Eterm, Sterm, Declk, Deckn, After_SM
186 Tok_Function
, -- FUNCTION Eterm, Sterm, Cunit, Declk, After_SM
187 Tok_Generic
, -- GENERIC Eterm, Sterm, Cunit, Declk, After_SM
188 Tok_Package
, -- PACKAGE Eterm, Sterm, Cunit, Declk, After_SM
189 Tok_Procedure
, -- PROCEDURE Eterm, Sterm, Cunit, Declk, After_SM
191 Tok_Private
, -- PRIVATE Eterm, Sterm, Cunit, After_SM
192 Tok_With
, -- WITH Eterm, Sterm, Cunit, After_SM
193 Tok_Separate
, -- SEPARATE Eterm, Sterm, Cunit, After_SM
195 Tok_EOF
, -- End of file Eterm, Sterm, Cterm, After_SM
197 Tok_Semicolon
, -- ; Eterm, Sterm, Cterm
199 Tok_Arrow
, -- => Sterm, Cterm, Chtok
201 Tok_Vertical_Bar
, -- | Cterm, Sterm, Chtok
203 Tok_Dot_Dot
, -- .. Sterm, Chtok
208 Tok_External_As_List
,
209 -- These four entries represent keywords for the project file language
210 -- and can be returned only in the case of scanning project files.
213 -- This entry is used when scanning project files (where it represents
214 -- an entire comment), and in preprocessing with the -C switch set
215 -- (where it represents just the "--" of a comment). For the project
216 -- file case, the text of the comment is stored in Comment_Id.
219 -- Represents an end of line. Not used during normal compilation scans
220 -- where end of line is ignored. Active for preprocessor scanning.
223 -- Special character used by the preprocessor. The character itself is
224 -- stored in Special_Character below.
227 -- No_Token is used for initializing Token values to indicate that
228 -- no value has been set yet.
function Keyword_Name (Token : Token_Type) return Name_Id;
--  Returns the Name_Id, in lower case, that corresponds to a reserved word
--  token. For example, Keyword_Name (Tok_Begin) = Name_Find ("begin").
--  Passing a token of any other kind is an error.
235 -- Note: in the RM, operator symbol is a special case of string literal.
236 -- We distinguish at the lexical level in this compiler, since there are
237 -- many syntactic situations in which only an operator symbol is allowed.
239 -- The following subtype declarations group the token types into classes.
240 -- These are used for class tests in the parser.
subtype Token_Class_Numeric_Literal is
  Token_Type range Tok_Integer_Literal .. Tok_Real_Literal;
--  Numeric literals (integer and real)

subtype Token_Class_Literal is
  Token_Type range Tok_Integer_Literal .. Tok_Operator_Symbol;
--  Literals (numeric, string and character literals, operator symbols)

subtype Token_Class_Lit_Or_Name is
  Token_Type range Tok_Integer_Literal .. Tok_Identifier;
--  Any literal, or an identifier

subtype Token_Class_Binary_Addop is
  Token_Type range Tok_Ampersand .. Tok_Plus;
--  Binary adding operators (& - +)

subtype Token_Class_Unary_Addop is
  Token_Type range Tok_Minus .. Tok_Plus;
--  Unary adding operators (- +)

subtype Token_Class_Mulop is
  Token_Type range Tok_Asterisk .. Tok_Slash;
--  Multiplying operators

subtype Token_Class_Logop is
  Token_Type range Tok_And .. Tok_Xor;
--  Logical operators (and, or, xor)

subtype Token_Class_Relop is
  Token_Type range Tok_Less .. Tok_Box;
--  Relational operators (< = > /= >= <=), together with IN and NOT. The
--  box (<>) is included as well, so that misuse of the Pascal style not
--  equal operator is caught.

subtype Token_Class_Name is
  Token_Type range Tok_Char_Literal .. Tok_At_Sign;
--  Tokens that can be the first token of a name (4.1): character literal,
--  operator symbol, identifier. Includes '@' after Ada2012 corrigendum.

subtype Token_Class_Desig is
  Token_Type range Tok_Operator_Symbol .. Tok_At_Sign;
--  Tokens that can be a Designator (operator symbol, identifier). Note
--  that the upper bound of the range is Tok_At_Sign, so '@' is a member
--  of this class too.

subtype Token_Class_Namext is
  Token_Type range Tok_Dot .. Tok_Left_Paren;
--  Name extension tokens, i.e. tokens that can follow a name immediately
--  and extend it recursively (period, quote, left bracket, left paren).

subtype Token_Class_Consk is
  Token_Type range Tok_Left_Paren .. Tok_Range;
--  Tokens that can start a constraint (left paren, delta, digits, range)

subtype Token_Class_Eterm is
  Token_Type range Tok_Colon_Equal .. Tok_Semicolon;
--  Expression terminators. None of these tokens can appear inside a simple
--  expression. Error recovery relies on this: when an error is found in an
--  expression, the parser simply scans ahead to the next Eterm token.

subtype Token_Class_Sterm is
  Token_Type range Tok_Delta .. Tok_Dot_Dot;
--  Simple_Expression terminators. A token of this class must follow a
--  Simple_Expression, otherwise an error message complaining about a
--  missing binary operator is issued.

subtype Token_Class_Atkwd is
  Token_Type range Tok_Delta .. Tok_Range;
--  Attribute keywords, i.e. the keywords that may be used as an
--  Attribute_Designator: DELTA, DIGITS and RANGE.

subtype Token_Class_Cterm is
  Token_Type range Tok_EOF .. Tok_Vertical_Bar;
--  Choice terminators. These tokens end a choice. Error recovery relies on
--  this: when an error is found in a Choice, the parser simply scans ahead
--  to the next Cterm token.

subtype Token_Class_Chtok is
  Token_Type range Tok_Arrow .. Tok_Dot_Dot;
--  Choice tokens. Their presence in an Aggregate signals a choice.

subtype Token_Class_Cunit is
  Token_Type range Tok_Function .. Tok_Separate;
--  Tokens that can begin a compilation unit

subtype Token_Class_Declk is
  Token_Type range Tok_Entry .. Tok_Procedure;
--  Keywords that start a declaration

subtype Token_Class_Deckn is
  Token_Type range Tok_Entry .. Tok_Use;
--  Keywords that start a declaration but cannot start a compilation unit

subtype Token_Class_After_SM is
  Token_Type range Tok_Less_Less .. Tok_EOF;
--  Tokens that always, or almost always, come after a semicolon. The
--  Resync_Past_Semicolon routine uses this class to avoid gobbling up
--  stuff when a semicolon is missing. Only matters for error recovery.

subtype Token_Class_Labeled_Stmt is
  Token_Type range Tok_Begin .. Tok_While;
--  Tokens that start labeled statements
type Token_Flag_Array is array (Token_Type) of Boolean;
--  Array holding one Boolean flag for each Token_Type value
344 Is_Reserved_Keyword
: constant Token_Flag_Array
:=
346 (Tok_Mod .. Tok_Rem => True,
347 Tok_New .. Tok_Null => True,
348 Tok_Delta .. Tok_Range => True,
349 Tok_And .. Tok_Xor => True,
350 Tok_In .. Tok_Not => True,
351 Tok_Abstract .. Tok_Then => True,
352 Tok_Abort .. Tok_Separate => True,
354 -- Flag array used to test for reserved word
procedure Initialize_Ada_Keywords;
--  Sets up the Token_Type values in the Names table entries for the Ada
--  reserved words. Ada_Version is ignored here; it is taken into account
--  in Snames.Is_Keyword_Name.
361 --------------------------
362 -- Scan State Variables --
363 --------------------------
365 -- Note: these variables can only be referenced during the parsing of a
366 -- file. Reference to any of them from Sem or the expander is wrong.
368 -- These variables are initialized by Scn.Initialize_Scanner, and should
369 -- not be referenced before such a call, except for saving and restoring
Scan_Ptr : Source_Ptr := No_Location;
--  Current location of the scan pointer. Once Scan has been called, it
--  points just past the end of the token that was just scanned.

Token : Token_Type := No_Token;
--  Type of the current token

Token_Ptr : Source_Ptr := No_Location;
--  Points to the first character of the current token

Current_Line_Start : Source_Ptr := No_Location;
--  Points to the first character of the line containing the current token

Start_Column : Column_Number := No_Column_Number;
--  Column number (zero origin) of the first non-blank character on the
--  line containing the current token. Error recovery circuits that depend
--  on looking at the column line up make use of this value.

Type_Token_Location : Source_Ptr := No_Location;
--  Inside a type declaration, the location of the TYPE keyword that opened
--  the declaration. It is used when checking the end column of a record
--  declaration, which may line up either with the TYPE keyword or with
--  the start of the line containing the RECORD keyword.

Checksum : Word := 0;
--  Accumulates a CRC representing the tokens in the source file being
--  compiled. Only program tokens contribute to this CRC; comments are
--  excluded.

Limited_Checksum : Word := 0;
--  Accumulates a CRC representing the significant tokens in the limited
--  view of a package, i.e. visible type names and related tagged
--  indicators.

First_Non_Blank_Location : Source_Ptr := No_Location;
--  Location of the first non-blank character on the line containing the
--  current token, i.e. the location of the character whose column number
--  is held in Start_Column.

Token_Node : Node_Id := Empty;
--  Node table Id for the current token. Set only when the current token is
--  one for which the scanner constructs a node (an identifier, operator
--  symbol, or literal). Token_Node is undefined for other token types.

Token_Name : Name_Id := No_Name;
--  For an identifier, the Name_Id of the identifier scanned. For every
--  other token, Token_Name is set to Error_Name. The caller could extract
--  this information from Token_Node, but Token_Name is set separately for
--  two reasons. First, it allows a quicker test for a specific identifier.
--  Second, it allows a version of the parser to be built that does not
--  build tree nodes, usable as a syntax checker.

Prev_Token : Token_Type := No_Token;
--  Type of the previous token

Prev_Token_Ptr : Source_Ptr;
--  Points to the first character of the previous token

Version_To_Be_Found : Boolean;
--  True while the scanner is still looking for an RCS version number in a
--  comment. Normally it is initialized to False so that this circuit is
--  not activated. If the -dv switch is set, the flag is initialized to
--  True, and it is then reset when the version number is found. Things
--  are done this way to minimize the impact on comment scanning.

Character_Code : Char_Code;
--  Valid only when Token is Tok_Char_Literal. Contains the value of the
--  scanned literal.

Real_Literal_Value : Ureal;
--  Valid only when Token is Tok_Real_Literal. Contains the value of the
--  scanned literal.

Int_Literal_Value : Uint;
--  Valid only when Token = Tok_Integer_Literal, and we are not in
--  syntax-only mode. Contains the value of the scanned literal.

Based_Literal_Uses_Colon : Boolean;
--  Valid only when Token = Tok_Integer_Literal or Tok_Real_Literal. Set
--  True only for the case of a based literal using ':' instead of '#'.

String_Literal_Id : String_Id;
--  Valid only when Token = Tok_String_Literal or Tok_Operator_Symbol.
--  Contains the Id of the string value currently scanned.

Wide_Character_Found : Boolean := False;
--  Valid only when Token = Tok_String_Literal. Set True if a wide
--  character was found (i.e. a character that does not fit in Character,
--  but fits in Wide_Wide_Character).

Wide_Wide_Character_Found : Boolean := False;
--  Valid only when Token = Tok_String_Literal. Set True if a wide wide
--  character was found (i.e. a character that does not fit in Character
--  or Wide_Character).

subtype Special_Preprocessor_Character is Character with
  Predicate => Special_Preprocessor_Character in '#' | '$';
--  The characters accepted by the predicate are '#' and '$'

Special_Character : Special_Preprocessor_Character;
--  Special character used by the preprocessor; one of the two characters
--  permitted by Special_Preprocessor_Character.

Comment_Id : Name_Id := No_Name;
--  Valid only when Token = Tok_Comment. Holds the string that follows the
--  "--" of a comment when scanning project files.
--
--  Is it really right for this to be a Name rather than a String, what
--  about the case of Wide_Wide_Characters???

Inside_Depends : Boolean := False;
--  True while the argument of a Depends or Refined_Depends pragma or
--  aspect is being parsed. Used to allow/require nonstandard style rules
--  for =>+ (presumably with -gnatyt; the original sentence is cut off at
--  this point -- TODO confirm against the upstream source).

Inside_Interpolated_String_Literal : Boolean := False;
--  True while an interpolated string literal is being parsed

Inside_If_Expression : Nat := 0;
--  Counter that is non-zero while an if expression is being scanned out
--  (incremented on entry, decremented on exit). It serves to disconnect
--  the format checks that normally apply to keywords THEN, ELSE etc.

Inside_Pragma : Boolean := False;
--  True within a pragma. Used to avoid complaining about reserved words
--  within pragmas (see Scan_Reserved_Identifier).
497 --------------------------------------------------------
498 -- Procedures for Saving and Restoring the Scan State --
499 --------------------------------------------------------
501 -- The following procedures can be used to save and restore the entire
502 -- scan state. They are used in cases where it is necessary to backup
503 -- the scan during the parse.
type Saved_Scan_State is private;
--  Holds a snapshot of the scan state, for saving and later restoring it

procedure Save_Scan_State (Saved_State : out Saved_Scan_State);
pragma Inline (Save_Scan_State);
--  Records the current scan state in Saved_State so that it can be
--  restored later if required. It is harmless to save the state and then
--  never restore it.

procedure Restore_Scan_State (Saved_State : Saved_Scan_State);
pragma Inline (Restore_Scan_State);
--  Reinstates a scan state previously recorded by Save_Scan_State. The
--  saved scan state must refer to the current source file.
519 type Saved_Scan_State is record
520 Save_Scan_Ptr : Source_Ptr;
521 Save_Token : Token_Type;
522 Save_Token_Ptr : Source_Ptr;
523 Save_Current_Line_Start : Source_Ptr;
524 Save_Start_Column : Column_Number;
525 Save_Checksum : Word;
526 Save_First_Non_Blank_Location : Source_Ptr;
527 Save_Token_Node : Node_Id;
528 Save_Token_Name : Name_Id;
529 Save_Prev_Token : Token_Type;
530 Save_Prev_Token_Ptr : Source_Ptr;