1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
// A scanner for Go source text. Takes a []byte as source which can
// then be tokenized through repeated calls to the Scan function.
// Typical use:
//
//	var s Scanner
//	fset := token.NewFileSet() // position information is relative to fset
//	file := fset.AddFile(filename, fset.Base(), len(src)) // register file
//	s.Init(file, src, nil /* no error handler */, 0)
//	for {
//		pos, tok, lit := s.Scan()
//		if tok == token.EOF {
//			break
//		}
//		// do something here with pos, tok, and lit
//	}
33 // A Scanner holds the scanner's internal state while processing
34 // a given text. It can be allocated as part of another data
35 // structure but must be initialized via Init before use.
39 file
*token
.File
// source file handle
40 dir
string // directory portion of file.Name()
42 err ErrorHandler
// error reporting; or nil
43 mode
uint // scanning mode
46 ch
int // current character
47 offset
int // character offset
48 rdOffset
int // reading offset (position after current character)
49 lineOffset
int // current line offset
50 insertSemi
bool // insert a semicolon before next newline
52 // public state - ok to modify
53 ErrorCount
int // number of errors encountered
57 // Read the next Unicode char into S.ch.
58 // S.ch < 0 means end-of-file.
60 func (S
*Scanner
) next() {
61 if S
.rdOffset
< len(S
.src
) {
64 S
.lineOffset
= S
.offset
65 S
.file
.AddLine(S
.offset
)
67 r
, w
:= int(S
.src
[S
.rdOffset
]), 1
70 S
.error(S
.offset
, "illegal character NUL")
73 r
, w
= utf8
.DecodeRune(S
.src
[S
.rdOffset
:])
74 if r
== utf8
.RuneError
&& w
== 1 {
75 S
.error(S
.offset
, "illegal UTF-8 encoding")
83 S
.lineOffset
= S
.offset
84 S
.file
.AddLine(S
.offset
)
// The mode parameter to the Init function is a set of flags (or 0).
// They control scanner behavior. Each flag is a distinct bit, so flags
// can be combined with the | operator.
const (
	ScanComments      = 1 << iota // return comments as COMMENT tokens
	AllowIllegalChars             // do not report an error for illegal chars
	InsertSemis                   // automatically insert semicolons
)
// Init prepares the scanner S to tokenize the text src by setting the
// scanner at the beginning of src. The scanner uses the file passed in
// as file for position information and it adds line information for each
// line. It is ok to re-use the same file when re-scanning the same file as
// line information which is already present is ignored. Init causes a
// panic if the file size does not match the src size.
//
// Calls to Scan will use the error handler err if they encounter a
// syntax error and err is not nil. Also, for each error encountered,
// the Scanner field ErrorCount is incremented by one. The mode parameter
// determines how comments, illegal characters, and semicolons are handled.
//
// Note that Init may call err if there is an error in the first character
// of the file.
115 func (S
*Scanner
) Init(file
*token
.File
, src
[]byte, err ErrorHandler
, mode
uint) {
116 // Explicitly initialize all fields since a scanner may be reused.
117 if file
.Size() != len(src
) {
118 panic("file size does not match src len")
121 S
.dir
, _
= filepath
.Split(file
.Name())
137 func charString(ch
int) string {
163 return "'" + s
+ "' (U+" + strconv
.Itob(ch
, 16) + ")"
167 func (S
*Scanner
) error(offs
int, msg
string) {
169 S
.err
.Error(S
.file
.Position(S
.file
.Pos(offs
)), msg
)
// prefix is the leading text that identifies a //line comment; it is
// compared against the start of a comment by interpretLineComment.
var prefix = []byte("//line ")
177 func (S
*Scanner
) interpretLineComment(text
[]byte) {
178 if bytes
.HasPrefix(text
, prefix
) {
179 // get filename and line number, if any
180 if i
:= bytes
.LastIndex(text
, []byte{':'}); i
> 0 {
181 if line
, err
:= strconv
.Atoi(string(text
[i
+1:])); err
== nil && line
> 0 {
182 // valid //line filename:line comment;
183 filename
:= filepath
.Clean(string(text
[len(prefix
):i
]))
184 if !filepath
.IsAbs(filename
) {
185 // make filename relative to current directory
186 filename
= filepath
.Join(S
.dir
, filename
)
188 // update scanner position
189 S
.file
.AddLineInfo(S
.lineOffset
, filename
, line
-1) // -1 since comment applies to next line
196 func (S
*Scanner
) scanComment() {
197 // initial '/' already consumed; S.ch == '/' || S.ch == '*'
198 offs
:= S
.offset
- 1 // position of initial '/'
203 for S
.ch
!= '\n' && S
.ch
>= 0 {
206 if offs
== S
.lineOffset
{
207 // comment starts at the beginning of the current line
208 S
.interpretLineComment(S
.src
[offs
:S
.offset
])
218 if ch
== '*' && S
.ch
== '/' {
224 S
.error(offs
, "comment not terminated")
228 func (S
*Scanner
) findLineEnd() bool {
229 // initial '/' already consumed
231 defer func(offs
int) {
232 // reset scanner state to where it was upon calling findLineEnd
235 S
.rdOffset
= offs
+ 1
236 S
.next() // consume initial '/' again
239 // read ahead until a newline, EOF, or non-comment token is found
240 for S
.ch
== '/' || S
.ch
== '*' {
242 //-style comment always contains a newline
245 /*-style comment: look for newline */
253 if ch
== '*' && S
.ch
== '/' {
258 S
.skipWhitespace() // S.insertSemi is set
259 if S
.ch
< 0 || S
.ch
== '\n' {
266 S
.next() // consume '/'
// isLetter reports whether ch is an ASCII letter ('a'-'z', 'A'-'Z'),
// an underscore, or a character at or above 0x80 that unicode.IsLetter
// classifies as a letter. Values below 0x80 never reach the unicode
// lookup, so negative values yield false.
func isLetter(ch int) bool {
	// unicode.IsLetter takes a rune, hence the explicit conversion.
	return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(rune(ch))
}
// isDigit reports whether ch is an ASCII decimal digit ('0'-'9') or a
// character at or above 0x80 that unicode.IsDigit classifies as a digit.
// Values below 0x80 never reach the unicode lookup, so negative values
// yield false.
func isDigit(ch int) bool {
	// unicode.IsDigit takes a rune, hence the explicit conversion.
	return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(rune(ch))
}
283 func (S
*Scanner
) scanIdentifier() token
.Token
{
285 for isLetter(S
.ch
) ||
isDigit(S
.ch
) {
288 return token
.Lookup(S
.src
[offs
:S
.offset
])
292 func digitVal(ch
int) int {
294 case '0' <= ch
&& ch
<= '9':
296 case 'a' <= ch
&& ch
<= 'f':
298 case 'A' <= ch
&& ch
<= 'F':
301 return 16 // larger than any legal digit val
305 func (S
*Scanner
) scanMantissa(base
int) {
306 for digitVal(S
.ch
) < base
{
312 func (S
*Scanner
) scanNumber(seenDecimalPoint
bool) token
.Token
{
313 // digitVal(S.ch) < 10
316 if seenDecimalPoint
{
326 if S
.ch
== 'x' || S
.ch
== 'X' {
331 // octal int or float
332 seenDecimalDigit
:= false
334 if S
.ch
== '8' || S
.ch
== '9' {
335 // illegal octal int or float
336 seenDecimalDigit
= true
339 if S
.ch
== '.' || S
.ch
== 'e' || S
.ch
== 'E' || S
.ch
== 'i' {
343 if seenDecimalDigit
{
344 S
.error(offs
, "illegal octal number")
350 // decimal int or float
361 if S
.ch
== 'e' || S
.ch
== 'E' {
364 if S
.ch
== '-' || S
.ch
== '+' {
380 func (S
*Scanner
) scanEscape(quote
int) {
383 var i
, base
, max
uint32
385 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote
:
388 case '0', '1', '2', '3', '4', '5', '6', '7':
389 i
, base
, max
= 3, 8, 255
392 i
, base
, max
= 2, 16, 255
395 i
, base
, max
= 4, 16, unicode
.MaxRune
398 i
, base
, max
= 8, 16, unicode
.MaxRune
400 S
.next() // always make progress
401 S
.error(offs
, "unknown escape sequence")
406 for ; i
> 0 && S
.ch
!= quote
&& S
.ch
>= 0; i
-- {
407 d
:= uint32(digitVal(S
.ch
))
409 S
.error(S
.offset
, "illegal character in escape sequence")
415 // in case of an error, consume remaining chars
416 for ; i
> 0 && S
.ch
!= quote
&& S
.ch
>= 0; i
-- {
419 if x
> max ||
0xd800 <= x
&& x
< 0xe000 {
420 S
.error(offs
, "escape sequence is invalid Unicode code point")
425 func (S
*Scanner
) scanChar() {
426 // '\'' opening already consumed
434 if ch
== '\n' || ch
< 0 {
435 S
.error(offs
, "character literal not terminated")
447 S
.error(offs
, "illegal character literal")
452 func (S
*Scanner
) scanString() {
453 // '"' opening already consumed
459 if ch
== '\n' || ch
< 0 {
460 S
.error(offs
, "string not terminated")
472 func (S
*Scanner
) scanRawString() {
473 // '`' opening already consumed
480 S
.error(offs
, "string not terminated")
489 func (S
*Scanner
) skipWhitespace() {
490 for S
.ch
== ' ' || S
.ch
== '\t' || S
.ch
== '\n' && !S
.insertSemi || S
.ch
== '\r' {
496 // Helper functions for scanning multi-byte tokens such as >> += >>= .
497 // Different routines recognize different length tok_i based on matches
498 // of ch_i. If a token ends in '=', the result is tok1 or tok3
499 // respectively. Otherwise, the result is tok0 if there was no other
500 // matching character, or tok2 if the matching character was ch2.
502 func (S
*Scanner
) switch2(tok0
, tok1 token
.Token
) token
.Token
{
511 func (S
*Scanner
) switch3(tok0
, tok1 token
.Token
, ch2
int, tok2 token
.Token
) token
.Token
{
524 func (S
*Scanner
) switch4(tok0
, tok1 token
.Token
, ch2
int, tok2
, tok3 token
.Token
) token
.Token
{
// Scan scans the next token and returns the token position,
// the token, and the literal string corresponding to the
// token. The source end is indicated by token.EOF.
//
// If the returned token is token.SEMICOLON, the corresponding
// literal string is ";" if the semicolon was present in the source,
// and "\n" if the semicolon was inserted because of a newline or
// at EOF.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
561 func (S
*Scanner
) Scan() (token
.Pos
, token
.Token
, string) {
565 // current token start
570 // determine token value
573 tok
= S
.scanIdentifier()
575 case token
.IDENT
, token
.BREAK
, token
.CONTINUE
, token
.FALLTHROUGH
, token
.RETURN
:
578 case digitVal(ch
) < 10:
580 tok
= S
.scanNumber(false)
582 S
.next() // always make progress
586 S
.insertSemi
= false // EOF consumed
587 return S
.file
.Pos(offs
), token
.SEMICOLON
, "\n"
591 // we only reach here if S.insertSemi was
592 // set in the first place and exited early
593 // from S.skipWhitespace()
594 S
.insertSemi
= false // newline consumed
595 return S
.file
.Pos(offs
), token
.SEMICOLON
, "\n"
609 tok
= S
.switch2(token
.COLON
, token
.DEFINE
)
611 if digitVal(S
.ch
) < 10 {
613 tok
= S
.scanNumber(true)
614 } else if S
.ch
== '.' {
626 tok
= token
.SEMICOLON
643 tok
= S
.switch3(token
.ADD
, token
.ADD_ASSIGN
, '+', token
.INC
)
644 if tok
== token
.INC
{
648 tok
= S
.switch3(token
.SUB
, token
.SUB_ASSIGN
, '-', token
.DEC
)
649 if tok
== token
.DEC
{
653 tok
= S
.switch2(token
.MUL
, token
.MUL_ASSIGN
)
655 if S
.ch
== '/' || S
.ch
== '*' {
657 if S
.insertSemi
&& S
.findLineEnd() {
658 // reset position to the beginning of the comment
661 S
.rdOffset
= offs
+ 1
662 S
.insertSemi
= false // newline consumed
663 return S
.file
.Pos(offs
), token
.SEMICOLON
, "\n"
666 if S
.mode
&ScanComments
== 0 {
668 S
.insertSemi
= false // newline consumed
673 tok
= S
.switch2(token
.QUO
, token
.QUO_ASSIGN
)
676 tok
= S
.switch2(token
.REM
, token
.REM_ASSIGN
)
678 tok
= S
.switch2(token
.XOR
, token
.XOR_ASSIGN
)
684 tok
= S
.switch4(token
.LSS
, token
.LEQ
, '<', token
.SHL
, token
.SHL_ASSIGN
)
687 tok
= S
.switch4(token
.GTR
, token
.GEQ
, '>', token
.SHR
, token
.SHR_ASSIGN
)
689 tok
= S
.switch2(token
.ASSIGN
, token
.EQL
)
691 tok
= S
.switch2(token
.NOT
, token
.NEQ
)
695 tok
= S
.switch2(token
.AND_NOT
, token
.AND_NOT_ASSIGN
)
697 tok
= S
.switch3(token
.AND
, token
.AND_ASSIGN
, '&', token
.LAND
)
700 tok
= S
.switch3(token
.OR
, token
.OR_ASSIGN
, '|', token
.LOR
)
702 if S
.mode
&AllowIllegalChars
== 0 {
703 S
.error(offs
, "illegal character "+charString(ch
))
705 insertSemi
= S
.insertSemi
// preserve insertSemi info
709 if S
.mode
&InsertSemis
!= 0 {
710 S
.insertSemi
= insertSemi
713 // TODO(gri): The scanner API should change such that the literal string
714 // is only valid if an actual literal was scanned. This will
715 // permit a more efficient implementation.
716 return S
.file
.Pos(offs
), tok
, string(S
.src
[offs
:S
.offset
])