Update to current Go library.
[official-gcc.git] / libgo / go / go / scanner / scanner.go
blob2f949ad25689660b7369a3d6e252809286f9340e
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// A scanner for Go source text. Takes a []byte as source which can
// then be tokenized through repeated calls to the Scan function.
// Typical use:
//
//	var s Scanner
//	fset := token.NewFileSet()  // position information is relative to fset
//	file := fset.AddFile(filename, fset.Base(), len(src))  // register file
//	s.Init(file, src, nil /* no error handler */, 0)
//	for {
//		pos, tok, lit := s.Scan()
//		if tok == token.EOF {
//			break
//		}
//		// do something here with pos, tok, and lit
//	}
//
package scanner
23 import (
24 "bytes"
25 "go/token"
26 "path/filepath"
27 "strconv"
28 "unicode"
29 "utf8"
33 // A Scanner holds the scanner's internal state while processing
34 // a given text. It can be allocated as part of another data
35 // structure but must be initialized via Init before use.
37 type Scanner struct {
38 // immutable state
39 file *token.File // source file handle
40 dir string // directory portion of file.Name()
41 src []byte // source
42 err ErrorHandler // error reporting; or nil
43 mode uint // scanning mode
45 // scanning state
46 ch int // current character
47 offset int // character offset
48 rdOffset int // reading offset (position after current character)
49 lineOffset int // current line offset
50 insertSemi bool // insert a semicolon before next newline
52 // public state - ok to modify
53 ErrorCount int // number of errors encountered
57 // Read the next Unicode char into S.ch.
58 // S.ch < 0 means end-of-file.
60 func (S *Scanner) next() {
61 if S.rdOffset < len(S.src) {
62 S.offset = S.rdOffset
63 if S.ch == '\n' {
64 S.lineOffset = S.offset
65 S.file.AddLine(S.offset)
67 r, w := int(S.src[S.rdOffset]), 1
68 switch {
69 case r == 0:
70 S.error(S.offset, "illegal character NUL")
71 case r >= 0x80:
72 // not ASCII
73 r, w = utf8.DecodeRune(S.src[S.rdOffset:])
74 if r == utf8.RuneError && w == 1 {
75 S.error(S.offset, "illegal UTF-8 encoding")
78 S.rdOffset += w
79 S.ch = r
80 } else {
81 S.offset = len(S.src)
82 if S.ch == '\n' {
83 S.lineOffset = S.offset
84 S.file.AddLine(S.offset)
86 S.ch = -1 // eof
// The mode parameter to the Init function is a set of flags (or 0).
// They control scanner behavior. The flags are distinct bits and may
// be combined with the | operator.
//
const (
	ScanComments      = 1 << iota // return comments as COMMENT tokens
	AllowIllegalChars             // do not report an error for illegal chars
	InsertSemis                   // automatically insert semicolons
)
100 // Init prepares the scanner S to tokenize the text src by setting the
101 // scanner at the beginning of src. The scanner uses the file set file
102 // for position information and it adds line information for each line.
103 // It is ok to re-use the same file when re-scanning the same file as
104 // line information which is already present is ignored. Init causes a
105 // panic if the file size does not match the src size.
107 // Calls to Scan will use the error handler err if they encounter a
108 // syntax error and err is not nil. Also, for each error encountered,
109 // the Scanner field ErrorCount is incremented by one. The mode parameter
110 // determines how comments, illegal characters, and semicolons are handled.
112 // Note that Init may call err if there is an error in the first character
113 // of the file.
115 func (S *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode uint) {
116 // Explicitly initialize all fields since a scanner may be reused.
117 if file.Size() != len(src) {
118 panic("file size does not match src len")
120 S.file = file
121 S.dir, _ = filepath.Split(file.Name())
122 S.src = src
123 S.err = err
124 S.mode = mode
126 S.ch = ' '
127 S.offset = 0
128 S.rdOffset = 0
129 S.lineOffset = 0
130 S.insertSemi = false
131 S.ErrorCount = 0
133 S.next()
137 func charString(ch int) string {
138 var s string
139 switch ch {
140 case -1:
141 return `EOF`
142 case '\a':
143 s = `\a`
144 case '\b':
145 s = `\b`
146 case '\f':
147 s = `\f`
148 case '\n':
149 s = `\n`
150 case '\r':
151 s = `\r`
152 case '\t':
153 s = `\t`
154 case '\v':
155 s = `\v`
156 case '\\':
157 s = `\\`
158 case '\'':
159 s = `\'`
160 default:
161 s = string(ch)
163 return "'" + s + "' (U+" + strconv.Itob(ch, 16) + ")"
167 func (S *Scanner) error(offs int, msg string) {
168 if S.err != nil {
169 S.err.Error(S.file.Position(S.file.Pos(offs)), msg)
171 S.ErrorCount++
175 var prefix = []byte("//line ")
177 func (S *Scanner) interpretLineComment(text []byte) {
178 if bytes.HasPrefix(text, prefix) {
179 // get filename and line number, if any
180 if i := bytes.LastIndex(text, []byte{':'}); i > 0 {
181 if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
182 // valid //line filename:line comment;
183 filename := filepath.Clean(string(text[len(prefix):i]))
184 if !filepath.IsAbs(filename) {
185 // make filename relative to current directory
186 filename = filepath.Join(S.dir, filename)
188 // update scanner position
189 S.file.AddLineInfo(S.lineOffset, filename, line-1) // -1 since comment applies to next line
196 func (S *Scanner) scanComment() {
197 // initial '/' already consumed; S.ch == '/' || S.ch == '*'
198 offs := S.offset - 1 // position of initial '/'
200 if S.ch == '/' {
201 //-style comment
202 S.next()
203 for S.ch != '\n' && S.ch >= 0 {
204 S.next()
206 if offs == S.lineOffset {
207 // comment starts at the beginning of the current line
208 S.interpretLineComment(S.src[offs:S.offset])
210 return
213 /*-style comment */
214 S.next()
215 for S.ch >= 0 {
216 ch := S.ch
217 S.next()
218 if ch == '*' && S.ch == '/' {
219 S.next()
220 return
224 S.error(offs, "comment not terminated")
228 func (S *Scanner) findLineEnd() bool {
229 // initial '/' already consumed
231 defer func(offs int) {
232 // reset scanner state to where it was upon calling findLineEnd
233 S.ch = '/'
234 S.offset = offs
235 S.rdOffset = offs + 1
236 S.next() // consume initial '/' again
237 }(S.offset - 1)
239 // read ahead until a newline, EOF, or non-comment token is found
240 for S.ch == '/' || S.ch == '*' {
241 if S.ch == '/' {
242 //-style comment always contains a newline
243 return true
245 /*-style comment: look for newline */
246 S.next()
247 for S.ch >= 0 {
248 ch := S.ch
249 if ch == '\n' {
250 return true
252 S.next()
253 if ch == '*' && S.ch == '/' {
254 S.next()
255 break
258 S.skipWhitespace() // S.insertSemi is set
259 if S.ch < 0 || S.ch == '\n' {
260 return true
262 if S.ch != '/' {
263 // non-comment token
264 return false
266 S.next() // consume '/'
269 return false
273 func isLetter(ch int) bool {
274 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
278 func isDigit(ch int) bool {
279 return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
283 func (S *Scanner) scanIdentifier() token.Token {
284 offs := S.offset
285 for isLetter(S.ch) || isDigit(S.ch) {
286 S.next()
288 return token.Lookup(S.src[offs:S.offset])
// digitVal returns the numeric value of the hexadecimal digit ch
// ('0'-'9', 'a'-'f', 'A'-'F'). For any other character it returns 16,
// which is larger than any legal digit value, so that a comparison
// such as digitVal(ch) < base fails for non-digits.
//
func digitVal(ch int) int {
	switch {
	case '0' <= ch && ch <= '9':
		return ch - '0'
	case 'a' <= ch && ch <= 'f':
		return ch - 'a' + 10
	case 'A' <= ch && ch <= 'F':
		return ch - 'A' + 10
	}
	return 16 // larger than any legal digit val
}
305 func (S *Scanner) scanMantissa(base int) {
306 for digitVal(S.ch) < base {
307 S.next()
312 func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token {
313 // digitVal(S.ch) < 10
314 tok := token.INT
316 if seenDecimalPoint {
317 tok = token.FLOAT
318 S.scanMantissa(10)
319 goto exponent
322 if S.ch == '0' {
323 // int or float
324 offs := S.offset
325 S.next()
326 if S.ch == 'x' || S.ch == 'X' {
327 // hexadecimal int
328 S.next()
329 S.scanMantissa(16)
330 } else {
331 // octal int or float
332 seenDecimalDigit := false
333 S.scanMantissa(8)
334 if S.ch == '8' || S.ch == '9' {
335 // illegal octal int or float
336 seenDecimalDigit = true
337 S.scanMantissa(10)
339 if S.ch == '.' || S.ch == 'e' || S.ch == 'E' || S.ch == 'i' {
340 goto fraction
342 // octal int
343 if seenDecimalDigit {
344 S.error(offs, "illegal octal number")
347 goto exit
350 // decimal int or float
351 S.scanMantissa(10)
353 fraction:
354 if S.ch == '.' {
355 tok = token.FLOAT
356 S.next()
357 S.scanMantissa(10)
360 exponent:
361 if S.ch == 'e' || S.ch == 'E' {
362 tok = token.FLOAT
363 S.next()
364 if S.ch == '-' || S.ch == '+' {
365 S.next()
367 S.scanMantissa(10)
370 if S.ch == 'i' {
371 tok = token.IMAG
372 S.next()
375 exit:
376 return tok
380 func (S *Scanner) scanEscape(quote int) {
381 offs := S.offset
383 var i, base, max uint32
384 switch S.ch {
385 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
386 S.next()
387 return
388 case '0', '1', '2', '3', '4', '5', '6', '7':
389 i, base, max = 3, 8, 255
390 case 'x':
391 S.next()
392 i, base, max = 2, 16, 255
393 case 'u':
394 S.next()
395 i, base, max = 4, 16, unicode.MaxRune
396 case 'U':
397 S.next()
398 i, base, max = 8, 16, unicode.MaxRune
399 default:
400 S.next() // always make progress
401 S.error(offs, "unknown escape sequence")
402 return
405 var x uint32
406 for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
407 d := uint32(digitVal(S.ch))
408 if d >= base {
409 S.error(S.offset, "illegal character in escape sequence")
410 break
412 x = x*base + d
413 S.next()
415 // in case of an error, consume remaining chars
416 for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
417 S.next()
419 if x > max || 0xd800 <= x && x < 0xe000 {
420 S.error(offs, "escape sequence is invalid Unicode code point")
425 func (S *Scanner) scanChar() {
426 // '\'' opening already consumed
427 offs := S.offset - 1
429 n := 0
430 for S.ch != '\'' {
431 ch := S.ch
433 S.next()
434 if ch == '\n' || ch < 0 {
435 S.error(offs, "character literal not terminated")
436 n = 1
437 break
439 if ch == '\\' {
440 S.scanEscape('\'')
444 S.next()
446 if n != 1 {
447 S.error(offs, "illegal character literal")
452 func (S *Scanner) scanString() {
453 // '"' opening already consumed
454 offs := S.offset - 1
456 for S.ch != '"' {
457 ch := S.ch
458 S.next()
459 if ch == '\n' || ch < 0 {
460 S.error(offs, "string not terminated")
461 break
463 if ch == '\\' {
464 S.scanEscape('"')
468 S.next()
472 func (S *Scanner) scanRawString() {
473 // '`' opening already consumed
474 offs := S.offset - 1
476 for S.ch != '`' {
477 ch := S.ch
478 S.next()
479 if ch < 0 {
480 S.error(offs, "string not terminated")
481 break
485 S.next()
489 func (S *Scanner) skipWhitespace() {
490 for S.ch == ' ' || S.ch == '\t' || S.ch == '\n' && !S.insertSemi || S.ch == '\r' {
491 S.next()
496 // Helper functions for scanning multi-byte tokens such as >> += >>= .
497 // Different routines recognize different length tok_i based on matches
498 // of ch_i. If a token ends in '=', the result is tok1 or tok3
499 // respectively. Otherwise, the result is tok0 if there was no other
500 // matching character, or tok2 if the matching character was ch2.
502 func (S *Scanner) switch2(tok0, tok1 token.Token) token.Token {
503 if S.ch == '=' {
504 S.next()
505 return tok1
507 return tok0
511 func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 int, tok2 token.Token) token.Token {
512 if S.ch == '=' {
513 S.next()
514 return tok1
516 if S.ch == ch2 {
517 S.next()
518 return tok2
520 return tok0
524 func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Token) token.Token {
525 if S.ch == '=' {
526 S.next()
527 return tok1
529 if S.ch == ch2 {
530 S.next()
531 if S.ch == '=' {
532 S.next()
533 return tok3
535 return tok2
537 return tok0
541 // Scan scans the next token and returns the token position,
542 // the token, and the literal string corresponding to the
543 // token. The source end is indicated by token.EOF.
545 // If the returned token is token.SEMICOLON, the corresponding
546 // literal string is ";" if the semicolon was present in the source,
547 // and "\n" if the semicolon was inserted because of a newline or
548 // at EOF.
550 // For more tolerant parsing, Scan will return a valid token if
551 // possible even if a syntax error was encountered. Thus, even
552 // if the resulting token sequence contains no illegal tokens,
553 // a client may not assume that no error occurred. Instead it
554 // must check the scanner's ErrorCount or the number of calls
555 // of the error handler, if there was one installed.
557 // Scan adds line information to the file added to the file
558 // set with Init. Token positions are relative to that file
559 // and thus relative to the file set.
561 func (S *Scanner) Scan() (token.Pos, token.Token, string) {
562 scanAgain:
563 S.skipWhitespace()
565 // current token start
566 insertSemi := false
567 offs := S.offset
568 tok := token.ILLEGAL
570 // determine token value
571 switch ch := S.ch; {
572 case isLetter(ch):
573 tok = S.scanIdentifier()
574 switch tok {
575 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
576 insertSemi = true
578 case digitVal(ch) < 10:
579 insertSemi = true
580 tok = S.scanNumber(false)
581 default:
582 S.next() // always make progress
583 switch ch {
584 case -1:
585 if S.insertSemi {
586 S.insertSemi = false // EOF consumed
587 return S.file.Pos(offs), token.SEMICOLON, "\n"
589 tok = token.EOF
590 case '\n':
591 // we only reach here if S.insertSemi was
592 // set in the first place and exited early
593 // from S.skipWhitespace()
594 S.insertSemi = false // newline consumed
595 return S.file.Pos(offs), token.SEMICOLON, "\n"
596 case '"':
597 insertSemi = true
598 tok = token.STRING
599 S.scanString()
600 case '\'':
601 insertSemi = true
602 tok = token.CHAR
603 S.scanChar()
604 case '`':
605 insertSemi = true
606 tok = token.STRING
607 S.scanRawString()
608 case ':':
609 tok = S.switch2(token.COLON, token.DEFINE)
610 case '.':
611 if digitVal(S.ch) < 10 {
612 insertSemi = true
613 tok = S.scanNumber(true)
614 } else if S.ch == '.' {
615 S.next()
616 if S.ch == '.' {
617 S.next()
618 tok = token.ELLIPSIS
620 } else {
621 tok = token.PERIOD
623 case ',':
624 tok = token.COMMA
625 case ';':
626 tok = token.SEMICOLON
627 case '(':
628 tok = token.LPAREN
629 case ')':
630 insertSemi = true
631 tok = token.RPAREN
632 case '[':
633 tok = token.LBRACK
634 case ']':
635 insertSemi = true
636 tok = token.RBRACK
637 case '{':
638 tok = token.LBRACE
639 case '}':
640 insertSemi = true
641 tok = token.RBRACE
642 case '+':
643 tok = S.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
644 if tok == token.INC {
645 insertSemi = true
647 case '-':
648 tok = S.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
649 if tok == token.DEC {
650 insertSemi = true
652 case '*':
653 tok = S.switch2(token.MUL, token.MUL_ASSIGN)
654 case '/':
655 if S.ch == '/' || S.ch == '*' {
656 // comment
657 if S.insertSemi && S.findLineEnd() {
658 // reset position to the beginning of the comment
659 S.ch = '/'
660 S.offset = offs
661 S.rdOffset = offs + 1
662 S.insertSemi = false // newline consumed
663 return S.file.Pos(offs), token.SEMICOLON, "\n"
665 S.scanComment()
666 if S.mode&ScanComments == 0 {
667 // skip comment
668 S.insertSemi = false // newline consumed
669 goto scanAgain
671 tok = token.COMMENT
672 } else {
673 tok = S.switch2(token.QUO, token.QUO_ASSIGN)
675 case '%':
676 tok = S.switch2(token.REM, token.REM_ASSIGN)
677 case '^':
678 tok = S.switch2(token.XOR, token.XOR_ASSIGN)
679 case '<':
680 if S.ch == '-' {
681 S.next()
682 tok = token.ARROW
683 } else {
684 tok = S.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
686 case '>':
687 tok = S.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
688 case '=':
689 tok = S.switch2(token.ASSIGN, token.EQL)
690 case '!':
691 tok = S.switch2(token.NOT, token.NEQ)
692 case '&':
693 if S.ch == '^' {
694 S.next()
695 tok = S.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
696 } else {
697 tok = S.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
699 case '|':
700 tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
701 default:
702 if S.mode&AllowIllegalChars == 0 {
703 S.error(offs, "illegal character "+charString(ch))
705 insertSemi = S.insertSemi // preserve insertSemi info
709 if S.mode&InsertSemis != 0 {
710 S.insertSemi = insertSemi
713 // TODO(gri): The scanner API should change such that the literal string
714 // is only valid if an actual literal was scanned. This will
715 // permit a more efficient implementation.
716 return S.file.Pos(offs), tok, string(S.src[offs:S.offset])