libgo/go/go/scanner/scanner.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // A scanner for Go source text. Takes a []byte as source which can
   6 // then be tokenized through repeated calls to the Scan function.
   7 // Typical use:
   8 //
   9 //      var s Scanner
  10 //      fset := token.NewFileSet()  // position information is relative to fset
  11 //      file := fset.AddFile(filename, fset.Base(), len(src))  // register file
  12 //      s.Init(file, src, nil /* no error handler */, 0)
  13 //      for {
  14 //              pos, tok, lit := s.Scan()
  15 //              if tok == token.EOF {
  16 //                      break
  17 //              }
  18 //              // do something here with pos, tok, and lit
  19 //      }
  20 //
  21 package scanner
  22
  23 import (
  24         "bytes"
  25         "go/token"
  26         "path/filepath"
  27         "strconv"
  28         "unicode"
  29         "utf8"
  30 )
  31
  32
// A Scanner holds the scanner's internal state while processing
// a given text.  It can be allocated as part of another data
// structure but must be initialized via Init before use.
//
// The unexported fields are split into state that is set once by Init
// and state that changes as characters and tokens are consumed.
//
type Scanner struct {
	// immutable state (assigned in Init, only read afterwards)
	file *token.File  // source file handle; receives line information
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode uint         // scanning mode (bit set of ScanComments, AllowIllegalChars, InsertSemis)

	// scanning state
	ch         int  // current character; < 0 means end-of-file
	offset     int  // character offset (byte offset of ch in src)
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // current line offset (byte offset at which the current line starts)
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
  55
  56
  57 // Read the next Unicode char into S.ch.
  58 // S.ch < 0 means end-of-file.
  59 //
  60 func (S *Scanner) next() {
  61         if S.rdOffset < len(S.src) {
  62                 S.offset = S.rdOffset
  63                 if S.ch == '\n' {
  64                         S.lineOffset = S.offset
  65                         S.file.AddLine(S.offset)
  66                 }
  67                 r, w := int(S.src[S.rdOffset]), 1
  68                 switch {
  69                 case r == 0:
  70                         S.error(S.offset, "illegal character NUL")
  71                 case r >= 0x80:
  72                         // not ASCII
  73                         r, w = utf8.DecodeRune(S.src[S.rdOffset:])
  74                         if r == utf8.RuneError && w == 1 {
  75                                 S.error(S.offset, "illegal UTF-8 encoding")
  76                         }
  77                 }
  78                 S.rdOffset += w
  79                 S.ch = r
  80         } else {
  81                 S.offset = len(S.src)
  82                 if S.ch == '\n' {
  83                         S.lineOffset = S.offset
  84                         S.file.AddLine(S.offset)
  85                 }
  86                 S.ch = -1 // eof
  87         }
  88 }
  89
  90
// The mode parameter to the Init function is a set of flags (or 0).
// They control scanner behavior. The flags are distinct bits and may
// be combined with the | operator.
//
const (
	ScanComments      = 1 << iota // return comments as COMMENT tokens (otherwise Scan skips them)
	AllowIllegalChars             // do not report an error for illegal chars
	InsertSemis                   // automatically insert semicolons
)
  99
 100 // Init prepares the scanner S to tokenize the text src by setting the
 101 // scanner at the beginning of src. The scanner uses the file set file
 102 // for position information and it adds line information for each line.
 103 // It is ok to re-use the same file when re-scanning the same file as
 104 // line information which is already present is ignored. Init causes a
 105 // panic if the file size does not match the src size.
 106 //
 107 // Calls to Scan will use the error handler err if they encounter a
 108 // syntax error and err is not nil. Also, for each error encountered,
 109 // the Scanner field ErrorCount is incremented by one. The mode parameter
 110 // determines how comments, illegal characters, and semicolons are handled.
 111 //
 112 // Note that Init may call err if there is an error in the first character
 113 // of the file.
 114 //
 115 func (S *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode uint) {
 116         // Explicitly initialize all fields since a scanner may be reused.
 117         if file.Size() != len(src) {
 118                 panic("file size does not match src len")
 119         }
 120         S.file = file
 121         S.dir, _ = filepath.Split(file.Name())
 122         S.src = src
 123         S.err = err
 124         S.mode = mode
 125
 126         S.ch = ' '
 127         S.offset = 0
 128         S.rdOffset = 0
 129         S.lineOffset = 0
 130         S.insertSemi = false
 131         S.ErrorCount = 0
 132
 133         S.next()
 134 }
 135
 136
 137 func charString(ch int) string {
 138         var s string
 139         switch ch {
 140         case -1:
 141                 return `EOF`
 142         case '\a':
 143                 s = `\a`
 144         case '\b':
 145                 s = `\b`
 146         case '\f':
 147                 s = `\f`
 148         case '\n':
 149                 s = `\n`
 150         case '\r':
 151                 s = `\r`
 152         case '\t':
 153                 s = `\t`
 154         case '\v':
 155                 s = `\v`
 156         case '\\':
 157                 s = `\\`
 158         case '\'':
 159                 s = `\'`
 160         default:
 161                 s = string(ch)
 162         }
 163         return "'" + s + "' (U+" + strconv.Itob(ch, 16) + ")"
 164 }
 165
 166
 167 func (S *Scanner) error(offs int, msg string) {
 168         if S.err != nil {
 169                 S.err.Error(S.file.Position(S.file.Pos(offs)), msg)
 170         }
 171         S.ErrorCount++
 172 }
 173
 174
 175 var prefix = []byte("//line ")
 176
 177 func (S *Scanner) interpretLineComment(text []byte) {
 178         if bytes.HasPrefix(text, prefix) {
 179                 // get filename and line number, if any
 180                 if i := bytes.LastIndex(text, []byte{':'}); i > 0 {
 181                         if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
 182                                 // valid //line filename:line comment;
 183                                 filename := filepath.Clean(string(text[len(prefix):i]))
 184                                 if !filepath.IsAbs(filename) {
 185                                         // make filename relative to current directory
 186                                         filename = filepath.Join(S.dir, filename)
 187                                 }
 188                                 // update scanner position
 189                                 S.file.AddLineInfo(S.lineOffset, filename, line-1) // -1 since comment applies to next line
 190                         }
 191                 }
 192         }
 193 }
 194
 195
 196 func (S *Scanner) scanComment() {
 197         // initial '/' already consumed; S.ch == '/' || S.ch == '*'
 198         offs := S.offset - 1 // position of initial '/'
 199
 200         if S.ch == '/' {
 201                 //-style comment
 202                 S.next()
 203                 for S.ch != '\n' && S.ch >= 0 {
 204                         S.next()
 205                 }
 206                 if offs == S.lineOffset {
 207                         // comment starts at the beginning of the current line
 208                         S.interpretLineComment(S.src[offs:S.offset])
 209                 }
 210                 return
 211         }
 212
 213         /*-style comment */
 214         S.next()
 215         for S.ch >= 0 {
 216                 ch := S.ch
 217                 S.next()
 218                 if ch == '*' && S.ch == '/' {
 219                         S.next()
 220                         return
 221                 }
 222         }
 223
 224         S.error(offs, "comment not terminated")
 225 }
 226
 227
 228 func (S *Scanner) findLineEnd() bool {
 229         // initial '/' already consumed
 230
 231         defer func(offs int) {
 232                 // reset scanner state to where it was upon calling findLineEnd
 233                 S.ch = '/'
 234                 S.offset = offs
 235                 S.rdOffset = offs + 1
 236                 S.next() // consume initial '/' again
 237         }(S.offset - 1)
 238
 239         // read ahead until a newline, EOF, or non-comment token is found
 240         for S.ch == '/' || S.ch == '*' {
 241                 if S.ch == '/' {
 242                         //-style comment always contains a newline
 243                         return true
 244                 }
 245                 /*-style comment: look for newline */
 246                 S.next()
 247                 for S.ch >= 0 {
 248                         ch := S.ch
 249                         if ch == '\n' {
 250                                 return true
 251                         }
 252                         S.next()
 253                         if ch == '*' && S.ch == '/' {
 254                                 S.next()
 255                                 break
 256                         }
 257                 }
 258                 S.skipWhitespace() // S.insertSemi is set
 259                 if S.ch < 0 || S.ch == '\n' {
 260                         return true
 261                 }
 262                 if S.ch != '/' {
 263                         // non-comment token
 264                         return false
 265                 }
 266                 S.next() // consume '/'
 267         }
 268
 269         return false
 270 }
 271
 272
 273 func isLetter(ch int) bool {
 274         return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
 275 }
 276
 277
 278 func isDigit(ch int) bool {
 279         return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
 280 }
 281
 282
 283 func (S *Scanner) scanIdentifier() token.Token {
 284         offs := S.offset
 285         for isLetter(S.ch) || isDigit(S.ch) {
 286                 S.next()
 287         }
 288         return token.Lookup(S.src[offs:S.offset])
 289 }
 290
 291
 292 func digitVal(ch int) int {
 293         switch {
 294         case '0' <= ch && ch <= '9':
 295                 return ch - '0'
 296         case 'a' <= ch && ch <= 'f':
 297                 return ch - 'a' + 10
 298         case 'A' <= ch && ch <= 'F':
 299                 return ch - 'A' + 10
 300         }
 301         return 16 // larger than any legal digit val
 302 }
 303
 304
 305 func (S *Scanner) scanMantissa(base int) {
 306         for digitVal(S.ch) < base {
 307                 S.next()
 308         }
 309 }
 310
 311
 312 func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token {
 313         // digitVal(S.ch) < 10
 314         tok := token.INT
 315
 316         if seenDecimalPoint {
 317                 tok = token.FLOAT
 318                 S.scanMantissa(10)
 319                 goto exponent
 320         }
 321
 322         if S.ch == '0' {
 323                 // int or float
 324                 offs := S.offset
 325                 S.next()
 326                 if S.ch == 'x' || S.ch == 'X' {
 327                         // hexadecimal int
 328                         S.next()
 329                         S.scanMantissa(16)
 330                 } else {
 331                         // octal int or float
 332                         seenDecimalDigit := false
 333                         S.scanMantissa(8)
 334                         if S.ch == '8' || S.ch == '9' {
 335                                 // illegal octal int or float
 336                                 seenDecimalDigit = true
 337                                 S.scanMantissa(10)
 338                         }
 339                         if S.ch == '.' || S.ch == 'e' || S.ch == 'E' || S.ch == 'i' {
 340                                 goto fraction
 341                         }
 342                         // octal int
 343                         if seenDecimalDigit {
 344                                 S.error(offs, "illegal octal number")
 345                         }
 346                 }
 347                 goto exit
 348         }
 349
 350         // decimal int or float
 351         S.scanMantissa(10)
 352
 353 fraction:
 354         if S.ch == '.' {
 355                 tok = token.FLOAT
 356                 S.next()
 357                 S.scanMantissa(10)
 358         }
 359
 360 exponent:
 361         if S.ch == 'e' || S.ch == 'E' {
 362                 tok = token.FLOAT
 363                 S.next()
 364                 if S.ch == '-' || S.ch == '+' {
 365                         S.next()
 366                 }
 367                 S.scanMantissa(10)
 368         }
 369
 370         if S.ch == 'i' {
 371                 tok = token.IMAG
 372                 S.next()
 373         }
 374
 375 exit:
 376         return tok
 377 }
 378
 379
 380 func (S *Scanner) scanEscape(quote int) {
 381         offs := S.offset
 382
 383         var i, base, max uint32
 384         switch S.ch {
 385         case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
 386                 S.next()
 387                 return
 388         case '0', '1', '2', '3', '4', '5', '6', '7':
 389                 i, base, max = 3, 8, 255
 390         case 'x':
 391                 S.next()
 392                 i, base, max = 2, 16, 255
 393         case 'u':
 394                 S.next()
 395                 i, base, max = 4, 16, unicode.MaxRune
 396         case 'U':
 397                 S.next()
 398                 i, base, max = 8, 16, unicode.MaxRune
 399         default:
 400                 S.next() // always make progress
 401                 S.error(offs, "unknown escape sequence")
 402                 return
 403         }
 404
 405         var x uint32
 406         for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
 407                 d := uint32(digitVal(S.ch))
 408                 if d >= base {
 409                         S.error(S.offset, "illegal character in escape sequence")
 410                         break
 411                 }
 412                 x = x*base + d
 413                 S.next()
 414         }
 415         // in case of an error, consume remaining chars
 416         for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
 417                 S.next()
 418         }
 419         if x > max || 0xd800 <= x && x < 0xe000 {
 420                 S.error(offs, "escape sequence is invalid Unicode code point")
 421         }
 422 }
 423
 424
 425 func (S *Scanner) scanChar() {
 426         // '\'' opening already consumed
 427         offs := S.offset - 1
 428
 429         n := 0
 430         for S.ch != '\'' {
 431                 ch := S.ch
 432                 n++
 433                 S.next()
 434                 if ch == '\n' || ch < 0 {
 435                         S.error(offs, "character literal not terminated")
 436                         n = 1
 437                         break
 438                 }
 439                 if ch == '\\' {
 440                         S.scanEscape('\'')
 441                 }
 442         }
 443
 444         S.next()
 445
 446         if n != 1 {
 447                 S.error(offs, "illegal character literal")
 448         }
 449 }
 450
 451
 452 func (S *Scanner) scanString() {
 453         // '"' opening already consumed
 454         offs := S.offset - 1
 455
 456         for S.ch != '"' {
 457                 ch := S.ch
 458                 S.next()
 459                 if ch == '\n' || ch < 0 {
 460                         S.error(offs, "string not terminated")
 461                         break
 462                 }
 463                 if ch == '\\' {
 464                         S.scanEscape('"')
 465                 }
 466         }
 467
 468         S.next()
 469 }
 470
 471
 472 func (S *Scanner) scanRawString() {
 473         // '`' opening already consumed
 474         offs := S.offset - 1
 475
 476         for S.ch != '`' {
 477                 ch := S.ch
 478                 S.next()
 479                 if ch < 0 {
 480                         S.error(offs, "string not terminated")
 481                         break
 482                 }
 483         }
 484
 485         S.next()
 486 }
 487
 488
 489 func (S *Scanner) skipWhitespace() {
 490         for S.ch == ' ' || S.ch == '\t' || S.ch == '\n' && !S.insertSemi || S.ch == '\r' {
 491                 S.next()
 492         }
 493 }
 494
 495
 496 // Helper functions for scanning multi-byte tokens such as >> += >>= .
 497 // Different routines recognize different length tok_i based on matches
 498 // of ch_i. If a token ends in '=', the result is tok1 or tok3
 499 // respectively. Otherwise, the result is tok0 if there was no other
 500 // matching character, or tok2 if the matching character was ch2.
 501
 502 func (S *Scanner) switch2(tok0, tok1 token.Token) token.Token {
 503         if S.ch == '=' {
 504                 S.next()
 505                 return tok1
 506         }
 507         return tok0
 508 }
 509
 510
 511 func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 int, tok2 token.Token) token.Token {
 512         if S.ch == '=' {
 513                 S.next()
 514                 return tok1
 515         }
 516         if S.ch == ch2 {
 517                 S.next()
 518                 return tok2
 519         }
 520         return tok0
 521 }
 522
 523
 524 func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Token) token.Token {
 525         if S.ch == '=' {
 526                 S.next()
 527                 return tok1
 528         }
 529         if S.ch == ch2 {
 530                 S.next()
 531                 if S.ch == '=' {
 532                         S.next()
 533                         return tok3
 534                 }
 535                 return tok2
 536         }
 537         return tok0
 538 }
 539
 540
// Scan scans the next token and returns the token position,
// the token, and the literal string corresponding to the
// token. The source end is indicated by token.EOF.
//
// If the returned token is token.SEMICOLON, the corresponding
// literal string is ";" if the semicolon was present in the source,
// and "\n" if the semicolon was inserted because of a newline or
// at EOF.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
//
func (S *Scanner) Scan() (token.Pos, token.Token, string) {
scanAgain:
	// stops at a newline if a semicolon may have to be inserted there
	S.skipWhitespace()

	// current token start
	insertSemi := false // whether a newline after this token becomes a semicolon
	offs := S.offset
	tok := token.ILLEGAL

	// determine token value
	switch ch := S.ch; {
	case isLetter(ch):
		tok = S.scanIdentifier()
		switch tok {
		case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
			insertSemi = true
		}
	case digitVal(ch) < 10:
		insertSemi = true
		tok = S.scanNumber(false)
	default:
		// ch holds the first character of the token; S.ch is now the
		// character that follows it
		S.next() // always make progress
		switch ch {
		case -1:
			if S.insertSemi {
				S.insertSemi = false // EOF consumed
				return S.file.Pos(offs), token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// we only reach here if S.insertSemi was
			// set in the first place and exited early
			// from S.skipWhitespace()
			S.insertSemi = false // newline consumed
			return S.file.Pos(offs), token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			S.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			S.scanChar()
		case '`':
			insertSemi = true
			tok = token.STRING
			S.scanRawString()
		case ':':
			tok = S.switch2(token.COLON, token.DEFINE)
		case '.':
			if digitVal(S.ch) < 10 {
				// floating-point literal starting with a '.'
				insertSemi = true
				tok = S.scanNumber(true)
			} else if S.ch == '.' {
				// two periods not followed by a third leave tok as
				// token.ILLEGAL
				S.next()
				if S.ch == '.' {
					S.next()
					tok = token.ELLIPSIS
				}
			} else {
				tok = token.PERIOD
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = S.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = S.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = S.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if S.ch == '/' || S.ch == '*' {
				// comment
				if S.insertSemi && S.findLineEnd() {
					// the line ends within or right after the comment(s):
					// return the pending semicolon first and rescan the
					// comment on the next call
					// reset position to the beginning of the comment
					S.ch = '/'
					S.offset = offs
					S.rdOffset = offs + 1
					S.insertSemi = false // newline consumed
					return S.file.Pos(offs), token.SEMICOLON, "\n"
				}
				S.scanComment()
				if S.mode&ScanComments == 0 {
					// skip comment
					S.insertSemi = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
			} else {
				tok = S.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = S.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = S.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if S.ch == '-' {
				S.next()
				tok = token.ARROW
			} else {
				tok = S.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = S.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = S.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = S.switch2(token.NOT, token.NEQ)
		case '&':
			if S.ch == '^' {
				S.next()
				tok = S.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = S.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		default:
			// any other character: tok stays token.ILLEGAL
			if S.mode&AllowIllegalChars == 0 {
				S.error(offs, "illegal character "+charString(ch))
			}
			insertSemi = S.insertSemi // preserve insertSemi info
		}
	}

	// semicolon insertion state is only tracked when requested via mode
	if S.mode&InsertSemis != 0 {
		S.insertSemi = insertSemi
	}

	// TODO(gri): The scanner API should change such that the literal string
	//            is only valid if an actual literal was scanned. This will
	//            permit a more efficient implementation.
	return S.file.Pos(offs), tok, string(S.src[offs:S.offset])
}