libgo/go/scanner/scanner.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
// Package scanner provides a scanner and tokenizer for UTF-8-encoded text.
// It takes an io.Reader providing the source, which then can be tokenized
// through repeated calls to the Scan function.  For compatibility with
// existing tools, the NUL character is not allowed (implementation restriction).
   9 //
  10 // By default, a Scanner skips white space and Go comments and recognizes all
  11 // literals as defined by the Go language specification.  It may be
  12 // customized to recognize only a subset of those literals and to recognize
  13 // different white space characters.
  14 //
  15 // Basic usage pattern:
  16 //
  17 //      var s scanner.Scanner
  18 //      s.Init(src)
  19 //      tok := s.Scan()
  20 //      for tok != scanner.EOF {
  21 //              // do something with tok
  22 //              tok = s.Scan()
  23 //      }
  24 //
  25 package scanner
  26
  27 import (
  28         "bytes"
  29         "fmt"
  30         "io"
  31         "os"
  32         "unicode"
  33         "utf8"
  34 )
  35
  36
  37 // A source position is represented by a Position value.
  38 // A position is valid if Line > 0.
  39 type Position struct {
  40         Filename string // filename, if any
  41         Offset   int    // byte offset, starting at 0
  42         Line     int    // line number, starting at 1
  43         Column   int    // column number, starting at 0 (character count per line)
  44 }
  45
  46
  47 // IsValid returns true if the position is valid.
  48 func (pos *Position) IsValid() bool { return pos.Line > 0 }
  49
  50
  51 func (pos Position) String() string {
  52         s := pos.Filename
  53         if pos.IsValid() {
  54                 if s != "" {
  55                         s += ":"
  56                 }
  57                 s += fmt.Sprintf("%d:%d", pos.Line, pos.Column)
  58         }
  59         if s == "" {
  60                 s = "???"
  61         }
  62         return s
  63 }
  64
  65
  66 // Predefined mode bits to control recognition of tokens. For instance,
  67 // to configure a Scanner such that it only recognizes (Go) identifiers,
  68 // integers, and skips comments, set the Scanner's Mode field to:
  69 //
  70 //      ScanIdents | ScanInts | SkipComments
  71 //
  72 const (
  73         ScanIdents     = 1 << -Ident
  74         ScanInts       = 1 << -Int
  75         ScanFloats     = 1 << -Float // includes Ints
  76         ScanChars      = 1 << -Char
  77         ScanStrings    = 1 << -String
  78         ScanRawStrings = 1 << -RawString
  79         ScanComments   = 1 << -Comment
  80         SkipComments   = 1 << -skipComment // if set with ScanComments, comments become white space
  81         GoTokens       = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
  82 )
  83
  84
  85 // The result of Scan is one of the following tokens or a Unicode character.
  86 const (
  87         EOF = -(iota + 1)
  88         Ident
  89         Int
  90         Float
  91         Char
  92         String
  93         RawString
  94         Comment
  95         skipComment
  96 )
  97
  98
  99 var tokenString = map[int]string{
 100         EOF:       "EOF",
 101         Ident:     "Ident",
 102         Int:       "Int",
 103         Float:     "Float",
 104         Char:      "Char",
 105         String:    "String",
 106         RawString: "RawString",
 107         Comment:   "Comment",
 108 }
 109
 110
 111 // TokenString returns a (visible) string for a token or Unicode character.
 112 func TokenString(tok int) string {
 113         if s, found := tokenString[tok]; found {
 114                 return s
 115         }
 116         return fmt.Sprintf("U+%04X", tok)
 117 }
 118
 119
 120 // GoWhitespace is the default value for the Scanner's Whitespace field.
 121 // Its value selects Go's white space characters.
 122 const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' '
 123
 124
// bufLen is the number of source bytes buffered at a time in Scanner.srcBuf.
const bufLen = 1024 // at least utf8.UTFMax

// A Scanner implements reading of Unicode characters and tokens from an io.Reader.
// Init must be called first; because Init resets Error, ErrorCount, Mode and
// Whitespace, those fields are customized after the call to Init.
type Scanner struct {
	// Input
	src io.Reader

	// Source buffer
	// srcBuf[srcPos:srcEnd] holds the bytes read from src but not yet
	// consumed; srcBuf[srcEnd] holds a sentinel byte (utf8.RuneSelf) so
	// that next() detects both non-ASCII input and end-of-buffer with a
	// single comparison.
	srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next()
	srcPos int              // reading position (srcBuf index)
	srcEnd int              // source end (srcBuf index)

	// Source position
	srcBufOffset int // byte offset of srcBuf[0] in source
	line         int // newline count + 1
	column       int // character count on line

	// Token text buffer
	// Typically, token text is stored completely in srcBuf, but in general
	// the token text's head may be buffered in tokBuf while the token text's
	// tail is stored in srcBuf.
	// A negative tokPos means that no token text is being collected.
	tokBuf bytes.Buffer // token text head that is not in srcBuf anymore
	tokPos int          // token text tail position (srcBuf index)
	tokEnd int          // token text tail end (srcBuf index)

	// One character look-ahead
	// ch is the value returned by Peek and the next value returned by Next.
	ch int // character before current srcPos

	// Error is called for each error encountered. If no Error
	// function is set, the error is reported to os.Stderr.
	Error func(s *Scanner, msg string)

	// ErrorCount is incremented by one for each error encountered.
	ErrorCount int

	// The Mode field controls which tokens are recognized. For instance,
	// to recognize Ints, set the ScanInts bit in Mode. The field may be
	// changed at any time.
	Mode uint

	// The Whitespace field controls which characters are recognized
	// as white space. To recognize a character ch <= ' ' as white space,
	// set the ch'th bit in Whitespace (the Scanner's behavior is undefined
	// for values ch > ' '). The field may be changed at any time.
	Whitespace uint64

	// Current token position. The Offset, Line, and Column fields
	// are set by Scan(); the Filename field is left untouched by the
	// Scanner.
	Position
}
 176
 177
 178 // Init initializes a Scanner with a new source and returns itself.
 179 // Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens,
 180 // and Whitespace is set to GoWhitespace.
 181 func (s *Scanner) Init(src io.Reader) *Scanner {
 182         s.src = src
 183
 184         // initialize source buffer
 185         s.srcBuf[0] = utf8.RuneSelf // sentinel
 186         s.srcPos = 0
 187         s.srcEnd = 0
 188
 189         // initialize source position
 190         s.srcBufOffset = 0
 191         s.line = 1
 192         s.column = 0
 193
 194         // initialize token text buffer
 195         s.tokPos = -1
 196
 197         // initialize one character look-ahead
 198         s.ch = s.next()
 199
 200         // initialize public fields
 201         s.Error = nil
 202         s.ErrorCount = 0
 203         s.Mode = GoTokens
 204         s.Whitespace = GoWhitespace
 205
 206         return s
 207 }
 208
 209
 210 // next reads and returns the next Unicode character. It is designed such
 211 // that only a minimal amount of work needs to be done in the common ASCII
 212 // case (one test to check for both ASCII and end-of-buffer, and one test
 213 // to check for newlines).
 214 func (s *Scanner) next() int {
 215         ch := int(s.srcBuf[s.srcPos])
 216
 217         if ch >= utf8.RuneSelf {
 218                 // uncommon case: not ASCII or not enough bytes
 219                 for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
 220                         // not enough bytes: read some more, but first
 221                         // save away token text if any
 222                         if s.tokPos >= 0 {
 223                                 s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
 224                                 s.tokPos = 0
 225                         }
 226                         // move unread bytes to beginning of buffer
 227                         copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
 228                         s.srcBufOffset += s.srcPos
 229                         // read more bytes
 230                         i := s.srcEnd - s.srcPos
 231                         n, err := s.src.Read(s.srcBuf[i:bufLen])
 232                         s.srcEnd = i + n
 233                         s.srcPos = 0
 234                         s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
 235                         if err != nil {
 236                                 if s.srcEnd == 0 {
 237                                         return EOF
 238                                 }
 239                                 if err != os.EOF {
 240                                         s.error(err.String())
 241                                         break
 242                                 }
 243                         }
 244                 }
 245                 // at least one byte
 246                 ch = int(s.srcBuf[s.srcPos])
 247                 if ch >= utf8.RuneSelf {
 248                         // uncommon case: not ASCII
 249                         var width int
 250                         ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
 251                         if ch == utf8.RuneError && width == 1 {
 252                                 s.error("illegal UTF-8 encoding")
 253                         }
 254                         s.srcPos += width - 1
 255                 }
 256         }
 257
 258         s.srcPos++
 259         s.column++
 260         switch ch {
 261         case 0:
 262                 // implementation restriction for compatibility with other tools
 263                 s.error("illegal character NUL")
 264         case '\n':
 265                 s.line++
 266                 s.column = 0
 267         }
 268
 269         return ch
 270 }
 271
 272
 273 // Next reads and returns the next Unicode character.
 274 // It returns EOF at the end of the source. It reports
 275 // a read error by calling s.Error, if set, or else
 276 // prints an error message to os.Stderr. Next does not
 277 // update the Scanner's Position field; use Pos() to
 278 // get the current position.
 279 func (s *Scanner) Next() int {
 280         s.tokPos = -1 // don't collect token text
 281         ch := s.ch
 282         s.ch = s.next()
 283         return ch
 284 }
 285
 286
 287 // Peek returns the next Unicode character in the source without advancing
 288 // the scanner. It returns EOF if the scanner's position is at the last
 289 // character of the source.
 290 func (s *Scanner) Peek() int {
 291         return s.ch
 292 }
 293
 294
 295 func (s *Scanner) error(msg string) {
 296         s.ErrorCount++
 297         if s.Error != nil {
 298                 s.Error(s, msg)
 299                 return
 300         }
 301         fmt.Fprintf(os.Stderr, "%s: %s", s.Position, msg)
 302 }
 303
 304
 305 func (s *Scanner) scanIdentifier() int {
 306         ch := s.next() // read character after first '_' or letter
 307         for ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) {
 308                 ch = s.next()
 309         }
 310         return ch
 311 }
 312
 313
 314 func digitVal(ch int) int {
 315         switch {
 316         case '0' <= ch && ch <= '9':
 317                 return ch - '0'
 318         case 'a' <= ch && ch <= 'f':
 319                 return ch - 'a' + 10
 320         case 'A' <= ch && ch <= 'F':
 321                 return ch - 'A' + 10
 322         }
 323         return 16 // larger than any legal digit val
 324 }
 325
 326
 327 func isDecimal(ch int) bool { return '0' <= ch && ch <= '9' }
 328
 329
 330 func (s *Scanner) scanMantissa(ch int) int {
 331         for isDecimal(ch) {
 332                 ch = s.next()
 333         }
 334         return ch
 335 }
 336
 337
 338 func (s *Scanner) scanFraction(ch int) int {
 339         if ch == '.' {
 340                 ch = s.scanMantissa(s.next())
 341         }
 342         return ch
 343 }
 344
 345
 346 func (s *Scanner) scanExponent(ch int) int {
 347         if ch == 'e' || ch == 'E' {
 348                 ch = s.next()
 349                 if ch == '-' || ch == '+' {
 350                         ch = s.next()
 351                 }
 352                 ch = s.scanMantissa(ch)
 353         }
 354         return ch
 355 }
 356
 357
 358 func (s *Scanner) scanNumber(ch int) (int, int) {
 359         // isDecimal(ch)
 360         if ch == '0' {
 361                 // int or float
 362                 ch = s.next()
 363                 if ch == 'x' || ch == 'X' {
 364                         // hexadecimal int
 365                         ch = s.next()
 366                         for digitVal(ch) < 16 {
 367                                 ch = s.next()
 368                         }
 369                 } else {
 370                         // octal int or float
 371                         seenDecimalDigit := false
 372                         for isDecimal(ch) {
 373                                 if ch > '7' {
 374                                         seenDecimalDigit = true
 375                                 }
 376                                 ch = s.next()
 377                         }
 378                         if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
 379                                 // float
 380                                 ch = s.scanFraction(ch)
 381                                 ch = s.scanExponent(ch)
 382                                 return Float, ch
 383                         }
 384                         // octal int
 385                         if seenDecimalDigit {
 386                                 s.error("illegal octal number")
 387                         }
 388                 }
 389                 return Int, ch
 390         }
 391         // decimal int or float
 392         ch = s.scanMantissa(ch)
 393         if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
 394                 // float
 395                 ch = s.scanFraction(ch)
 396                 ch = s.scanExponent(ch)
 397                 return Float, ch
 398         }
 399         return Int, ch
 400 }
 401
 402
 403 func (s *Scanner) scanDigits(ch, base, n int) int {
 404         for n > 0 && digitVal(ch) < base {
 405                 ch = s.next()
 406                 n--
 407         }
 408         if n > 0 {
 409                 s.error("illegal char escape")
 410         }
 411         return ch
 412 }
 413
 414
 415 func (s *Scanner) scanEscape(quote int) int {
 416         ch := s.next() // read character after '/'
 417         switch ch {
 418         case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
 419                 // nothing to do
 420                 ch = s.next()
 421         case '0', '1', '2', '3', '4', '5', '6', '7':
 422                 ch = s.scanDigits(ch, 8, 3)
 423         case 'x':
 424                 ch = s.scanDigits(s.next(), 16, 2)
 425         case 'u':
 426                 ch = s.scanDigits(s.next(), 16, 4)
 427         case 'U':
 428                 ch = s.scanDigits(s.next(), 16, 8)
 429         default:
 430                 s.error("illegal char escape")
 431         }
 432         return ch
 433 }
 434
 435
 436 func (s *Scanner) scanString(quote int) (n int) {
 437         ch := s.next() // read character after quote
 438         for ch != quote {
 439                 if ch == '\n' || ch < 0 {
 440                         s.error("literal not terminated")
 441                         return
 442                 }
 443                 if ch == '\\' {
 444                         ch = s.scanEscape(quote)
 445                 } else {
 446                         ch = s.next()
 447                 }
 448                 n++
 449         }
 450         return
 451 }
 452
 453
 454 func (s *Scanner) scanRawString() {
 455         ch := s.next() // read character after '`'
 456         for ch != '`' {
 457                 if ch < 0 {
 458                         s.error("literal not terminated")
 459                         return
 460                 }
 461                 ch = s.next()
 462         }
 463 }
 464
 465
 466 func (s *Scanner) scanChar() {
 467         if s.scanString('\'') != 1 {
 468                 s.error("illegal char literal")
 469         }
 470 }
 471
 472
 473 func (s *Scanner) scanLineComment() {
 474         ch := s.next() // read character after "//"
 475         for ch != '\n' {
 476                 if ch < 0 {
 477                         s.error("comment not terminated")
 478                         return
 479                 }
 480                 ch = s.next()
 481         }
 482 }
 483
 484
 485 func (s *Scanner) scanGeneralComment() {
 486         ch := s.next() // read character after "/*"
 487         for {
 488                 if ch < 0 {
 489                         s.error("comment not terminated")
 490                         return
 491                 }
 492                 ch0 := ch
 493                 ch = s.next()
 494                 if ch0 == '*' && ch == '/' {
 495                         break
 496                 }
 497         }
 498 }
 499
 500
 501 func (s *Scanner) scanComment(ch int) {
 502         // ch == '/' || ch == '*'
 503         if ch == '/' {
 504                 s.scanLineComment()
 505                 return
 506         }
 507         s.scanGeneralComment()
 508 }
 509
 510
 511 // Scan reads the next token or Unicode character from source and returns it.
 512 // It only recognizes tokens t for which the respective Mode bit (1<<-t) is set.
 513 // It returns EOF at the end of the source. It reports scanner errors (read and
 514 // token errors) by calling s.Error, if set; otherwise it prints an error message
 515 // to os.Stderr.
 516 func (s *Scanner) Scan() int {
 517         ch := s.ch
 518
 519         // reset token text position
 520         s.tokPos = -1
 521
 522 redo:
 523         // skip white space
 524         for s.Whitespace&(1<<uint(ch)) != 0 {
 525                 ch = s.next()
 526         }
 527
 528         // start collecting token text
 529         s.tokBuf.Reset()
 530         s.tokPos = s.srcPos - 1
 531
 532         // set token position
 533         s.Offset = s.srcBufOffset + s.tokPos
 534         s.Line = s.line
 535         s.Column = s.column
 536
 537         // determine token value
 538         tok := ch
 539         switch {
 540         case unicode.IsLetter(ch) || ch == '_':
 541                 if s.Mode&ScanIdents != 0 {
 542                         tok = Ident
 543                         ch = s.scanIdentifier()
 544                 } else {
 545                         ch = s.next()
 546                 }
 547         case isDecimal(ch):
 548                 if s.Mode&(ScanInts|ScanFloats) != 0 {
 549                         tok, ch = s.scanNumber(ch)
 550                 } else {
 551                         ch = s.next()
 552                 }
 553         default:
 554                 switch ch {
 555                 case '"':
 556                         if s.Mode&ScanStrings != 0 {
 557                                 s.scanString('"')
 558                                 tok = String
 559                         }
 560                         ch = s.next()
 561                 case '\'':
 562                         if s.Mode&ScanChars != 0 {
 563                                 s.scanChar()
 564                                 tok = Char
 565                         }
 566                         ch = s.next()
 567                 case '.':
 568                         ch = s.next()
 569                         if isDecimal(ch) && s.Mode&ScanFloats != 0 {
 570                                 tok = Float
 571                                 ch = s.scanMantissa(ch)
 572                                 ch = s.scanExponent(ch)
 573                         }
 574                 case '/':
 575                         ch = s.next()
 576                         if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 {
 577                                 if s.Mode&SkipComments != 0 {
 578                                         s.tokPos = -1 // don't collect token text
 579                                         s.scanComment(ch)
 580                                         ch = s.next()
 581                                         goto redo
 582                                 }
 583                                 s.scanComment(ch)
 584                                 tok = Comment
 585                                 ch = s.next()
 586                         }
 587                 case '`':
 588                         if s.Mode&ScanRawStrings != 0 {
 589                                 s.scanRawString()
 590                                 tok = String
 591                         }
 592                         ch = s.next()
 593                 default:
 594                         ch = s.next()
 595                 }
 596         }
 597
 598         // end of token text
 599         s.tokEnd = s.srcPos - 1
 600
 601         s.ch = ch
 602         return tok
 603 }
 604
 605
 606 // Position returns the current source position. If called before Next()
 607 // or Scan(), it returns the position of the next Unicode character or token
 608 // returned by these functions. If called afterwards, it returns the position
 609 // immediately after the last character of the most recent token or character
 610 // scanned.
 611 func (s *Scanner) Pos() Position {
 612         return Position{
 613                 s.Filename,
 614                 s.srcBufOffset + s.srcPos - 1,
 615                 s.line,
 616                 s.column,
 617         }
 618 }
 619
 620
 621 // TokenText returns the string corresponding to the most recently scanned token.
 622 // Valid after calling Scan().
 623 func (s *Scanner) TokenText() string {
 624         if s.tokPos < 0 {
 625                 // no token text
 626                 return ""
 627         }
 628
 629         if s.tokEnd < 0 {
 630                 // if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0)
 631                 s.tokEnd = s.tokPos
 632         }
 633
 634         if s.tokBuf.Len() == 0 {
 635                 // common case: the entire token text is still in srcBuf
 636                 return string(s.srcBuf[s.tokPos:s.tokEnd])
 637         }
 638
 639         // part of the token text was saved in tokBuf: save the rest in
 640         // tokBuf as well and return its content
 641         s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd])
 642         s.tokPos = s.tokEnd // ensure idempotency of TokenText() call
 643         return s.tokBuf.String()
 644 }