Merge from mainline (167278:168000).
[official-gcc/graphite-test-results.git] / libgo / go / scanner / scanner.go
blob11aa9f43f33d44bb573dd1177fa96f5ce97c8517
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // A scanner and tokenizer for UTF-8-encoded text. Takes an io.Reader
6 // providing the source, which then can be tokenized through repeated calls
7 // to the Scan function. For compatibility with existing tools, the NUL
8 // character is not allowed (implementation restriction).
9 //
10 // By default, a Scanner skips white space and Go comments and recognizes all
11 // literals as defined by the Go language specification. It may be
12 // customized to recognize only a subset of those literals and to recognize
13 // different white space characters.
15 // Basic usage pattern:
17 // var s scanner.Scanner
18 // s.Init(src)
19 // tok := s.Scan()
20 // for tok != scanner.EOF {
21 // // do something with tok
22 // tok = s.Scan()
23 // }
25 package scanner
27 import (
28 "bytes"
29 "fmt"
30 "io"
31 "os"
32 "unicode"
33 "utf8"
// Position describes a location in the source text.
// A Position is considered valid when its Line is greater than zero.
type Position struct {
	// Filename is the name of the file, if any.
	Filename string
	// Offset is the byte offset, starting at 0.
	Offset int
	// Line is the line number, starting at 1.
	Line int
	// Column is the column number, starting at 0 (character count per line).
	Column int
}
47 // IsValid returns true if the position is valid.
48 func (pos *Position) IsValid() bool { return pos.Line > 0 }
51 func (pos Position) String() string {
52 s := pos.Filename
53 if pos.IsValid() {
54 if s != "" {
55 s += ":"
57 s += fmt.Sprintf("%d:%d", pos.Line, pos.Column)
59 if s == "" {
60 s = "???"
62 return s
66 // Predefined mode bits to control recognition of tokens. For instance,
67 // to configure a Scanner such that it only recognizes (Go) identifiers,
68 // integers, and skips comments, set the Scanner's Mode field to:
70 // ScanIdents | ScanInts | SkipComments
72 const (
73 ScanIdents = 1 << -Ident
74 ScanInts = 1 << -Int
75 ScanFloats = 1 << -Float // includes Ints
76 ScanChars = 1 << -Char
77 ScanStrings = 1 << -String
78 ScanRawStrings = 1 << -RawString
79 ScanComments = 1 << -Comment
80 SkipComments = 1 << -skipComment // if set with ScanComments, comments become white space
81 GoTokens = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
// Token values returned by Scan. Scan yields either one of these
// (negative) tokens or a Unicode character.
const (
	EOF = -1 - iota
	Ident
	Int
	Float
	Char
	String
	RawString
	Comment
	skipComment // internal: only used to derive the SkipComments mode bit
)
99 var tokenString = map[int]string{
100 EOF: "EOF",
101 Ident: "Ident",
102 Int: "Int",
103 Float: "Float",
104 Char: "Char",
105 String: "String",
106 RawString: "RawString",
107 Comment: "Comment",
111 // TokenString returns a (visible) string for a token or Unicode character.
112 func TokenString(tok int) string {
113 if s, found := tokenString[tok]; found {
114 return s
116 return fmt.Sprintf("U+%04X", tok)
// GoWhitespace is the default value for the Scanner's Whitespace field.
// It has one bit set for each of Go's white space characters:
// space, tab, newline, and carriage return.
const GoWhitespace = 1<<' ' | 1<<'\t' | 1<<'\n' | 1<<'\r'
// bufLen is the usable size of the Scanner's source buffer in bytes;
// it must be at least utf8.UTFMax.
const bufLen = 1 << 10
127 // A Scanner implements reading of Unicode characters and tokens from an io.Reader.
128 type Scanner struct {
129 // Input
130 src io.Reader
132 // Source buffer
133 srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next()
134 srcPos int // reading position (srcBuf index)
135 srcEnd int // source end (srcBuf index)
137 // Source position
138 srcBufOffset int // byte offset of srcBuf[0] in source
139 line int // newline count + 1
140 column int // character count on line
142 // Token text buffer
143 // Typically, token text is stored completely in srcBuf, but in general
144 // the token text's head may be buffered in tokBuf while the token text's
145 // tail is stored in srcBuf.
146 tokBuf bytes.Buffer // token text head that is not in srcBuf anymore
147 tokPos int // token text tail position (srcBuf index)
148 tokEnd int // token text tail end (srcBuf index)
150 // One character look-ahead
151 ch int // character before current srcPos
153 // Error is called for each error encountered. If no Error
154 // function is set, the error is reported to os.Stderr.
155 Error func(s *Scanner, msg string)
157 // ErrorCount is incremented by one for each error encountered.
158 ErrorCount int
160 // The Mode field controls which tokens are recognized. For instance,
161 // to recognize Ints, set the ScanInts bit in Mode. The field may be
162 // changed at any time.
163 Mode uint
165 // The Whitespace field controls which characters are recognized
166 // as white space. To recognize a character ch <= ' ' as white space,
167 // set the ch'th bit in Whitespace (the Scanner's behavior is undefined
168 // for values ch > ' '). The field may be changed at any time.
169 Whitespace uint64
171 // Current token position. The Offset, Line, and Column fields
172 // are set by Scan(); the Filename field is left untouched by the
173 // Scanner.
174 Position
178 // Init initializes a Scanner with a new source and returns itself.
179 // Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens,
180 // and Whitespace is set to GoWhitespace.
181 func (s *Scanner) Init(src io.Reader) *Scanner {
182 s.src = src
184 // initialize source buffer
185 s.srcBuf[0] = utf8.RuneSelf // sentinel
186 s.srcPos = 0
187 s.srcEnd = 0
189 // initialize source position
190 s.srcBufOffset = 0
191 s.line = 1
192 s.column = 0
194 // initialize token text buffer
195 s.tokPos = -1
197 // initialize one character look-ahead
198 s.ch = s.next()
200 // initialize public fields
201 s.Error = nil
202 s.ErrorCount = 0
203 s.Mode = GoTokens
204 s.Whitespace = GoWhitespace
206 return s
210 // next reads and returns the next Unicode character. It is designed such
211 // that only a minimal amount of work needs to be done in the common ASCII
212 // case (one test to check for both ASCII and end-of-buffer, and one test
213 // to check for newlines).
214 func (s *Scanner) next() int {
215 ch := int(s.srcBuf[s.srcPos])
217 if ch >= utf8.RuneSelf {
218 // uncommon case: not ASCII or not enough bytes
219 for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
220 // not enough bytes: read some more, but first
221 // save away token text if any
222 if s.tokPos >= 0 {
223 s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
224 s.tokPos = 0
226 // move unread bytes to beginning of buffer
227 copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
228 s.srcBufOffset += s.srcPos
229 // read more bytes
230 i := s.srcEnd - s.srcPos
231 n, err := s.src.Read(s.srcBuf[i:bufLen])
232 s.srcEnd = i + n
233 s.srcPos = 0
234 s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
235 if err != nil {
236 if s.srcEnd == 0 {
237 return EOF
239 if err != os.EOF {
240 s.error(err.String())
241 break
245 // at least one byte
246 ch = int(s.srcBuf[s.srcPos])
247 if ch >= utf8.RuneSelf {
248 // uncommon case: not ASCII
249 var width int
250 ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
251 if ch == utf8.RuneError && width == 1 {
252 s.error("illegal UTF-8 encoding")
254 s.srcPos += width - 1
258 s.srcPos++
259 s.column++
260 switch ch {
261 case 0:
262 // implementation restriction for compatibility with other tools
263 s.error("illegal character NUL")
264 case '\n':
265 s.line++
266 s.column = 0
269 return ch
273 // Next reads and returns the next Unicode character.
274 // It returns EOF at the end of the source. It reports
275 // a read error by calling s.Error, if set, or else
276 // prints an error message to os.Stderr. Next does not
277 // update the Scanner's Position field; use Pos() to
278 // get the current position.
279 func (s *Scanner) Next() int {
280 s.tokPos = -1 // don't collect token text
281 ch := s.ch
282 s.ch = s.next()
283 return ch
287 // Peek returns the next Unicode character in the source without advancing
288 // the scanner. It returns EOF if the scanner's position is at the last
289 // character of the source.
290 func (s *Scanner) Peek() int {
291 return s.ch
295 func (s *Scanner) error(msg string) {
296 s.ErrorCount++
297 if s.Error != nil {
298 s.Error(s, msg)
299 return
301 fmt.Fprintf(os.Stderr, "%s: %s", s.Position, msg)
305 func (s *Scanner) scanIdentifier() int {
306 ch := s.next() // read character after first '_' or letter
307 for ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) {
308 ch = s.next()
310 return ch
// digitVal returns the numeric value of ch interpreted as a hexadecimal
// digit (0-9, a-f, A-F). For any other character it returns 16, which is
// larger than any legal digit value.
func digitVal(ch int) int {
	if '0' <= ch && ch <= '9' {
		return ch - '0'
	}
	if 'a' <= ch && ch <= 'f' {
		return 10 + ch - 'a'
	}
	if 'A' <= ch && ch <= 'F' {
		return 10 + ch - 'A'
	}
	return 16
}
// isDecimal reports whether ch is one of the ASCII digits '0' through '9'.
func isDecimal(ch int) bool {
	return ch >= '0' && ch <= '9'
}
330 func (s *Scanner) scanMantissa(ch int) int {
331 for isDecimal(ch) {
332 ch = s.next()
334 return ch
338 func (s *Scanner) scanFraction(ch int) int {
339 if ch == '.' {
340 ch = s.scanMantissa(s.next())
342 return ch
346 func (s *Scanner) scanExponent(ch int) int {
347 if ch == 'e' || ch == 'E' {
348 ch = s.next()
349 if ch == '-' || ch == '+' {
350 ch = s.next()
352 ch = s.scanMantissa(ch)
354 return ch
358 func (s *Scanner) scanNumber(ch int) (int, int) {
359 // isDecimal(ch)
360 if ch == '0' {
361 // int or float
362 ch = s.next()
363 if ch == 'x' || ch == 'X' {
364 // hexadecimal int
365 ch = s.next()
366 for digitVal(ch) < 16 {
367 ch = s.next()
369 } else {
370 // octal int or float
371 seenDecimalDigit := false
372 for isDecimal(ch) {
373 if ch > '7' {
374 seenDecimalDigit = true
376 ch = s.next()
378 if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
379 // float
380 ch = s.scanFraction(ch)
381 ch = s.scanExponent(ch)
382 return Float, ch
384 // octal int
385 if seenDecimalDigit {
386 s.error("illegal octal number")
389 return Int, ch
391 // decimal int or float
392 ch = s.scanMantissa(ch)
393 if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
394 // float
395 ch = s.scanFraction(ch)
396 ch = s.scanExponent(ch)
397 return Float, ch
399 return Int, ch
403 func (s *Scanner) scanDigits(ch, base, n int) int {
404 for n > 0 && digitVal(ch) < base {
405 ch = s.next()
408 if n > 0 {
409 s.error("illegal char escape")
411 return ch
415 func (s *Scanner) scanEscape(quote int) int {
416 ch := s.next() // read character after '/'
417 switch ch {
418 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
419 // nothing to do
420 ch = s.next()
421 case '0', '1', '2', '3', '4', '5', '6', '7':
422 ch = s.scanDigits(ch, 8, 3)
423 case 'x':
424 ch = s.scanDigits(s.next(), 16, 2)
425 case 'u':
426 ch = s.scanDigits(s.next(), 16, 4)
427 case 'U':
428 ch = s.scanDigits(s.next(), 16, 8)
429 default:
430 s.error("illegal char escape")
432 return ch
436 func (s *Scanner) scanString(quote int) (n int) {
437 ch := s.next() // read character after quote
438 for ch != quote {
439 if ch == '\n' || ch < 0 {
440 s.error("literal not terminated")
441 return
443 if ch == '\\' {
444 ch = s.scanEscape(quote)
445 } else {
446 ch = s.next()
450 return
454 func (s *Scanner) scanRawString() {
455 ch := s.next() // read character after '`'
456 for ch != '`' {
457 if ch < 0 {
458 s.error("literal not terminated")
459 return
461 ch = s.next()
466 func (s *Scanner) scanChar() {
467 if s.scanString('\'') != 1 {
468 s.error("illegal char literal")
473 func (s *Scanner) scanLineComment() {
474 ch := s.next() // read character after "//"
475 for ch != '\n' {
476 if ch < 0 {
477 s.error("comment not terminated")
478 return
480 ch = s.next()
485 func (s *Scanner) scanGeneralComment() {
486 ch := s.next() // read character after "/*"
487 for {
488 if ch < 0 {
489 s.error("comment not terminated")
490 return
492 ch0 := ch
493 ch = s.next()
494 if ch0 == '*' && ch == '/' {
495 break
501 func (s *Scanner) scanComment(ch int) {
502 // ch == '/' || ch == '*'
503 if ch == '/' {
504 s.scanLineComment()
505 return
507 s.scanGeneralComment()
511 // Scan reads the next token or Unicode character from source and returns it.
512 // It only recognizes tokens t for which the respective Mode bit (1<<-t) is set.
513 // It returns EOF at the end of the source. It reports scanner errors (read and
514 // token errors) by calling s.Error, if set; otherwise it prints an error message
515 // to os.Stderr.
516 func (s *Scanner) Scan() int {
517 ch := s.ch
519 // reset token text position
520 s.tokPos = -1
522 redo:
523 // skip white space
524 for s.Whitespace&(1<<uint(ch)) != 0 {
525 ch = s.next()
528 // start collecting token text
529 s.tokBuf.Reset()
530 s.tokPos = s.srcPos - 1
532 // set token position
533 s.Offset = s.srcBufOffset + s.tokPos
534 s.Line = s.line
535 s.Column = s.column
537 // determine token value
538 tok := ch
539 switch {
540 case unicode.IsLetter(ch) || ch == '_':
541 if s.Mode&ScanIdents != 0 {
542 tok = Ident
543 ch = s.scanIdentifier()
544 } else {
545 ch = s.next()
547 case isDecimal(ch):
548 if s.Mode&(ScanInts|ScanFloats) != 0 {
549 tok, ch = s.scanNumber(ch)
550 } else {
551 ch = s.next()
553 default:
554 switch ch {
555 case '"':
556 if s.Mode&ScanStrings != 0 {
557 s.scanString('"')
558 tok = String
560 ch = s.next()
561 case '\'':
562 if s.Mode&ScanChars != 0 {
563 s.scanChar()
564 tok = Char
566 ch = s.next()
567 case '.':
568 ch = s.next()
569 if isDecimal(ch) && s.Mode&ScanFloats != 0 {
570 tok = Float
571 ch = s.scanMantissa(ch)
572 ch = s.scanExponent(ch)
574 case '/':
575 ch = s.next()
576 if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 {
577 if s.Mode&SkipComments != 0 {
578 s.tokPos = -1 // don't collect token text
579 s.scanComment(ch)
580 ch = s.next()
581 goto redo
583 s.scanComment(ch)
584 tok = Comment
585 ch = s.next()
587 case '`':
588 if s.Mode&ScanRawStrings != 0 {
589 s.scanRawString()
590 tok = String
592 ch = s.next()
593 default:
594 ch = s.next()
598 // end of token text
599 s.tokEnd = s.srcPos - 1
601 s.ch = ch
602 return tok
606 // Position returns the current source position. If called before Next()
607 // or Scan(), it returns the position of the next Unicode character or token
608 // returned by these functions. If called afterwards, it returns the position
609 // immediately after the last character of the most recent token or character
610 // scanned.
611 func (s *Scanner) Pos() Position {
612 return Position{
613 s.Filename,
614 s.srcBufOffset + s.srcPos - 1,
615 s.line,
616 s.column,
621 // TokenText returns the string corresponding to the most recently scanned token.
622 // Valid after calling Scan().
623 func (s *Scanner) TokenText() string {
624 if s.tokPos < 0 {
625 // no token text
626 return ""
629 if s.tokEnd < 0 {
630 // if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0)
631 s.tokEnd = s.tokPos
634 if s.tokBuf.Len() == 0 {
635 // common case: the entire token text is still in srcBuf
636 return string(s.srcBuf[s.tokPos:s.tokEnd])
639 // part of the token text was saved in tokBuf: save the rest in
640 // tokBuf as well and return its content
641 s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd])
642 s.tokPos = s.tokEnd // ensure idempotency of TokenText() call
643 return s.tokBuf.String()