1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package scanner provides a scanner and tokenizer for UTF-8-encoded text.
6 // It takes an io.Reader providing the source, which can then be tokenized
7 // through repeated calls to the Scan function. For compatibility with existing
8 // tools, the NUL character is not allowed (implementation restriction).
10 // By default, a Scanner skips white space and Go comments and recognizes all
11 // literals as defined by the Go language specification. It may be
12 // customized to recognize only a subset of those literals and to recognize
13 // different white space characters.
15 // Basic usage pattern:
17 // var s scanner.Scanner
20 // for tok != scanner.EOF {
21 // // do something with tok
// A Position represents a source position.
// A position is valid if Line > 0.
type Position struct {
	Filename string // filename, if any
	Offset   int    // byte offset, starting at 0
	Line     int    // line number, starting at 1
	Column   int    // column number, starting at 0 (character count per line)
}

// IsValid reports whether the position is valid, that is, whether Line > 0.
func (pos *Position) IsValid() bool { return pos.Line > 0 }
51 func (pos Position
) String() string {
57 s
+= fmt
.Sprintf("%d:%d", pos
.Line
, pos
.Column
)
66 // Predefined mode bits to control recognition of tokens. For instance,
67 // to configure a Scanner such that it only recognizes (Go) identifiers,
68 // integers, and skips comments, set the Scanner's Mode field to:
70 // ScanIdents | ScanInts | SkipComments
73 ScanIdents
= 1 << -Ident
75 ScanFloats
= 1 << -Float
// includes Ints
76 ScanChars
= 1 << -Char
77 ScanStrings
= 1 << -String
78 ScanRawStrings
= 1 << -RawString
79 ScanComments
= 1 << -Comment
80 SkipComments
= 1 << -skipComment
// if set with ScanComments, comments become white space
81 GoTokens
= ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
85 // The result of Scan is one of the following tokens or a Unicode character.
99 var tokenString
= map[int]string{
106 RawString
: "RawString",
111 // TokenString returns a (visible) string for a token or Unicode character.
112 func TokenString(tok
int) string {
113 if s
, found
:= tokenString
[tok
]; found
{
116 return fmt
.Sprintf("U+%04X", tok
)
// GoWhitespace is the default value for the Scanner's Whitespace field.
// Its value selects Go's white space characters: each selected character
// is represented by the bit at the position of its character code.
const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' '
// bufLen is the number of source bytes buffered by a Scanner (srcBuf holds
// bufLen bytes plus one sentinel byte); it must be at least utf8.UTFMax.
const bufLen = 1024
127 // A Scanner implements reading of Unicode characters and tokens from an io.Reader.
128 type Scanner
struct {
133 srcBuf
[bufLen
+ 1]byte // +1 for sentinel for common case of s.next()
134 srcPos
int // reading position (srcBuf index)
135 srcEnd
int // source end (srcBuf index)
138 srcBufOffset
int // byte offset of srcBuf[0] in source
139 line
int // newline count + 1
140 column
int // character count on line
143 // Typically, token text is stored completely in srcBuf, but in general
144 // the token text's head may be buffered in tokBuf while the token text's
145 // tail is stored in srcBuf.
146 tokBuf bytes
.Buffer
// token text head that is not in srcBuf anymore
147 tokPos
int // token text tail position (srcBuf index)
148 tokEnd
int // token text tail end (srcBuf index)
150 // One character look-ahead
151 ch
int // character before current srcPos
153 // Error is called for each error encountered. If no Error
154 // function is set, the error is reported to os.Stderr.
155 Error
func(s
*Scanner
, msg
string)
157 // ErrorCount is incremented by one for each error encountered.
160 // The Mode field controls which tokens are recognized. For instance,
161 // to recognize Ints, set the ScanInts bit in Mode. The field may be
162 // changed at any time.
165 // The Whitespace field controls which characters are recognized
166 // as white space. To recognize a character ch <= ' ' as white space,
167 // set the ch'th bit in Whitespace (the Scanner's behavior is undefined
168 // for values ch > ' '). The field may be changed at any time.
171 // Current token position. The Offset, Line, and Column fields
172 // are set by Scan(); the Filename field is left untouched by the
178 // Init initializes a Scanner with a new source and returns itself.
179 // Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens,
180 // and Whitespace is set to GoWhitespace.
181 func (s
*Scanner
) Init(src io
.Reader
) *Scanner
{
184 // initialize source buffer
185 s
.srcBuf
[0] = utf8
.RuneSelf
// sentinel
189 // initialize source position
194 // initialize token text buffer
197 // initialize one character look-ahead
200 // initialize public fields
204 s
.Whitespace
= GoWhitespace
210 // next reads and returns the next Unicode character. It is designed such
211 // that only a minimal amount of work needs to be done in the common ASCII
212 // case (one test to check for both ASCII and end-of-buffer, and one test
213 // to check for newlines).
214 func (s
*Scanner
) next() int {
215 ch
:= int(s
.srcBuf
[s
.srcPos
])
217 if ch
>= utf8
.RuneSelf
{
218 // uncommon case: not ASCII or not enough bytes
219 for s
.srcPos
+utf8
.UTFMax
> s
.srcEnd
&& !utf8
.FullRune(s
.srcBuf
[s
.srcPos
:s
.srcEnd
]) {
220 // not enough bytes: read some more, but first
221 // save away token text if any
223 s
.tokBuf
.Write(s
.srcBuf
[s
.tokPos
:s
.srcPos
])
226 // move unread bytes to beginning of buffer
227 copy(s
.srcBuf
[0:], s
.srcBuf
[s
.srcPos
:s
.srcEnd
])
228 s
.srcBufOffset
+= s
.srcPos
230 i
:= s
.srcEnd
- s
.srcPos
231 n
, err
:= s
.src
.Read(s
.srcBuf
[i
:bufLen
])
234 s
.srcBuf
[s
.srcEnd
] = utf8
.RuneSelf
// sentinel
240 s
.error(err
.String())
246 ch
= int(s
.srcBuf
[s
.srcPos
])
247 if ch
>= utf8
.RuneSelf
{
248 // uncommon case: not ASCII
250 ch
, width
= utf8
.DecodeRune(s
.srcBuf
[s
.srcPos
:s
.srcEnd
])
251 if ch
== utf8
.RuneError
&& width
== 1 {
252 s
.error("illegal UTF-8 encoding")
254 s
.srcPos
+= width
- 1
262 // implementation restriction for compatibility with other tools
263 s
.error("illegal character NUL")
273 // Next reads and returns the next Unicode character.
274 // It returns EOF at the end of the source. It reports
275 // a read error by calling s.Error, if set, or else
276 // prints an error message to os.Stderr. Next does not
277 // update the Scanner's Position field; use Pos() to
278 // get the current position.
279 func (s
*Scanner
) Next() int {
280 s
.tokPos
= -1 // don't collect token text
287 // Peek returns the next Unicode character in the source without advancing
288 // the scanner. It returns EOF if the scanner's position is at the last
289 // character of the source.
290 func (s
*Scanner
) Peek() int {
295 func (s
*Scanner
) error(msg
string) {
301 fmt
.Fprintf(os
.Stderr
, "%s: %s", s
.Position
, msg
)
305 func (s
*Scanner
) scanIdentifier() int {
306 ch
:= s
.next() // read character after first '_' or letter
307 for ch
== '_' || unicode
.IsLetter(ch
) || unicode
.IsDigit(ch
) {
314 func digitVal(ch
int) int {
316 case '0' <= ch
&& ch
<= '9':
318 case 'a' <= ch
&& ch
<= 'f':
320 case 'A' <= ch
&& ch
<= 'F':
323 return 16 // larger than any legal digit val
// isDecimal reports whether ch is an ASCII decimal digit ('0' through '9').
func isDecimal(ch int) bool { return '0' <= ch && ch <= '9' }
330 func (s
*Scanner
) scanMantissa(ch
int) int {
338 func (s
*Scanner
) scanFraction(ch
int) int {
340 ch
= s
.scanMantissa(s
.next())
346 func (s
*Scanner
) scanExponent(ch
int) int {
347 if ch
== 'e' || ch
== 'E' {
349 if ch
== '-' || ch
== '+' {
352 ch
= s
.scanMantissa(ch
)
358 func (s
*Scanner
) scanNumber(ch
int) (int, int) {
363 if ch
== 'x' || ch
== 'X' {
366 for digitVal(ch
) < 16 {
370 // octal int or float
371 seenDecimalDigit
:= false
374 seenDecimalDigit
= true
378 if s
.Mode
&ScanFloats
!= 0 && (ch
== '.' || ch
== 'e' || ch
== 'E') {
380 ch
= s
.scanFraction(ch
)
381 ch
= s
.scanExponent(ch
)
385 if seenDecimalDigit
{
386 s
.error("illegal octal number")
391 // decimal int or float
392 ch
= s
.scanMantissa(ch
)
393 if s
.Mode
&ScanFloats
!= 0 && (ch
== '.' || ch
== 'e' || ch
== 'E') {
395 ch
= s
.scanFraction(ch
)
396 ch
= s
.scanExponent(ch
)
403 func (s
*Scanner
) scanDigits(ch
, base
, n
int) int {
404 for n
> 0 && digitVal(ch
) < base
{
409 s
.error("illegal char escape")
415 func (s
*Scanner
) scanEscape(quote
int) int {
416 ch
:= s
.next() // read character after '\\'
418 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote
:
421 case '0', '1', '2', '3', '4', '5', '6', '7':
422 ch
= s
.scanDigits(ch
, 8, 3)
424 ch
= s
.scanDigits(s
.next(), 16, 2)
426 ch
= s
.scanDigits(s
.next(), 16, 4)
428 ch
= s
.scanDigits(s
.next(), 16, 8)
430 s
.error("illegal char escape")
436 func (s
*Scanner
) scanString(quote
int) (n
int) {
437 ch
:= s
.next() // read character after quote
439 if ch
== '\n' || ch
< 0 {
440 s
.error("literal not terminated")
444 ch
= s
.scanEscape(quote
)
454 func (s
*Scanner
) scanRawString() {
455 ch
:= s
.next() // read character after '`'
458 s
.error("literal not terminated")
466 func (s
*Scanner
) scanChar() {
467 if s
.scanString('\'') != 1 {
468 s
.error("illegal char literal")
473 func (s
*Scanner
) scanLineComment() {
474 ch
:= s
.next() // read character after "//"
477 s
.error("comment not terminated")
485 func (s
*Scanner
) scanGeneralComment() {
486 ch
:= s
.next() // read character after "/*"
489 s
.error("comment not terminated")
494 if ch0
== '*' && ch
== '/' {
501 func (s
*Scanner
) scanComment(ch
int) {
502 // ch == '/' || ch == '*'
507 s
.scanGeneralComment()
511 // Scan reads the next token or Unicode character from source and returns it.
512 // It only recognizes tokens t for which the respective Mode bit (1<<-t) is set.
513 // It returns EOF at the end of the source. It reports scanner errors (read and
514 // token errors) by calling s.Error, if set; otherwise it prints an error message
516 func (s
*Scanner
) Scan() int {
519 // reset token text position
524 for s
.Whitespace
&(1<<uint(ch
)) != 0 {
528 // start collecting token text
530 s
.tokPos
= s
.srcPos
- 1
532 // set token position
533 s
.Offset
= s
.srcBufOffset
+ s
.tokPos
537 // determine token value
540 case unicode
.IsLetter(ch
) || ch
== '_':
541 if s
.Mode
&ScanIdents
!= 0 {
543 ch
= s
.scanIdentifier()
548 if s
.Mode
&(ScanInts|ScanFloats
) != 0 {
549 tok
, ch
= s
.scanNumber(ch
)
556 if s
.Mode
&ScanStrings
!= 0 {
562 if s
.Mode
&ScanChars
!= 0 {
569 if isDecimal(ch
) && s
.Mode
&ScanFloats
!= 0 {
571 ch
= s
.scanMantissa(ch
)
572 ch
= s
.scanExponent(ch
)
576 if (ch
== '/' || ch
== '*') && s
.Mode
&ScanComments
!= 0 {
577 if s
.Mode
&SkipComments
!= 0 {
578 s
.tokPos
= -1 // don't collect token text
588 if s
.Mode
&ScanRawStrings
!= 0 {
599 s
.tokEnd
= s
.srcPos
- 1
606 // Pos returns the current source position. If called before Next()
607 // or Scan(), it returns the position of the next Unicode character or token
608 // returned by these functions. If called afterwards, it returns the position
609 // immediately after the last character of the most recent token or character
611 func (s
*Scanner
) Pos() Position
{
614 s
.srcBufOffset
+ s
.srcPos
- 1,
621 // TokenText returns the string corresponding to the most recently scanned token.
622 // Valid after calling Scan().
623 func (s
*Scanner
) TokenText() string {
630 // if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0)
634 if s
.tokBuf
.Len() == 0 {
635 // common case: the entire token text is still in srcBuf
636 return string(s
.srcBuf
[s
.tokPos
:s
.tokEnd
])
639 // part of the token text was saved in tokBuf: save the rest in
640 // tokBuf as well and return its content
641 s
.tokBuf
.Write(s
.srcBuf
[s
.tokPos
:s
.tokEnd
])
642 s
.tokPos
= s
.tokEnd
// ensure idempotency of TokenText() call
643 return s
.tokBuf
.String()