libgo/go/text/template/parse/lex.go

   1 // Copyright 2011 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package parse
   6
   7 import (
   8         "fmt"
   9         "strings"
  10         "unicode"
  11         "unicode/utf8"
  12 )
  13
  14 // item represents a token or text string returned from the scanner.
  15 type item struct {
  16         typ  itemType // The type of this item.
  17         pos  Pos      // The starting position, in bytes, of this item in the input string.
  18         val  string   // The value of this item.
  19         line int      // The line number at the start of this item.
  20 }
  21
  22 func (i item) String() string {
  23         switch {
  24         case i.typ == itemEOF:
  25                 return "EOF"
  26         case i.typ == itemError:
  27                 return i.val
  28         case i.typ > itemKeyword:
  29                 return fmt.Sprintf("<%s>", i.val)
  30         case len(i.val) > 10:
  31                 return fmt.Sprintf("%.10q...", i.val)
  32         }
  33         return fmt.Sprintf("%q", i.val)
  34 }
  35
  36 // itemType identifies the type of lex items.
  37 type itemType int
  38
  39 const (
  40         itemError        itemType = iota // error occurred; value is text of error
  41         itemBool                         // boolean constant
  42         itemChar                         // printable ASCII character; grab bag for comma etc.
  43         itemCharConstant                 // character constant
  44         itemComplex                      // complex constant (1+2i); imaginary is just a number
  45         itemColonEquals                  // colon-equals (':=') introducing a declaration
  46         itemEOF
  47         itemField      // alphanumeric identifier starting with '.'
  48         itemIdentifier // alphanumeric identifier not starting with '.'
  49         itemLeftDelim  // left action delimiter
  50         itemLeftParen  // '(' inside action
  51         itemNumber     // simple number, including imaginary
  52         itemPipe       // pipe symbol
  53         itemRawString  // raw quoted string (includes quotes)
  54         itemRightDelim // right action delimiter
  55         itemRightParen // ')' inside action
  56         itemSpace      // run of spaces separating arguments
  57         itemString     // quoted string (includes quotes)
  58         itemText       // plain text
  59         itemVariable   // variable starting with '$', such as '$' or  '$1' or '$hello'
  60         // Keywords appear after all the rest.
  61         itemKeyword  // used only to delimit the keywords
  62         itemBlock    // block keyword
  63         itemDot      // the cursor, spelled '.'
  64         itemDefine   // define keyword
  65         itemElse     // else keyword
  66         itemEnd      // end keyword
  67         itemIf       // if keyword
  68         itemNil      // the untyped nil constant, easiest to treat as a keyword
  69         itemRange    // range keyword
  70         itemTemplate // template keyword
  71         itemWith     // with keyword
  72 )
  73
  74 var key = map[string]itemType{
  75         ".":        itemDot,
  76         "block":    itemBlock,
  77         "define":   itemDefine,
  78         "else":     itemElse,
  79         "end":      itemEnd,
  80         "if":       itemIf,
  81         "range":    itemRange,
  82         "nil":      itemNil,
  83         "template": itemTemplate,
  84         "with":     itemWith,
  85 }
  86
  87 const eof = -1
  88
  89 // Trimming spaces.
  90 // If the action begins "{{- " rather than "{{", then all space/tab/newlines
  91 // preceding the action are trimmed; conversely if it ends " -}}" the
  92 // leading spaces are trimmed. This is done entirely in the lexer; the
  93 // parser never sees it happen. We require an ASCII space to be
  94 // present to avoid ambiguity with things like "{{-3}}". It reads
  95 // better with the space present anyway. For simplicity, only ASCII
  96 // space does the job.
  97 const (
  98         spaceChars      = " \t\r\n" // These are the space characters defined by Go itself.
  99         leftTrimMarker  = "- "      // Attached to left delimiter, trims trailing spaces from preceding text.
 100         rightTrimMarker = " -"      // Attached to right delimiter, trims leading spaces from following text.
 101         trimMarkerLen   = Pos(len(leftTrimMarker))
 102 )
 103
 104 // stateFn represents the state of the scanner as a function that returns the next state.
 105 type stateFn func(*lexer) stateFn
 106
 107 // lexer holds the state of the scanner.
 108 type lexer struct {
 109         name       string    // the name of the input; used only for error reports
 110         input      string    // the string being scanned
 111         leftDelim  string    // start of action
 112         rightDelim string    // end of action
 113         pos        Pos       // current position in the input
 114         start      Pos       // start position of this item
 115         width      Pos       // width of last rune read from input
 116         items      chan item // channel of scanned items
 117         parenDepth int       // nesting depth of ( ) exprs
 118         line       int       // 1+number of newlines seen
 119 }
 120
 121 // next returns the next rune in the input.
 122 func (l *lexer) next() rune {
 123         if int(l.pos) >= len(l.input) {
 124                 l.width = 0
 125                 return eof
 126         }
 127         r, w := utf8.DecodeRuneInString(l.input[l.pos:])
 128         l.width = Pos(w)
 129         l.pos += l.width
 130         if r == '\n' {
 131                 l.line++
 132         }
 133         return r
 134 }
 135
 136 // peek returns but does not consume the next rune in the input.
 137 func (l *lexer) peek() rune {
 138         r := l.next()
 139         l.backup()
 140         return r
 141 }
 142
 143 // backup steps back one rune. Can only be called once per call of next.
 144 func (l *lexer) backup() {
 145         l.pos -= l.width
 146         // Correct newline count.
 147         if l.width == 1 && l.input[l.pos] == '\n' {
 148                 l.line--
 149         }
 150 }
 151
 152 // emit passes an item back to the client.
 153 func (l *lexer) emit(t itemType) {
 154         l.items <- item{t, l.start, l.input[l.start:l.pos], l.line}
 155         // Some items contain text internally. If so, count their newlines.
 156         switch t {
 157         case itemText, itemRawString, itemLeftDelim, itemRightDelim:
 158                 l.line += strings.Count(l.input[l.start:l.pos], "\n")
 159         }
 160         l.start = l.pos
 161 }
 162
 163 // ignore skips over the pending input before this point.
 164 func (l *lexer) ignore() {
 165         l.line += strings.Count(l.input[l.start:l.pos], "\n")
 166         l.start = l.pos
 167 }
 168
 169 // accept consumes the next rune if it's from the valid set.
 170 func (l *lexer) accept(valid string) bool {
 171         if strings.ContainsRune(valid, l.next()) {
 172                 return true
 173         }
 174         l.backup()
 175         return false
 176 }
 177
 178 // acceptRun consumes a run of runes from the valid set.
 179 func (l *lexer) acceptRun(valid string) {
 180         for strings.ContainsRune(valid, l.next()) {
 181         }
 182         l.backup()
 183 }
 184
 185 // errorf returns an error token and terminates the scan by passing
 186 // back a nil pointer that will be the next state, terminating l.nextItem.
 187 func (l *lexer) errorf(format string, args ...interface{}) stateFn {
 188         l.items <- item{itemError, l.start, fmt.Sprintf(format, args...), l.line}
 189         return nil
 190 }
 191
 192 // nextItem returns the next item from the input.
 193 // Called by the parser, not in the lexing goroutine.
 194 func (l *lexer) nextItem() item {
 195         return <-l.items
 196 }
 197
 198 // drain drains the output so the lexing goroutine will exit.
 199 // Called by the parser, not in the lexing goroutine.
 200 func (l *lexer) drain() {
 201         for range l.items {
 202         }
 203 }
 204
 205 // lex creates a new scanner for the input string.
 206 func lex(name, input, left, right string) *lexer {
 207         if left == "" {
 208                 left = leftDelim
 209         }
 210         if right == "" {
 211                 right = rightDelim
 212         }
 213         l := &lexer{
 214                 name:       name,
 215                 input:      input,
 216                 leftDelim:  left,
 217                 rightDelim: right,
 218                 items:      make(chan item),
 219                 line:       1,
 220         }
 221         go l.run()
 222         return l
 223 }
 224
 225 // run runs the state machine for the lexer.
 226 func (l *lexer) run() {
 227         for state := lexText; state != nil; {
 228                 state = state(l)
 229         }
 230         close(l.items)
 231 }
 232
 233 // state functions
 234
 235 const (
 236         leftDelim    = "{{"
 237         rightDelim   = "}}"
 238         leftComment  = "/*"
 239         rightComment = "*/"
 240 )
 241
 242 // lexText scans until an opening action delimiter, "{{".
 243 func lexText(l *lexer) stateFn {
 244         l.width = 0
 245         if x := strings.Index(l.input[l.pos:], l.leftDelim); x >= 0 {
 246                 ldn := Pos(len(l.leftDelim))
 247                 l.pos += Pos(x)
 248                 trimLength := Pos(0)
 249                 if strings.HasPrefix(l.input[l.pos+ldn:], leftTrimMarker) {
 250                         trimLength = rightTrimLength(l.input[l.start:l.pos])
 251                 }
 252                 l.pos -= trimLength
 253                 if l.pos > l.start {
 254                         l.emit(itemText)
 255                 }
 256                 l.pos += trimLength
 257                 l.ignore()
 258                 return lexLeftDelim
 259         } else {
 260                 l.pos = Pos(len(l.input))
 261         }
 262         // Correctly reached EOF.
 263         if l.pos > l.start {
 264                 l.emit(itemText)
 265         }
 266         l.emit(itemEOF)
 267         return nil
 268 }
 269
 270 // rightTrimLength returns the length of the spaces at the end of the string.
 271 func rightTrimLength(s string) Pos {
 272         return Pos(len(s) - len(strings.TrimRight(s, spaceChars)))
 273 }
 274
 275 // atRightDelim reports whether the lexer is at a right delimiter, possibly preceded by a trim marker.
 276 func (l *lexer) atRightDelim() (delim, trimSpaces bool) {
 277         if strings.HasPrefix(l.input[l.pos:], l.rightDelim) {
 278                 return true, false
 279         }
 280         // The right delim might have the marker before.
 281         if strings.HasPrefix(l.input[l.pos:], rightTrimMarker) &&
 282                 strings.HasPrefix(l.input[l.pos+trimMarkerLen:], l.rightDelim) {
 283                 return true, true
 284         }
 285         return false, false
 286 }
 287
 288 // leftTrimLength returns the length of the spaces at the beginning of the string.
 289 func leftTrimLength(s string) Pos {
 290         return Pos(len(s) - len(strings.TrimLeft(s, spaceChars)))
 291 }
 292
 293 // lexLeftDelim scans the left delimiter, which is known to be present, possibly with a trim marker.
 294 func lexLeftDelim(l *lexer) stateFn {
 295         l.pos += Pos(len(l.leftDelim))
 296         trimSpace := strings.HasPrefix(l.input[l.pos:], leftTrimMarker)
 297         afterMarker := Pos(0)
 298         if trimSpace {
 299                 afterMarker = trimMarkerLen
 300         }
 301         if strings.HasPrefix(l.input[l.pos+afterMarker:], leftComment) {
 302                 l.pos += afterMarker
 303                 l.ignore()
 304                 return lexComment
 305         }
 306         l.emit(itemLeftDelim)
 307         l.pos += afterMarker
 308         l.ignore()
 309         l.parenDepth = 0
 310         return lexInsideAction
 311 }
 312
 313 // lexComment scans a comment. The left comment marker is known to be present.
 314 func lexComment(l *lexer) stateFn {
 315         l.pos += Pos(len(leftComment))
 316         i := strings.Index(l.input[l.pos:], rightComment)
 317         if i < 0 {
 318                 return l.errorf("unclosed comment")
 319         }
 320         l.pos += Pos(i + len(rightComment))
 321         delim, trimSpace := l.atRightDelim()
 322         if !delim {
 323                 return l.errorf("comment ends before closing delimiter")
 324         }
 325         if trimSpace {
 326                 l.pos += trimMarkerLen
 327         }
 328         l.pos += Pos(len(l.rightDelim))
 329         if trimSpace {
 330                 l.pos += leftTrimLength(l.input[l.pos:])
 331         }
 332         l.ignore()
 333         return lexText
 334 }
 335
 336 // lexRightDelim scans the right delimiter, which is known to be present, possibly with a trim marker.
 337 func lexRightDelim(l *lexer) stateFn {
 338         trimSpace := strings.HasPrefix(l.input[l.pos:], rightTrimMarker)
 339         if trimSpace {
 340                 l.pos += trimMarkerLen
 341                 l.ignore()
 342         }
 343         l.pos += Pos(len(l.rightDelim))
 344         l.emit(itemRightDelim)
 345         if trimSpace {
 346                 l.pos += leftTrimLength(l.input[l.pos:])
 347                 l.ignore()
 348         }
 349         return lexText
 350 }
 351
 352 // lexInsideAction scans the elements inside action delimiters.
 353 func lexInsideAction(l *lexer) stateFn {
 354         // Either number, quoted string, or identifier.
 355         // Spaces separate arguments; runs of spaces turn into itemSpace.
 356         // Pipe symbols separate and are emitted.
 357         delim, _ := l.atRightDelim()
 358         if delim {
 359                 if l.parenDepth == 0 {
 360                         return lexRightDelim
 361                 }
 362                 return l.errorf("unclosed left paren")
 363         }
 364         switch r := l.next(); {
 365         case r == eof || isEndOfLine(r):
 366                 return l.errorf("unclosed action")
 367         case isSpace(r):
 368                 return lexSpace
 369         case r == ':':
 370                 if l.next() != '=' {
 371                         return l.errorf("expected :=")
 372                 }
 373                 l.emit(itemColonEquals)
 374         case r == '|':
 375                 l.emit(itemPipe)
 376         case r == '"':
 377                 return lexQuote
 378         case r == '`':
 379                 return lexRawQuote
 380         case r == '$':
 381                 return lexVariable
 382         case r == '\'':
 383                 return lexChar
 384         case r == '.':
 385                 // special look-ahead for ".field" so we don't break l.backup().
 386                 if l.pos < Pos(len(l.input)) {
 387                         r := l.input[l.pos]
 388                         if r < '0' || '9' < r {
 389                                 return lexField
 390                         }
 391                 }
 392                 fallthrough // '.' can start a number.
 393         case r == '+' || r == '-' || ('0' <= r && r <= '9'):
 394                 l.backup()
 395                 return lexNumber
 396         case isAlphaNumeric(r):
 397                 l.backup()
 398                 return lexIdentifier
 399         case r == '(':
 400                 l.emit(itemLeftParen)
 401                 l.parenDepth++
 402         case r == ')':
 403                 l.emit(itemRightParen)
 404                 l.parenDepth--
 405                 if l.parenDepth < 0 {
 406                         return l.errorf("unexpected right paren %#U", r)
 407                 }
 408         case r <= unicode.MaxASCII && unicode.IsPrint(r):
 409                 l.emit(itemChar)
 410                 return lexInsideAction
 411         default:
 412                 return l.errorf("unrecognized character in action: %#U", r)
 413         }
 414         return lexInsideAction
 415 }
 416
 417 // lexSpace scans a run of space characters.
 418 // One space has already been seen.
 419 func lexSpace(l *lexer) stateFn {
 420         for isSpace(l.peek()) {
 421                 l.next()
 422         }
 423         l.emit(itemSpace)
 424         return lexInsideAction
 425 }
 426
 427 // lexIdentifier scans an alphanumeric.
 428 func lexIdentifier(l *lexer) stateFn {
 429 Loop:
 430         for {
 431                 switch r := l.next(); {
 432                 case isAlphaNumeric(r):
 433                         // absorb.
 434                 default:
 435                         l.backup()
 436                         word := l.input[l.start:l.pos]
 437                         if !l.atTerminator() {
 438                                 return l.errorf("bad character %#U", r)
 439                         }
 440                         switch {
 441                         case key[word] > itemKeyword:
 442                                 l.emit(key[word])
 443                         case word[0] == '.':
 444                                 l.emit(itemField)
 445                         case word == "true", word == "false":
 446                                 l.emit(itemBool)
 447                         default:
 448                                 l.emit(itemIdentifier)
 449                         }
 450                         break Loop
 451                 }
 452         }
 453         return lexInsideAction
 454 }
 455
 456 // lexField scans a field: .Alphanumeric.
 457 // The . has been scanned.
 458 func lexField(l *lexer) stateFn {
 459         return lexFieldOrVariable(l, itemField)
 460 }
 461
 462 // lexVariable scans a Variable: $Alphanumeric.
 463 // The $ has been scanned.
 464 func lexVariable(l *lexer) stateFn {
 465         if l.atTerminator() { // Nothing interesting follows -> "$".
 466                 l.emit(itemVariable)
 467                 return lexInsideAction
 468         }
 469         return lexFieldOrVariable(l, itemVariable)
 470 }
 471
 472 // lexVariable scans a field or variable: [.$]Alphanumeric.
 473 // The . or $ has been scanned.
 474 func lexFieldOrVariable(l *lexer, typ itemType) stateFn {
 475         if l.atTerminator() { // Nothing interesting follows -> "." or "$".
 476                 if typ == itemVariable {
 477                         l.emit(itemVariable)
 478                 } else {
 479                         l.emit(itemDot)
 480                 }
 481                 return lexInsideAction
 482         }
 483         var r rune
 484         for {
 485                 r = l.next()
 486                 if !isAlphaNumeric(r) {
 487                         l.backup()
 488                         break
 489                 }
 490         }
 491         if !l.atTerminator() {
 492                 return l.errorf("bad character %#U", r)
 493         }
 494         l.emit(typ)
 495         return lexInsideAction
 496 }
 497
 498 // atTerminator reports whether the input is at valid termination character to
 499 // appear after an identifier. Breaks .X.Y into two pieces. Also catches cases
 500 // like "$x+2" not being acceptable without a space, in case we decide one
 501 // day to implement arithmetic.
 502 func (l *lexer) atTerminator() bool {
 503         r := l.peek()
 504         if isSpace(r) || isEndOfLine(r) {
 505                 return true
 506         }
 507         switch r {
 508         case eof, '.', ',', '|', ':', ')', '(':
 509                 return true
 510         }
 511         // Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will
 512         // succeed but should fail) but only in extremely rare cases caused by willfully
 513         // bad choice of delimiter.
 514         if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r {
 515                 return true
 516         }
 517         return false
 518 }
 519
 520 // lexChar scans a character constant. The initial quote is already
 521 // scanned. Syntax checking is done by the parser.
 522 func lexChar(l *lexer) stateFn {
 523 Loop:
 524         for {
 525                 switch l.next() {
 526                 case '\\':
 527                         if r := l.next(); r != eof && r != '\n' {
 528                                 break
 529                         }
 530                         fallthrough
 531                 case eof, '\n':
 532                         return l.errorf("unterminated character constant")
 533                 case '\'':
 534                         break Loop
 535                 }
 536         }
 537         l.emit(itemCharConstant)
 538         return lexInsideAction
 539 }
 540
 541 // lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
 542 // isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
 543 // and "089" - but when it's wrong the input is invalid and the parser (via
 544 // strconv) will notice.
 545 func lexNumber(l *lexer) stateFn {
 546         if !l.scanNumber() {
 547                 return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
 548         }
 549         if sign := l.peek(); sign == '+' || sign == '-' {
 550                 // Complex: 1+2i. No spaces, must end in 'i'.
 551                 if !l.scanNumber() || l.input[l.pos-1] != 'i' {
 552                         return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
 553                 }
 554                 l.emit(itemComplex)
 555         } else {
 556                 l.emit(itemNumber)
 557         }
 558         return lexInsideAction
 559 }
 560
 561 func (l *lexer) scanNumber() bool {
 562         // Optional leading sign.
 563         l.accept("+-")
 564         // Is it hex?
 565         digits := "0123456789"
 566         if l.accept("0") && l.accept("xX") {
 567                 digits = "0123456789abcdefABCDEF"
 568         }
 569         l.acceptRun(digits)
 570         if l.accept(".") {
 571                 l.acceptRun(digits)
 572         }
 573         if l.accept("eE") {
 574                 l.accept("+-")
 575                 l.acceptRun("0123456789")
 576         }
 577         // Is it imaginary?
 578         l.accept("i")
 579         // Next thing mustn't be alphanumeric.
 580         if isAlphaNumeric(l.peek()) {
 581                 l.next()
 582                 return false
 583         }
 584         return true
 585 }
 586
 587 // lexQuote scans a quoted string.
 588 func lexQuote(l *lexer) stateFn {
 589 Loop:
 590         for {
 591                 switch l.next() {
 592                 case '\\':
 593                         if r := l.next(); r != eof && r != '\n' {
 594                                 break
 595                         }
 596                         fallthrough
 597                 case eof, '\n':
 598                         return l.errorf("unterminated quoted string")
 599                 case '"':
 600                         break Loop
 601                 }
 602         }
 603         l.emit(itemString)
 604         return lexInsideAction
 605 }
 606
 607 // lexRawQuote scans a raw quoted string.
 608 func lexRawQuote(l *lexer) stateFn {
 609         startLine := l.line
 610 Loop:
 611         for {
 612                 switch l.next() {
 613                 case eof:
 614                         // Restore line number to location of opening quote.
 615                         // We will error out so it's ok just to overwrite the field.
 616                         l.line = startLine
 617                         return l.errorf("unterminated raw quoted string")
 618                 case '`':
 619                         break Loop
 620                 }
 621         }
 622         l.emit(itemRawString)
 623         return lexInsideAction
 624 }
 625
 626 // isSpace reports whether r is a space character.
 627 func isSpace(r rune) bool {
 628         return r == ' ' || r == '\t'
 629 }
 630
 631 // isEndOfLine reports whether r is an end-of-line character.
 632 func isEndOfLine(r rune) bool {
 633         return r == '\r' || r == '\n'
 634 }
 635
 636 // isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
 637 func isAlphaNumeric(r rune) bool {
 638         return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
 639 }