libgo/go/encoding/json/scanner.go

   1 // Copyright 2010 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package json
   6
   7 // JSON value parser state machine.
   8 // Just about at the limit of what is reasonable to write by hand.
   9 // Some parts are a bit tedious, but overall it nicely factors out the
  10 // otherwise common code from the multiple scanning functions
  11 // in this package (Compact, Indent, checkValid, nextValue, etc).
  12 //
  13 // This file starts with two simple examples using the scanner
  14 // before diving into the scanner itself.
  15
  16 import "strconv"
  17
  18 // Valid reports whether data is a valid JSON encoding.
  19 func Valid(data []byte) bool {
  20         return checkValid(data, &scanner{}) == nil
  21 }
  22
  23 // checkValid verifies that data is valid JSON-encoded data.
  24 // scan is passed in for use by checkValid to avoid an allocation.
  25 func checkValid(data []byte, scan *scanner) error {
  26         scan.reset()
  27         for _, c := range data {
  28                 scan.bytes++
  29                 if scan.step(scan, c) == scanError {
  30                         return scan.err
  31                 }
  32         }
  33         if scan.eof() == scanError {
  34                 return scan.err
  35         }
  36         return nil
  37 }
  38
  39 // nextValue splits data after the next whole JSON value,
  40 // returning that value and the bytes that follow it as separate slices.
  41 // scan is passed in for use by nextValue to avoid an allocation.
  42 func nextValue(data []byte, scan *scanner) (value, rest []byte, err error) {
  43         scan.reset()
  44         for i, c := range data {
  45                 v := scan.step(scan, c)
  46                 if v >= scanEndObject {
  47                         switch v {
  48                         // probe the scanner with a space to determine whether we will
  49                         // get scanEnd on the next character. Otherwise, if the next character
  50                         // is not a space, scanEndTop allocates a needless error.
  51                         case scanEndObject, scanEndArray:
  52                                 if scan.step(scan, ' ') == scanEnd {
  53                                         return data[:i+1], data[i+1:], nil
  54                                 }
  55                         case scanError:
  56                                 return nil, nil, scan.err
  57                         case scanEnd:
  58                                 return data[:i], data[i:], nil
  59                         }
  60                 }
  61         }
  62         if scan.eof() == scanError {
  63                 return nil, nil, scan.err
  64         }
  65         return data, nil, nil
  66 }
  67
  68 // A SyntaxError is a description of a JSON syntax error.
  69 type SyntaxError struct {
  70         msg    string // description of error
  71         Offset int64  // error occurred after reading Offset bytes
  72 }
  73
  74 func (e *SyntaxError) Error() string { return e.msg }
  75
  76 // A scanner is a JSON scanning state machine.
  77 // Callers call scan.reset() and then pass bytes in one at a time
  78 // by calling scan.step(&scan, c) for each byte.
  79 // The return value, referred to as an opcode, tells the
  80 // caller about significant parsing events like beginning
  81 // and ending literals, objects, and arrays, so that the
  82 // caller can follow along if it wishes.
  83 // The return value scanEnd indicates that a single top-level
  84 // JSON value has been completed, *before* the byte that
  85 // just got passed in.  (The indication must be delayed in order
  86 // to recognize the end of numbers: is 123 a whole value or
  87 // the beginning of 12345e+6?).
  88 type scanner struct {
  89         // The step is a func to be called to execute the next transition.
  90         // Also tried using an integer constant and a single func
  91         // with a switch, but using the func directly was 10% faster
  92         // on a 64-bit Mac Mini, and it's nicer to read.
  93         step func(*scanner, byte) int
  94
  95         // Reached end of top-level value.
  96         endTop bool
  97
  98         // Stack of what we're in the middle of - array values, object keys, object values.
  99         parseState []int
 100
 101         // Error that happened, if any.
 102         err error
 103
 104         // 1-byte redo (see undo method)
 105         redo      bool
 106         redoCode  int
 107         redoState func(*scanner, byte) int
 108
 109         // total bytes consumed, updated by decoder.Decode
 110         bytes int64
 111 }
 112
 113 // These values are returned by the state transition functions
 114 // assigned to scanner.state and the method scanner.eof.
 115 // They give details about the current state of the scan that
 116 // callers might be interested to know about.
 117 // It is okay to ignore the return value of any particular
 118 // call to scanner.state: if one call returns scanError,
 119 // every subsequent call will return scanError too.
 120 const (
 121         // Continue.
 122         scanContinue     = iota // uninteresting byte
 123         scanBeginLiteral        // end implied by next result != scanContinue
 124         scanBeginObject         // begin object
 125         scanObjectKey           // just finished object key (string)
 126         scanObjectValue         // just finished non-last object value
 127         scanEndObject           // end object (implies scanObjectValue if possible)
 128         scanBeginArray          // begin array
 129         scanArrayValue          // just finished array value
 130         scanEndArray            // end array (implies scanArrayValue if possible)
 131         scanSkipSpace           // space byte; can skip; known to be last "continue" result
 132
 133         // Stop.
 134         scanEnd   // top-level value ended *before* this byte; known to be first "stop" result
 135         scanError // hit an error, scanner.err.
 136 )
 137
 138 // These values are stored in the parseState stack.
 139 // They give the current state of a composite value
 140 // being scanned. If the parser is inside a nested value
 141 // the parseState describes the nested state, outermost at entry 0.
 142 const (
 143         parseObjectKey   = iota // parsing object key (before colon)
 144         parseObjectValue        // parsing object value (after colon)
 145         parseArrayValue         // parsing array value
 146 )
 147
 148 // reset prepares the scanner for use.
 149 // It must be called before calling s.step.
 150 func (s *scanner) reset() {
 151         s.step = stateBeginValue
 152         s.parseState = s.parseState[0:0]
 153         s.err = nil
 154         s.redo = false
 155         s.endTop = false
 156 }
 157
 158 // eof tells the scanner that the end of input has been reached.
 159 // It returns a scan status just as s.step does.
 160 func (s *scanner) eof() int {
 161         if s.err != nil {
 162                 return scanError
 163         }
 164         if s.endTop {
 165                 return scanEnd
 166         }
 167         s.step(s, ' ')
 168         if s.endTop {
 169                 return scanEnd
 170         }
 171         if s.err == nil {
 172                 s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
 173         }
 174         return scanError
 175 }
 176
 177 // pushParseState pushes a new parse state p onto the parse stack.
 178 func (s *scanner) pushParseState(p int) {
 179         s.parseState = append(s.parseState, p)
 180 }
 181
 182 // popParseState pops a parse state (already obtained) off the stack
 183 // and updates s.step accordingly.
 184 func (s *scanner) popParseState() {
 185         n := len(s.parseState) - 1
 186         s.parseState = s.parseState[0:n]
 187         s.redo = false
 188         if n == 0 {
 189                 s.step = stateEndTop
 190                 s.endTop = true
 191         } else {
 192                 s.step = stateEndValue
 193         }
 194 }
 195
 196 func isSpace(c byte) bool {
 197         return c == ' ' || c == '\t' || c == '\r' || c == '\n'
 198 }
 199
 200 // stateBeginValueOrEmpty is the state after reading `[`.
 201 func stateBeginValueOrEmpty(s *scanner, c byte) int {
 202         if c <= ' ' && isSpace(c) {
 203                 return scanSkipSpace
 204         }
 205         if c == ']' {
 206                 return stateEndValue(s, c)
 207         }
 208         return stateBeginValue(s, c)
 209 }
 210
 211 // stateBeginValue is the state at the beginning of the input.
 212 func stateBeginValue(s *scanner, c byte) int {
 213         if c <= ' ' && isSpace(c) {
 214                 return scanSkipSpace
 215         }
 216         switch c {
 217         case '{':
 218                 s.step = stateBeginStringOrEmpty
 219                 s.pushParseState(parseObjectKey)
 220                 return scanBeginObject
 221         case '[':
 222                 s.step = stateBeginValueOrEmpty
 223                 s.pushParseState(parseArrayValue)
 224                 return scanBeginArray
 225         case '"':
 226                 s.step = stateInString
 227                 return scanBeginLiteral
 228         case '-':
 229                 s.step = stateNeg
 230                 return scanBeginLiteral
 231         case '0': // beginning of 0.123
 232                 s.step = state0
 233                 return scanBeginLiteral
 234         case 't': // beginning of true
 235                 s.step = stateT
 236                 return scanBeginLiteral
 237         case 'f': // beginning of false
 238                 s.step = stateF
 239                 return scanBeginLiteral
 240         case 'n': // beginning of null
 241                 s.step = stateN
 242                 return scanBeginLiteral
 243         }
 244         if '1' <= c && c <= '9' { // beginning of 1234.5
 245                 s.step = state1
 246                 return scanBeginLiteral
 247         }
 248         return s.error(c, "looking for beginning of value")
 249 }
 250
 251 // stateBeginStringOrEmpty is the state after reading `{`.
 252 func stateBeginStringOrEmpty(s *scanner, c byte) int {
 253         if c <= ' ' && isSpace(c) {
 254                 return scanSkipSpace
 255         }
 256         if c == '}' {
 257                 n := len(s.parseState)
 258                 s.parseState[n-1] = parseObjectValue
 259                 return stateEndValue(s, c)
 260         }
 261         return stateBeginString(s, c)
 262 }
 263
 264 // stateBeginString is the state after reading `{"key": value,`.
 265 func stateBeginString(s *scanner, c byte) int {
 266         if c <= ' ' && isSpace(c) {
 267                 return scanSkipSpace
 268         }
 269         if c == '"' {
 270                 s.step = stateInString
 271                 return scanBeginLiteral
 272         }
 273         return s.error(c, "looking for beginning of object key string")
 274 }
 275
 276 // stateEndValue is the state after completing a value,
 277 // such as after reading `{}` or `true` or `["x"`.
 278 func stateEndValue(s *scanner, c byte) int {
 279         n := len(s.parseState)
 280         if n == 0 {
 281                 // Completed top-level before the current byte.
 282                 s.step = stateEndTop
 283                 s.endTop = true
 284                 return stateEndTop(s, c)
 285         }
 286         if c <= ' ' && isSpace(c) {
 287                 s.step = stateEndValue
 288                 return scanSkipSpace
 289         }
 290         ps := s.parseState[n-1]
 291         switch ps {
 292         case parseObjectKey:
 293                 if c == ':' {
 294                         s.parseState[n-1] = parseObjectValue
 295                         s.step = stateBeginValue
 296                         return scanObjectKey
 297                 }
 298                 return s.error(c, "after object key")
 299         case parseObjectValue:
 300                 if c == ',' {
 301                         s.parseState[n-1] = parseObjectKey
 302                         s.step = stateBeginString
 303                         return scanObjectValue
 304                 }
 305                 if c == '}' {
 306                         s.popParseState()
 307                         return scanEndObject
 308                 }
 309                 return s.error(c, "after object key:value pair")
 310         case parseArrayValue:
 311                 if c == ',' {
 312                         s.step = stateBeginValue
 313                         return scanArrayValue
 314                 }
 315                 if c == ']' {
 316                         s.popParseState()
 317                         return scanEndArray
 318                 }
 319                 return s.error(c, "after array element")
 320         }
 321         return s.error(c, "")
 322 }
 323
 324 // stateEndTop is the state after finishing the top-level value,
 325 // such as after reading `{}` or `[1,2,3]`.
 326 // Only space characters should be seen now.
 327 func stateEndTop(s *scanner, c byte) int {
 328         if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
 329                 // Complain about non-space byte on next call.
 330                 s.error(c, "after top-level value")
 331         }
 332         return scanEnd
 333 }
 334
 335 // stateInString is the state after reading `"`.
 336 func stateInString(s *scanner, c byte) int {
 337         if c == '"' {
 338                 s.step = stateEndValue
 339                 return scanContinue
 340         }
 341         if c == '\\' {
 342                 s.step = stateInStringEsc
 343                 return scanContinue
 344         }
 345         if c < 0x20 {
 346                 return s.error(c, "in string literal")
 347         }
 348         return scanContinue
 349 }
 350
 351 // stateInStringEsc is the state after reading `"\` during a quoted string.
 352 func stateInStringEsc(s *scanner, c byte) int {
 353         switch c {
 354         case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
 355                 s.step = stateInString
 356                 return scanContinue
 357         case 'u':
 358                 s.step = stateInStringEscU
 359                 return scanContinue
 360         }
 361         return s.error(c, "in string escape code")
 362 }
 363
 364 // stateInStringEscU is the state after reading `"\u` during a quoted string.
 365 func stateInStringEscU(s *scanner, c byte) int {
 366         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 367                 s.step = stateInStringEscU1
 368                 return scanContinue
 369         }
 370         // numbers
 371         return s.error(c, "in \\u hexadecimal character escape")
 372 }
 373
 374 // stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
 375 func stateInStringEscU1(s *scanner, c byte) int {
 376         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 377                 s.step = stateInStringEscU12
 378                 return scanContinue
 379         }
 380         // numbers
 381         return s.error(c, "in \\u hexadecimal character escape")
 382 }
 383
 384 // stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
 385 func stateInStringEscU12(s *scanner, c byte) int {
 386         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 387                 s.step = stateInStringEscU123
 388                 return scanContinue
 389         }
 390         // numbers
 391         return s.error(c, "in \\u hexadecimal character escape")
 392 }
 393
 394 // stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
 395 func stateInStringEscU123(s *scanner, c byte) int {
 396         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 397                 s.step = stateInString
 398                 return scanContinue
 399         }
 400         // numbers
 401         return s.error(c, "in \\u hexadecimal character escape")
 402 }
 403
 404 // stateNeg is the state after reading `-` during a number.
 405 func stateNeg(s *scanner, c byte) int {
 406         if c == '0' {
 407                 s.step = state0
 408                 return scanContinue
 409         }
 410         if '1' <= c && c <= '9' {
 411                 s.step = state1
 412                 return scanContinue
 413         }
 414         return s.error(c, "in numeric literal")
 415 }
 416
 417 // state1 is the state after reading a non-zero integer during a number,
 418 // such as after reading `1` or `100` but not `0`.
 419 func state1(s *scanner, c byte) int {
 420         if '0' <= c && c <= '9' {
 421                 s.step = state1
 422                 return scanContinue
 423         }
 424         return state0(s, c)
 425 }
 426
 427 // state0 is the state after reading `0` during a number.
 428 func state0(s *scanner, c byte) int {
 429         if c == '.' {
 430                 s.step = stateDot
 431                 return scanContinue
 432         }
 433         if c == 'e' || c == 'E' {
 434                 s.step = stateE
 435                 return scanContinue
 436         }
 437         return stateEndValue(s, c)
 438 }
 439
 440 // stateDot is the state after reading the integer and decimal point in a number,
 441 // such as after reading `1.`.
 442 func stateDot(s *scanner, c byte) int {
 443         if '0' <= c && c <= '9' {
 444                 s.step = stateDot0
 445                 return scanContinue
 446         }
 447         return s.error(c, "after decimal point in numeric literal")
 448 }
 449
 450 // stateDot0 is the state after reading the integer, decimal point, and subsequent
 451 // digits of a number, such as after reading `3.14`.
 452 func stateDot0(s *scanner, c byte) int {
 453         if '0' <= c && c <= '9' {
 454                 return scanContinue
 455         }
 456         if c == 'e' || c == 'E' {
 457                 s.step = stateE
 458                 return scanContinue
 459         }
 460         return stateEndValue(s, c)
 461 }
 462
 463 // stateE is the state after reading the mantissa and e in a number,
 464 // such as after reading `314e` or `0.314e`.
 465 func stateE(s *scanner, c byte) int {
 466         if c == '+' || c == '-' {
 467                 s.step = stateESign
 468                 return scanContinue
 469         }
 470         return stateESign(s, c)
 471 }
 472
 473 // stateESign is the state after reading the mantissa, e, and sign in a number,
 474 // such as after reading `314e-` or `0.314e+`.
 475 func stateESign(s *scanner, c byte) int {
 476         if '0' <= c && c <= '9' {
 477                 s.step = stateE0
 478                 return scanContinue
 479         }
 480         return s.error(c, "in exponent of numeric literal")
 481 }
 482
 483 // stateE0 is the state after reading the mantissa, e, optional sign,
 484 // and at least one digit of the exponent in a number,
 485 // such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
 486 func stateE0(s *scanner, c byte) int {
 487         if '0' <= c && c <= '9' {
 488                 return scanContinue
 489         }
 490         return stateEndValue(s, c)
 491 }
 492
 493 // stateT is the state after reading `t`.
 494 func stateT(s *scanner, c byte) int {
 495         if c == 'r' {
 496                 s.step = stateTr
 497                 return scanContinue
 498         }
 499         return s.error(c, "in literal true (expecting 'r')")
 500 }
 501
 502 // stateTr is the state after reading `tr`.
 503 func stateTr(s *scanner, c byte) int {
 504         if c == 'u' {
 505                 s.step = stateTru
 506                 return scanContinue
 507         }
 508         return s.error(c, "in literal true (expecting 'u')")
 509 }
 510
 511 // stateTru is the state after reading `tru`.
 512 func stateTru(s *scanner, c byte) int {
 513         if c == 'e' {
 514                 s.step = stateEndValue
 515                 return scanContinue
 516         }
 517         return s.error(c, "in literal true (expecting 'e')")
 518 }
 519
 520 // stateF is the state after reading `f`.
 521 func stateF(s *scanner, c byte) int {
 522         if c == 'a' {
 523                 s.step = stateFa
 524                 return scanContinue
 525         }
 526         return s.error(c, "in literal false (expecting 'a')")
 527 }
 528
 529 // stateFa is the state after reading `fa`.
 530 func stateFa(s *scanner, c byte) int {
 531         if c == 'l' {
 532                 s.step = stateFal
 533                 return scanContinue
 534         }
 535         return s.error(c, "in literal false (expecting 'l')")
 536 }
 537
 538 // stateFal is the state after reading `fal`.
 539 func stateFal(s *scanner, c byte) int {
 540         if c == 's' {
 541                 s.step = stateFals
 542                 return scanContinue
 543         }
 544         return s.error(c, "in literal false (expecting 's')")
 545 }
 546
 547 // stateFals is the state after reading `fals`.
 548 func stateFals(s *scanner, c byte) int {
 549         if c == 'e' {
 550                 s.step = stateEndValue
 551                 return scanContinue
 552         }
 553         return s.error(c, "in literal false (expecting 'e')")
 554 }
 555
 556 // stateN is the state after reading `n`.
 557 func stateN(s *scanner, c byte) int {
 558         if c == 'u' {
 559                 s.step = stateNu
 560                 return scanContinue
 561         }
 562         return s.error(c, "in literal null (expecting 'u')")
 563 }
 564
 565 // stateNu is the state after reading `nu`.
 566 func stateNu(s *scanner, c byte) int {
 567         if c == 'l' {
 568                 s.step = stateNul
 569                 return scanContinue
 570         }
 571         return s.error(c, "in literal null (expecting 'l')")
 572 }
 573
 574 // stateNul is the state after reading `nul`.
 575 func stateNul(s *scanner, c byte) int {
 576         if c == 'l' {
 577                 s.step = stateEndValue
 578                 return scanContinue
 579         }
 580         return s.error(c, "in literal null (expecting 'l')")
 581 }
 582
 583 // stateError is the state after reaching a syntax error,
 584 // such as after reading `[1}` or `5.1.2`.
 585 func stateError(s *scanner, c byte) int {
 586         return scanError
 587 }
 588
 589 // error records an error and switches to the error state.
 590 func (s *scanner) error(c byte, context string) int {
 591         s.step = stateError
 592         s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
 593         return scanError
 594 }
 595
 596 // quoteChar formats c as a quoted character literal
 597 func quoteChar(c byte) string {
 598         // special cases - different from quoted strings
 599         if c == '\'' {
 600                 return `'\''`
 601         }
 602         if c == '"' {
 603                 return `'"'`
 604         }
 605
 606         // use quoted string with different quotation marks
 607         s := strconv.Quote(string(c))
 608         return "'" + s[1:len(s)-1] + "'"
 609 }
 610
 611 // undo causes the scanner to return scanCode from the next state transition.
 612 // This gives callers a simple 1-byte undo mechanism.
 613 func (s *scanner) undo(scanCode int) {
 614         if s.redo {
 615                 panic("json: invalid use of scanner")
 616         }
 617         s.redoCode = scanCode
 618         s.redoState = s.step
 619         s.step = stateRedo
 620         s.redo = true
 621 }
 622
 623 // stateRedo helps implement the scanner's 1-byte undo.
 624 func stateRedo(s *scanner, c byte) int {
 625         s.redo = false
 626         s.step = s.redoState
 627         return s.redoCode
 628 }