libgo/go/encoding/json/scanner.go

   1 // Copyright 2010 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package json
   6
   7 // JSON value parser state machine.
   8 // Just about at the limit of what is reasonable to write by hand.
   9 // Some parts are a bit tedious, but overall it nicely factors out the
  10 // otherwise common code from the multiple scanning functions
  11 // in this package (Compact, Indent, checkValid, etc).
  12 //
  13 // This file starts with two simple examples using the scanner
  14 // before diving into the scanner itself.
  15
  16 import "strconv"
  17
  18 // Valid reports whether data is a valid JSON encoding.
  19 func Valid(data []byte) bool {
  20         return checkValid(data, &scanner{}) == nil
  21 }
  22
  23 // checkValid verifies that data is valid JSON-encoded data.
  24 // scan is passed in for use by checkValid to avoid an allocation.
  25 func checkValid(data []byte, scan *scanner) error {
  26         scan.reset()
  27         for _, c := range data {
  28                 scan.bytes++
  29                 if scan.step(scan, c) == scanError {
  30                         return scan.err
  31                 }
  32         }
  33         if scan.eof() == scanError {
  34                 return scan.err
  35         }
  36         return nil
  37 }
  38
  39 // A SyntaxError is a description of a JSON syntax error.
  40 type SyntaxError struct {
  41         msg    string // description of error
  42         Offset int64  // error occurred after reading Offset bytes
  43 }
  44
  45 func (e *SyntaxError) Error() string { return e.msg }
  46
  47 // A scanner is a JSON scanning state machine.
  48 // Callers call scan.reset() and then pass bytes in one at a time
  49 // by calling scan.step(&scan, c) for each byte.
  50 // The return value, referred to as an opcode, tells the
  51 // caller about significant parsing events like beginning
  52 // and ending literals, objects, and arrays, so that the
  53 // caller can follow along if it wishes.
  54 // The return value scanEnd indicates that a single top-level
  55 // JSON value has been completed, *before* the byte that
  56 // just got passed in.  (The indication must be delayed in order
  57 // to recognize the end of numbers: is 123 a whole value or
  58 // the beginning of 12345e+6?).
  59 type scanner struct {
  60         // The step is a func to be called to execute the next transition.
  61         // Also tried using an integer constant and a single func
  62         // with a switch, but using the func directly was 10% faster
  63         // on a 64-bit Mac Mini, and it's nicer to read.
  64         step func(*scanner, byte) int
  65
  66         // Reached end of top-level value.
  67         endTop bool
  68
  69         // Stack of what we're in the middle of - array values, object keys, object values.
  70         parseState []int
  71
  72         // Error that happened, if any.
  73         err error
  74
  75         // total bytes consumed, updated by decoder.Decode
  76         bytes int64
  77 }
  78
  79 // These values are returned by the state transition functions
  80 // assigned to scanner.state and the method scanner.eof.
  81 // They give details about the current state of the scan that
  82 // callers might be interested to know about.
  83 // It is okay to ignore the return value of any particular
  84 // call to scanner.state: if one call returns scanError,
  85 // every subsequent call will return scanError too.
  86 const (
  87         // Continue.
  88         scanContinue     = iota // uninteresting byte
  89         scanBeginLiteral        // end implied by next result != scanContinue
  90         scanBeginObject         // begin object
  91         scanObjectKey           // just finished object key (string)
  92         scanObjectValue         // just finished non-last object value
  93         scanEndObject           // end object (implies scanObjectValue if possible)
  94         scanBeginArray          // begin array
  95         scanArrayValue          // just finished array value
  96         scanEndArray            // end array (implies scanArrayValue if possible)
  97         scanSkipSpace           // space byte; can skip; known to be last "continue" result
  98
  99         // Stop.
 100         scanEnd   // top-level value ended *before* this byte; known to be first "stop" result
 101         scanError // hit an error, scanner.err.
 102 )
 103
 104 // These values are stored in the parseState stack.
 105 // They give the current state of a composite value
 106 // being scanned. If the parser is inside a nested value
 107 // the parseState describes the nested state, outermost at entry 0.
 108 const (
 109         parseObjectKey   = iota // parsing object key (before colon)
 110         parseObjectValue        // parsing object value (after colon)
 111         parseArrayValue         // parsing array value
 112 )
 113
 114 // reset prepares the scanner for use.
 115 // It must be called before calling s.step.
 116 func (s *scanner) reset() {
 117         s.step = stateBeginValue
 118         s.parseState = s.parseState[0:0]
 119         s.err = nil
 120         s.endTop = false
 121 }
 122
 123 // eof tells the scanner that the end of input has been reached.
 124 // It returns a scan status just as s.step does.
 125 func (s *scanner) eof() int {
 126         if s.err != nil {
 127                 return scanError
 128         }
 129         if s.endTop {
 130                 return scanEnd
 131         }
 132         s.step(s, ' ')
 133         if s.endTop {
 134                 return scanEnd
 135         }
 136         if s.err == nil {
 137                 s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
 138         }
 139         return scanError
 140 }
 141
 142 // pushParseState pushes a new parse state p onto the parse stack.
 143 func (s *scanner) pushParseState(p int) {
 144         s.parseState = append(s.parseState, p)
 145 }
 146
 147 // popParseState pops a parse state (already obtained) off the stack
 148 // and updates s.step accordingly.
 149 func (s *scanner) popParseState() {
 150         n := len(s.parseState) - 1
 151         s.parseState = s.parseState[0:n]
 152         if n == 0 {
 153                 s.step = stateEndTop
 154                 s.endTop = true
 155         } else {
 156                 s.step = stateEndValue
 157         }
 158 }
 159
 160 func isSpace(c byte) bool {
 161         return c == ' ' || c == '\t' || c == '\r' || c == '\n'
 162 }
 163
 164 // stateBeginValueOrEmpty is the state after reading `[`.
 165 func stateBeginValueOrEmpty(s *scanner, c byte) int {
 166         if c <= ' ' && isSpace(c) {
 167                 return scanSkipSpace
 168         }
 169         if c == ']' {
 170                 return stateEndValue(s, c)
 171         }
 172         return stateBeginValue(s, c)
 173 }
 174
 175 // stateBeginValue is the state at the beginning of the input.
 176 func stateBeginValue(s *scanner, c byte) int {
 177         if c <= ' ' && isSpace(c) {
 178                 return scanSkipSpace
 179         }
 180         switch c {
 181         case '{':
 182                 s.step = stateBeginStringOrEmpty
 183                 s.pushParseState(parseObjectKey)
 184                 return scanBeginObject
 185         case '[':
 186                 s.step = stateBeginValueOrEmpty
 187                 s.pushParseState(parseArrayValue)
 188                 return scanBeginArray
 189         case '"':
 190                 s.step = stateInString
 191                 return scanBeginLiteral
 192         case '-':
 193                 s.step = stateNeg
 194                 return scanBeginLiteral
 195         case '0': // beginning of 0.123
 196                 s.step = state0
 197                 return scanBeginLiteral
 198         case 't': // beginning of true
 199                 s.step = stateT
 200                 return scanBeginLiteral
 201         case 'f': // beginning of false
 202                 s.step = stateF
 203                 return scanBeginLiteral
 204         case 'n': // beginning of null
 205                 s.step = stateN
 206                 return scanBeginLiteral
 207         }
 208         if '1' <= c && c <= '9' { // beginning of 1234.5
 209                 s.step = state1
 210                 return scanBeginLiteral
 211         }
 212         return s.error(c, "looking for beginning of value")
 213 }
 214
 215 // stateBeginStringOrEmpty is the state after reading `{`.
 216 func stateBeginStringOrEmpty(s *scanner, c byte) int {
 217         if c <= ' ' && isSpace(c) {
 218                 return scanSkipSpace
 219         }
 220         if c == '}' {
 221                 n := len(s.parseState)
 222                 s.parseState[n-1] = parseObjectValue
 223                 return stateEndValue(s, c)
 224         }
 225         return stateBeginString(s, c)
 226 }
 227
 228 // stateBeginString is the state after reading `{"key": value,`.
 229 func stateBeginString(s *scanner, c byte) int {
 230         if c <= ' ' && isSpace(c) {
 231                 return scanSkipSpace
 232         }
 233         if c == '"' {
 234                 s.step = stateInString
 235                 return scanBeginLiteral
 236         }
 237         return s.error(c, "looking for beginning of object key string")
 238 }
 239
 240 // stateEndValue is the state after completing a value,
 241 // such as after reading `{}` or `true` or `["x"`.
 242 func stateEndValue(s *scanner, c byte) int {
 243         n := len(s.parseState)
 244         if n == 0 {
 245                 // Completed top-level before the current byte.
 246                 s.step = stateEndTop
 247                 s.endTop = true
 248                 return stateEndTop(s, c)
 249         }
 250         if c <= ' ' && isSpace(c) {
 251                 s.step = stateEndValue
 252                 return scanSkipSpace
 253         }
 254         ps := s.parseState[n-1]
 255         switch ps {
 256         case parseObjectKey:
 257                 if c == ':' {
 258                         s.parseState[n-1] = parseObjectValue
 259                         s.step = stateBeginValue
 260                         return scanObjectKey
 261                 }
 262                 return s.error(c, "after object key")
 263         case parseObjectValue:
 264                 if c == ',' {
 265                         s.parseState[n-1] = parseObjectKey
 266                         s.step = stateBeginString
 267                         return scanObjectValue
 268                 }
 269                 if c == '}' {
 270                         s.popParseState()
 271                         return scanEndObject
 272                 }
 273                 return s.error(c, "after object key:value pair")
 274         case parseArrayValue:
 275                 if c == ',' {
 276                         s.step = stateBeginValue
 277                         return scanArrayValue
 278                 }
 279                 if c == ']' {
 280                         s.popParseState()
 281                         return scanEndArray
 282                 }
 283                 return s.error(c, "after array element")
 284         }
 285         return s.error(c, "")
 286 }
 287
 288 // stateEndTop is the state after finishing the top-level value,
 289 // such as after reading `{}` or `[1,2,3]`.
 290 // Only space characters should be seen now.
 291 func stateEndTop(s *scanner, c byte) int {
 292         if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
 293                 // Complain about non-space byte on next call.
 294                 s.error(c, "after top-level value")
 295         }
 296         return scanEnd
 297 }
 298
 299 // stateInString is the state after reading `"`.
 300 func stateInString(s *scanner, c byte) int {
 301         if c == '"' {
 302                 s.step = stateEndValue
 303                 return scanContinue
 304         }
 305         if c == '\\' {
 306                 s.step = stateInStringEsc
 307                 return scanContinue
 308         }
 309         if c < 0x20 {
 310                 return s.error(c, "in string literal")
 311         }
 312         return scanContinue
 313 }
 314
 315 // stateInStringEsc is the state after reading `"\` during a quoted string.
 316 func stateInStringEsc(s *scanner, c byte) int {
 317         switch c {
 318         case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
 319                 s.step = stateInString
 320                 return scanContinue
 321         case 'u':
 322                 s.step = stateInStringEscU
 323                 return scanContinue
 324         }
 325         return s.error(c, "in string escape code")
 326 }
 327
 328 // stateInStringEscU is the state after reading `"\u` during a quoted string.
 329 func stateInStringEscU(s *scanner, c byte) int {
 330         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 331                 s.step = stateInStringEscU1
 332                 return scanContinue
 333         }
 334         // numbers
 335         return s.error(c, "in \\u hexadecimal character escape")
 336 }
 337
 338 // stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
 339 func stateInStringEscU1(s *scanner, c byte) int {
 340         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 341                 s.step = stateInStringEscU12
 342                 return scanContinue
 343         }
 344         // numbers
 345         return s.error(c, "in \\u hexadecimal character escape")
 346 }
 347
 348 // stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
 349 func stateInStringEscU12(s *scanner, c byte) int {
 350         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 351                 s.step = stateInStringEscU123
 352                 return scanContinue
 353         }
 354         // numbers
 355         return s.error(c, "in \\u hexadecimal character escape")
 356 }
 357
 358 // stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
 359 func stateInStringEscU123(s *scanner, c byte) int {
 360         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 361                 s.step = stateInString
 362                 return scanContinue
 363         }
 364         // numbers
 365         return s.error(c, "in \\u hexadecimal character escape")
 366 }
 367
 368 // stateNeg is the state after reading `-` during a number.
 369 func stateNeg(s *scanner, c byte) int {
 370         if c == '0' {
 371                 s.step = state0
 372                 return scanContinue
 373         }
 374         if '1' <= c && c <= '9' {
 375                 s.step = state1
 376                 return scanContinue
 377         }
 378         return s.error(c, "in numeric literal")
 379 }
 380
 381 // state1 is the state after reading a non-zero integer during a number,
 382 // such as after reading `1` or `100` but not `0`.
 383 func state1(s *scanner, c byte) int {
 384         if '0' <= c && c <= '9' {
 385                 s.step = state1
 386                 return scanContinue
 387         }
 388         return state0(s, c)
 389 }
 390
 391 // state0 is the state after reading `0` during a number.
 392 func state0(s *scanner, c byte) int {
 393         if c == '.' {
 394                 s.step = stateDot
 395                 return scanContinue
 396         }
 397         if c == 'e' || c == 'E' {
 398                 s.step = stateE
 399                 return scanContinue
 400         }
 401         return stateEndValue(s, c)
 402 }
 403
 404 // stateDot is the state after reading the integer and decimal point in a number,
 405 // such as after reading `1.`.
 406 func stateDot(s *scanner, c byte) int {
 407         if '0' <= c && c <= '9' {
 408                 s.step = stateDot0
 409                 return scanContinue
 410         }
 411         return s.error(c, "after decimal point in numeric literal")
 412 }
 413
 414 // stateDot0 is the state after reading the integer, decimal point, and subsequent
 415 // digits of a number, such as after reading `3.14`.
 416 func stateDot0(s *scanner, c byte) int {
 417         if '0' <= c && c <= '9' {
 418                 return scanContinue
 419         }
 420         if c == 'e' || c == 'E' {
 421                 s.step = stateE
 422                 return scanContinue
 423         }
 424         return stateEndValue(s, c)
 425 }
 426
 427 // stateE is the state after reading the mantissa and e in a number,
 428 // such as after reading `314e` or `0.314e`.
 429 func stateE(s *scanner, c byte) int {
 430         if c == '+' || c == '-' {
 431                 s.step = stateESign
 432                 return scanContinue
 433         }
 434         return stateESign(s, c)
 435 }
 436
 437 // stateESign is the state after reading the mantissa, e, and sign in a number,
 438 // such as after reading `314e-` or `0.314e+`.
 439 func stateESign(s *scanner, c byte) int {
 440         if '0' <= c && c <= '9' {
 441                 s.step = stateE0
 442                 return scanContinue
 443         }
 444         return s.error(c, "in exponent of numeric literal")
 445 }
 446
 447 // stateE0 is the state after reading the mantissa, e, optional sign,
 448 // and at least one digit of the exponent in a number,
 449 // such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
 450 func stateE0(s *scanner, c byte) int {
 451         if '0' <= c && c <= '9' {
 452                 return scanContinue
 453         }
 454         return stateEndValue(s, c)
 455 }
 456
 457 // stateT is the state after reading `t`.
 458 func stateT(s *scanner, c byte) int {
 459         if c == 'r' {
 460                 s.step = stateTr
 461                 return scanContinue
 462         }
 463         return s.error(c, "in literal true (expecting 'r')")
 464 }
 465
 466 // stateTr is the state after reading `tr`.
 467 func stateTr(s *scanner, c byte) int {
 468         if c == 'u' {
 469                 s.step = stateTru
 470                 return scanContinue
 471         }
 472         return s.error(c, "in literal true (expecting 'u')")
 473 }
 474
 475 // stateTru is the state after reading `tru`.
 476 func stateTru(s *scanner, c byte) int {
 477         if c == 'e' {
 478                 s.step = stateEndValue
 479                 return scanContinue
 480         }
 481         return s.error(c, "in literal true (expecting 'e')")
 482 }
 483
 484 // stateF is the state after reading `f`.
 485 func stateF(s *scanner, c byte) int {
 486         if c == 'a' {
 487                 s.step = stateFa
 488                 return scanContinue
 489         }
 490         return s.error(c, "in literal false (expecting 'a')")
 491 }
 492
 493 // stateFa is the state after reading `fa`.
 494 func stateFa(s *scanner, c byte) int {
 495         if c == 'l' {
 496                 s.step = stateFal
 497                 return scanContinue
 498         }
 499         return s.error(c, "in literal false (expecting 'l')")
 500 }
 501
 502 // stateFal is the state after reading `fal`.
 503 func stateFal(s *scanner, c byte) int {
 504         if c == 's' {
 505                 s.step = stateFals
 506                 return scanContinue
 507         }
 508         return s.error(c, "in literal false (expecting 's')")
 509 }
 510
 511 // stateFals is the state after reading `fals`.
 512 func stateFals(s *scanner, c byte) int {
 513         if c == 'e' {
 514                 s.step = stateEndValue
 515                 return scanContinue
 516         }
 517         return s.error(c, "in literal false (expecting 'e')")
 518 }
 519
 520 // stateN is the state after reading `n`.
 521 func stateN(s *scanner, c byte) int {
 522         if c == 'u' {
 523                 s.step = stateNu
 524                 return scanContinue
 525         }
 526         return s.error(c, "in literal null (expecting 'u')")
 527 }
 528
 529 // stateNu is the state after reading `nu`.
 530 func stateNu(s *scanner, c byte) int {
 531         if c == 'l' {
 532                 s.step = stateNul
 533                 return scanContinue
 534         }
 535         return s.error(c, "in literal null (expecting 'l')")
 536 }
 537
 538 // stateNul is the state after reading `nul`.
 539 func stateNul(s *scanner, c byte) int {
 540         if c == 'l' {
 541                 s.step = stateEndValue
 542                 return scanContinue
 543         }
 544         return s.error(c, "in literal null (expecting 'l')")
 545 }
 546
 547 // stateError is the state after reaching a syntax error,
 548 // such as after reading `[1}` or `5.1.2`.
 549 func stateError(s *scanner, c byte) int {
 550         return scanError
 551 }
 552
 553 // error records an error and switches to the error state.
 554 func (s *scanner) error(c byte, context string) int {
 555         s.step = stateError
 556         s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
 557         return scanError
 558 }
 559
 560 // quoteChar formats c as a quoted character literal
 561 func quoteChar(c byte) string {
 562         // special cases - different from quoted strings
 563         if c == '\'' {
 564                 return `'\''`
 565         }
 566         if c == '"' {
 567                 return `'"'`
 568         }
 569
 570         // use quoted string with different quotation marks
 571         s := strconv.Quote(string(c))
 572         return "'" + s[1:len(s)-1] + "'"
 573 }