libgo/go/encoding/json/scanner.go

   1 // Copyright 2010 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package json
   6
   7 // JSON value parser state machine.
   8 // Just about at the limit of what is reasonable to write by hand.
   9 // Some parts are a bit tedious, but overall it nicely factors out the
  10 // otherwise common code from the multiple scanning functions
  11 // in this package (Compact, Indent, checkValid, etc).
  12 //
  13 // This file starts with two simple examples using the scanner
  14 // before diving into the scanner itself.
  15
  16 import (
  17         "strconv"
  18         "sync"
  19 )
  20
  21 // Valid reports whether data is a valid JSON encoding.
  22 func Valid(data []byte) bool {
  23         scan := newScanner()
  24         defer freeScanner(scan)
  25         return checkValid(data, scan) == nil
  26 }
  27
  28 // checkValid verifies that data is valid JSON-encoded data.
  29 // scan is passed in for use by checkValid to avoid an allocation.
  30 func checkValid(data []byte, scan *scanner) error {
  31         scan.reset()
  32         for _, c := range data {
  33                 scan.bytes++
  34                 if scan.step(scan, c) == scanError {
  35                         return scan.err
  36                 }
  37         }
  38         if scan.eof() == scanError {
  39                 return scan.err
  40         }
  41         return nil
  42 }
  43
  44 // A SyntaxError is a description of a JSON syntax error.
  45 type SyntaxError struct {
  46         msg    string // description of error
  47         Offset int64  // error occurred after reading Offset bytes
  48 }
  49
  50 func (e *SyntaxError) Error() string { return e.msg }
  51
  52 // A scanner is a JSON scanning state machine.
  53 // Callers call scan.reset and then pass bytes in one at a time
  54 // by calling scan.step(&scan, c) for each byte.
  55 // The return value, referred to as an opcode, tells the
  56 // caller about significant parsing events like beginning
  57 // and ending literals, objects, and arrays, so that the
  58 // caller can follow along if it wishes.
  59 // The return value scanEnd indicates that a single top-level
  60 // JSON value has been completed, *before* the byte that
  61 // just got passed in.  (The indication must be delayed in order
  62 // to recognize the end of numbers: is 123 a whole value or
  63 // the beginning of 12345e+6?).
  64 type scanner struct {
  65         // The step is a func to be called to execute the next transition.
  66         // Also tried using an integer constant and a single func
  67         // with a switch, but using the func directly was 10% faster
  68         // on a 64-bit Mac Mini, and it's nicer to read.
  69         step func(*scanner, byte) int
  70
  71         // Reached end of top-level value.
  72         endTop bool
  73
  74         // Stack of what we're in the middle of - array values, object keys, object values.
  75         parseState []int
  76
  77         // Error that happened, if any.
  78         err error
  79
  80         // total bytes consumed, updated by decoder.Decode (and deliberately
  81         // not set to zero by scan.reset)
  82         bytes int64
  83 }
  84
  85 var scannerPool = sync.Pool{
  86         New: func() any {
  87                 return &scanner{}
  88         },
  89 }
  90
  91 func newScanner() *scanner {
  92         scan := scannerPool.Get().(*scanner)
  93         // scan.reset by design doesn't set bytes to zero
  94         scan.bytes = 0
  95         scan.reset()
  96         return scan
  97 }
  98
  99 func freeScanner(scan *scanner) {
 100         // Avoid hanging on to too much memory in extreme cases.
 101         if len(scan.parseState) > 1024 {
 102                 scan.parseState = nil
 103         }
 104         scannerPool.Put(scan)
 105 }
 106
 107 // These values are returned by the state transition functions
 108 // assigned to scanner.state and the method scanner.eof.
 109 // They give details about the current state of the scan that
 110 // callers might be interested to know about.
 111 // It is okay to ignore the return value of any particular
 112 // call to scanner.state: if one call returns scanError,
 113 // every subsequent call will return scanError too.
 114 const (
 115         // Continue.
 116         scanContinue     = iota // uninteresting byte
 117         scanBeginLiteral        // end implied by next result != scanContinue
 118         scanBeginObject         // begin object
 119         scanObjectKey           // just finished object key (string)
 120         scanObjectValue         // just finished non-last object value
 121         scanEndObject           // end object (implies scanObjectValue if possible)
 122         scanBeginArray          // begin array
 123         scanArrayValue          // just finished array value
 124         scanEndArray            // end array (implies scanArrayValue if possible)
 125         scanSkipSpace           // space byte; can skip; known to be last "continue" result
 126
 127         // Stop.
 128         scanEnd   // top-level value ended *before* this byte; known to be first "stop" result
 129         scanError // hit an error, scanner.err.
 130 )
 131
 132 // These values are stored in the parseState stack.
 133 // They give the current state of a composite value
 134 // being scanned. If the parser is inside a nested value
 135 // the parseState describes the nested state, outermost at entry 0.
 136 const (
 137         parseObjectKey   = iota // parsing object key (before colon)
 138         parseObjectValue        // parsing object value (after colon)
 139         parseArrayValue         // parsing array value
 140 )
 141
 142 // This limits the max nesting depth to prevent stack overflow.
 143 // This is permitted by https://tools.ietf.org/html/rfc7159#section-9
 144 const maxNestingDepth = 10000
 145
 146 // reset prepares the scanner for use.
 147 // It must be called before calling s.step.
 148 func (s *scanner) reset() {
 149         s.step = stateBeginValue
 150         s.parseState = s.parseState[0:0]
 151         s.err = nil
 152         s.endTop = false
 153 }
 154
 155 // eof tells the scanner that the end of input has been reached.
 156 // It returns a scan status just as s.step does.
 157 func (s *scanner) eof() int {
 158         if s.err != nil {
 159                 return scanError
 160         }
 161         if s.endTop {
 162                 return scanEnd
 163         }
 164         s.step(s, ' ')
 165         if s.endTop {
 166                 return scanEnd
 167         }
 168         if s.err == nil {
 169                 s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
 170         }
 171         return scanError
 172 }
 173
 174 // pushParseState pushes a new parse state p onto the parse stack.
 175 // an error state is returned if maxNestingDepth was exceeded, otherwise successState is returned.
 176 func (s *scanner) pushParseState(c byte, newParseState int, successState int) int {
 177         s.parseState = append(s.parseState, newParseState)
 178         if len(s.parseState) <= maxNestingDepth {
 179                 return successState
 180         }
 181         return s.error(c, "exceeded max depth")
 182 }
 183
 184 // popParseState pops a parse state (already obtained) off the stack
 185 // and updates s.step accordingly.
 186 func (s *scanner) popParseState() {
 187         n := len(s.parseState) - 1
 188         s.parseState = s.parseState[0:n]
 189         if n == 0 {
 190                 s.step = stateEndTop
 191                 s.endTop = true
 192         } else {
 193                 s.step = stateEndValue
 194         }
 195 }
 196
 197 func isSpace(c byte) bool {
 198         return c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n')
 199 }
 200
 201 // stateBeginValueOrEmpty is the state after reading `[`.
 202 func stateBeginValueOrEmpty(s *scanner, c byte) int {
 203         if isSpace(c) {
 204                 return scanSkipSpace
 205         }
 206         if c == ']' {
 207                 return stateEndValue(s, c)
 208         }
 209         return stateBeginValue(s, c)
 210 }
 211
 212 // stateBeginValue is the state at the beginning of the input.
 213 func stateBeginValue(s *scanner, c byte) int {
 214         if isSpace(c) {
 215                 return scanSkipSpace
 216         }
 217         switch c {
 218         case '{':
 219                 s.step = stateBeginStringOrEmpty
 220                 return s.pushParseState(c, parseObjectKey, scanBeginObject)
 221         case '[':
 222                 s.step = stateBeginValueOrEmpty
 223                 return s.pushParseState(c, parseArrayValue, scanBeginArray)
 224         case '"':
 225                 s.step = stateInString
 226                 return scanBeginLiteral
 227         case '-':
 228                 s.step = stateNeg
 229                 return scanBeginLiteral
 230         case '0': // beginning of 0.123
 231                 s.step = state0
 232                 return scanBeginLiteral
 233         case 't': // beginning of true
 234                 s.step = stateT
 235                 return scanBeginLiteral
 236         case 'f': // beginning of false
 237                 s.step = stateF
 238                 return scanBeginLiteral
 239         case 'n': // beginning of null
 240                 s.step = stateN
 241                 return scanBeginLiteral
 242         }
 243         if '1' <= c && c <= '9' { // beginning of 1234.5
 244                 s.step = state1
 245                 return scanBeginLiteral
 246         }
 247         return s.error(c, "looking for beginning of value")
 248 }
 249
 250 // stateBeginStringOrEmpty is the state after reading `{`.
 251 func stateBeginStringOrEmpty(s *scanner, c byte) int {
 252         if isSpace(c) {
 253                 return scanSkipSpace
 254         }
 255         if c == '}' {
 256                 n := len(s.parseState)
 257                 s.parseState[n-1] = parseObjectValue
 258                 return stateEndValue(s, c)
 259         }
 260         return stateBeginString(s, c)
 261 }
 262
 263 // stateBeginString is the state after reading `{"key": value,`.
 264 func stateBeginString(s *scanner, c byte) int {
 265         if isSpace(c) {
 266                 return scanSkipSpace
 267         }
 268         if c == '"' {
 269                 s.step = stateInString
 270                 return scanBeginLiteral
 271         }
 272         return s.error(c, "looking for beginning of object key string")
 273 }
 274
 275 // stateEndValue is the state after completing a value,
 276 // such as after reading `{}` or `true` or `["x"`.
 277 func stateEndValue(s *scanner, c byte) int {
 278         n := len(s.parseState)
 279         if n == 0 {
 280                 // Completed top-level before the current byte.
 281                 s.step = stateEndTop
 282                 s.endTop = true
 283                 return stateEndTop(s, c)
 284         }
 285         if isSpace(c) {
 286                 s.step = stateEndValue
 287                 return scanSkipSpace
 288         }
 289         ps := s.parseState[n-1]
 290         switch ps {
 291         case parseObjectKey:
 292                 if c == ':' {
 293                         s.parseState[n-1] = parseObjectValue
 294                         s.step = stateBeginValue
 295                         return scanObjectKey
 296                 }
 297                 return s.error(c, "after object key")
 298         case parseObjectValue:
 299                 if c == ',' {
 300                         s.parseState[n-1] = parseObjectKey
 301                         s.step = stateBeginString
 302                         return scanObjectValue
 303                 }
 304                 if c == '}' {
 305                         s.popParseState()
 306                         return scanEndObject
 307                 }
 308                 return s.error(c, "after object key:value pair")
 309         case parseArrayValue:
 310                 if c == ',' {
 311                         s.step = stateBeginValue
 312                         return scanArrayValue
 313                 }
 314                 if c == ']' {
 315                         s.popParseState()
 316                         return scanEndArray
 317                 }
 318                 return s.error(c, "after array element")
 319         }
 320         return s.error(c, "")
 321 }
 322
 323 // stateEndTop is the state after finishing the top-level value,
 324 // such as after reading `{}` or `[1,2,3]`.
 325 // Only space characters should be seen now.
 326 func stateEndTop(s *scanner, c byte) int {
 327         if !isSpace(c) {
 328                 // Complain about non-space byte on next call.
 329                 s.error(c, "after top-level value")
 330         }
 331         return scanEnd
 332 }
 333
 334 // stateInString is the state after reading `"`.
 335 func stateInString(s *scanner, c byte) int {
 336         if c == '"' {
 337                 s.step = stateEndValue
 338                 return scanContinue
 339         }
 340         if c == '\\' {
 341                 s.step = stateInStringEsc
 342                 return scanContinue
 343         }
 344         if c < 0x20 {
 345                 return s.error(c, "in string literal")
 346         }
 347         return scanContinue
 348 }
 349
 350 // stateInStringEsc is the state after reading `"\` during a quoted string.
 351 func stateInStringEsc(s *scanner, c byte) int {
 352         switch c {
 353         case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
 354                 s.step = stateInString
 355                 return scanContinue
 356         case 'u':
 357                 s.step = stateInStringEscU
 358                 return scanContinue
 359         }
 360         return s.error(c, "in string escape code")
 361 }
 362
 363 // stateInStringEscU is the state after reading `"\u` during a quoted string.
 364 func stateInStringEscU(s *scanner, c byte) int {
 365         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 366                 s.step = stateInStringEscU1
 367                 return scanContinue
 368         }
 369         // numbers
 370         return s.error(c, "in \\u hexadecimal character escape")
 371 }
 372
 373 // stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
 374 func stateInStringEscU1(s *scanner, c byte) int {
 375         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 376                 s.step = stateInStringEscU12
 377                 return scanContinue
 378         }
 379         // numbers
 380         return s.error(c, "in \\u hexadecimal character escape")
 381 }
 382
 383 // stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
 384 func stateInStringEscU12(s *scanner, c byte) int {
 385         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 386                 s.step = stateInStringEscU123
 387                 return scanContinue
 388         }
 389         // numbers
 390         return s.error(c, "in \\u hexadecimal character escape")
 391 }
 392
 393 // stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
 394 func stateInStringEscU123(s *scanner, c byte) int {
 395         if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
 396                 s.step = stateInString
 397                 return scanContinue
 398         }
 399         // numbers
 400         return s.error(c, "in \\u hexadecimal character escape")
 401 }
 402
 403 // stateNeg is the state after reading `-` during a number.
 404 func stateNeg(s *scanner, c byte) int {
 405         if c == '0' {
 406                 s.step = state0
 407                 return scanContinue
 408         }
 409         if '1' <= c && c <= '9' {
 410                 s.step = state1
 411                 return scanContinue
 412         }
 413         return s.error(c, "in numeric literal")
 414 }
 415
 416 // state1 is the state after reading a non-zero integer during a number,
 417 // such as after reading `1` or `100` but not `0`.
 418 func state1(s *scanner, c byte) int {
 419         if '0' <= c && c <= '9' {
 420                 s.step = state1
 421                 return scanContinue
 422         }
 423         return state0(s, c)
 424 }
 425
 426 // state0 is the state after reading `0` during a number.
 427 func state0(s *scanner, c byte) int {
 428         if c == '.' {
 429                 s.step = stateDot
 430                 return scanContinue
 431         }
 432         if c == 'e' || c == 'E' {
 433                 s.step = stateE
 434                 return scanContinue
 435         }
 436         return stateEndValue(s, c)
 437 }
 438
 439 // stateDot is the state after reading the integer and decimal point in a number,
 440 // such as after reading `1.`.
 441 func stateDot(s *scanner, c byte) int {
 442         if '0' <= c && c <= '9' {
 443                 s.step = stateDot0
 444                 return scanContinue
 445         }
 446         return s.error(c, "after decimal point in numeric literal")
 447 }
 448
 449 // stateDot0 is the state after reading the integer, decimal point, and subsequent
 450 // digits of a number, such as after reading `3.14`.
 451 func stateDot0(s *scanner, c byte) int {
 452         if '0' <= c && c <= '9' {
 453                 return scanContinue
 454         }
 455         if c == 'e' || c == 'E' {
 456                 s.step = stateE
 457                 return scanContinue
 458         }
 459         return stateEndValue(s, c)
 460 }
 461
 462 // stateE is the state after reading the mantissa and e in a number,
 463 // such as after reading `314e` or `0.314e`.
 464 func stateE(s *scanner, c byte) int {
 465         if c == '+' || c == '-' {
 466                 s.step = stateESign
 467                 return scanContinue
 468         }
 469         return stateESign(s, c)
 470 }
 471
 472 // stateESign is the state after reading the mantissa, e, and sign in a number,
 473 // such as after reading `314e-` or `0.314e+`.
 474 func stateESign(s *scanner, c byte) int {
 475         if '0' <= c && c <= '9' {
 476                 s.step = stateE0
 477                 return scanContinue
 478         }
 479         return s.error(c, "in exponent of numeric literal")
 480 }
 481
 482 // stateE0 is the state after reading the mantissa, e, optional sign,
 483 // and at least one digit of the exponent in a number,
 484 // such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
 485 func stateE0(s *scanner, c byte) int {
 486         if '0' <= c && c <= '9' {
 487                 return scanContinue
 488         }
 489         return stateEndValue(s, c)
 490 }
 491
 492 // stateT is the state after reading `t`.
 493 func stateT(s *scanner, c byte) int {
 494         if c == 'r' {
 495                 s.step = stateTr
 496                 return scanContinue
 497         }
 498         return s.error(c, "in literal true (expecting 'r')")
 499 }
 500
 501 // stateTr is the state after reading `tr`.
 502 func stateTr(s *scanner, c byte) int {
 503         if c == 'u' {
 504                 s.step = stateTru
 505                 return scanContinue
 506         }
 507         return s.error(c, "in literal true (expecting 'u')")
 508 }
 509
 510 // stateTru is the state after reading `tru`.
 511 func stateTru(s *scanner, c byte) int {
 512         if c == 'e' {
 513                 s.step = stateEndValue
 514                 return scanContinue
 515         }
 516         return s.error(c, "in literal true (expecting 'e')")
 517 }
 518
 519 // stateF is the state after reading `f`.
 520 func stateF(s *scanner, c byte) int {
 521         if c == 'a' {
 522                 s.step = stateFa
 523                 return scanContinue
 524         }
 525         return s.error(c, "in literal false (expecting 'a')")
 526 }
 527
 528 // stateFa is the state after reading `fa`.
 529 func stateFa(s *scanner, c byte) int {
 530         if c == 'l' {
 531                 s.step = stateFal
 532                 return scanContinue
 533         }
 534         return s.error(c, "in literal false (expecting 'l')")
 535 }
 536
 537 // stateFal is the state after reading `fal`.
 538 func stateFal(s *scanner, c byte) int {
 539         if c == 's' {
 540                 s.step = stateFals
 541                 return scanContinue
 542         }
 543         return s.error(c, "in literal false (expecting 's')")
 544 }
 545
 546 // stateFals is the state after reading `fals`.
 547 func stateFals(s *scanner, c byte) int {
 548         if c == 'e' {
 549                 s.step = stateEndValue
 550                 return scanContinue
 551         }
 552         return s.error(c, "in literal false (expecting 'e')")
 553 }
 554
 555 // stateN is the state after reading `n`.
 556 func stateN(s *scanner, c byte) int {
 557         if c == 'u' {
 558                 s.step = stateNu
 559                 return scanContinue
 560         }
 561         return s.error(c, "in literal null (expecting 'u')")
 562 }
 563
 564 // stateNu is the state after reading `nu`.
 565 func stateNu(s *scanner, c byte) int {
 566         if c == 'l' {
 567                 s.step = stateNul
 568                 return scanContinue
 569         }
 570         return s.error(c, "in literal null (expecting 'l')")
 571 }
 572
 573 // stateNul is the state after reading `nul`.
 574 func stateNul(s *scanner, c byte) int {
 575         if c == 'l' {
 576                 s.step = stateEndValue
 577                 return scanContinue
 578         }
 579         return s.error(c, "in literal null (expecting 'l')")
 580 }
 581
 582 // stateError is the state after reaching a syntax error,
 583 // such as after reading `[1}` or `5.1.2`.
 584 func stateError(s *scanner, c byte) int {
 585         return scanError
 586 }
 587
 588 // error records an error and switches to the error state.
 589 func (s *scanner) error(c byte, context string) int {
 590         s.step = stateError
 591         s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
 592         return scanError
 593 }
 594
 595 // quoteChar formats c as a quoted character literal
 596 func quoteChar(c byte) string {
 597         // special cases - different from quoted strings
 598         if c == '\'' {
 599                 return `'\''`
 600         }
 601         if c == '"' {
 602                 return `'"'`
 603         }
 604
 605         // use quoted string with different quotation marks
 606         s := strconv.Quote(string(c))
 607         return "'" + s[1:len(s)-1] + "'"
 608 }