scanner.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624
  1. // Copyright 2010 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package json
  5. // JSON value parser state machine.
  6. // Just about at the limit of what is reasonable to write by hand.
  7. // Some parts are a bit tedious, but overall it nicely factors out the
  8. // otherwise common code from the multiple scanning functions
  9. // in this package (Compact, Indent, checkValid, nextValue, etc).
  10. //
  11. // This file starts with two simple examples using the scanner
  12. // before diving into the scanner itself.
  13. import "strconv"
  14. // checkValid verifies that data is valid JSON-encoded data.
  15. // scan is passed in for use by checkValid to avoid an allocation.
  16. func checkValid(data []byte, scan *scanner) error {
  17. scan.reset()
  18. for _, c := range data {
  19. scan.bytes++
  20. if scan.step(scan, c) == scanError {
  21. return scan.err
  22. }
  23. }
  24. if scan.eof() == scanError {
  25. return scan.err
  26. }
  27. return nil
  28. }
  29. // nextValue splits data after the next whole JSON value,
  30. // returning that value and the bytes that follow it as separate slices.
  31. // scan is passed in for use by nextValue to avoid an allocation.
  32. func nextValue(data []byte, scan *scanner) (value, rest []byte, err error) {
  33. scan.reset()
  34. for i, c := range data {
  35. v := scan.step(scan, c)
  36. if v >= scanEndObject {
  37. switch v {
  38. // probe the scanner with a space to determine whether we will
  39. // get scanEnd on the next character. Otherwise, if the next character
  40. // is not a space, scanEndTop allocates a needless error.
  41. case scanEndObject, scanEndArray:
  42. if scan.step(scan, ' ') == scanEnd {
  43. return data[:i+1], data[i+1:], nil
  44. }
  45. case scanError:
  46. return nil, nil, scan.err
  47. case scanEnd:
  48. return data[:i], data[i:], nil
  49. }
  50. }
  51. }
  52. if scan.eof() == scanError {
  53. return nil, nil, scan.err
  54. }
  55. return data, nil, nil
  56. }
  57. // A SyntaxError is a description of a JSON syntax error.
  58. type SyntaxError struct {
  59. msg string // description of error
  60. Offset int64 // error occurred after reading Offset bytes
  61. }
  62. func (e *SyntaxError) Error() string { return e.msg }
  63. // A scanner is a JSON scanning state machine.
  64. // Callers call scan.reset() and then pass bytes in one at a time
  65. // by calling scan.step(&scan, c) for each byte.
  66. // The return value, referred to as an opcode, tells the
  67. // caller about significant parsing events like beginning
  68. // and ending literals, objects, and arrays, so that the
  69. // caller can follow along if it wishes.
  70. // The return value scanEnd indicates that a single top-level
  71. // JSON value has been completed, *before* the byte that
  72. // just got passed in. (The indication must be delayed in order
  73. // to recognize the end of numbers: is 123 a whole value or
  74. // the beginning of 12345e+6?).
  75. type scanner struct {
  76. // The step is a func to be called to execute the next transition.
  77. // Also tried using an integer constant and a single func
  78. // with a switch, but using the func directly was 10% faster
  79. // on a 64-bit Mac Mini, and it's nicer to read.
  80. step func(*scanner, byte) int
  81. // Reached end of top-level value.
  82. endTop bool
  83. // Stack of what we're in the middle of - array values, object keys, object values.
  84. parseState []int
  85. // Error that happened, if any.
  86. err error
  87. // 1-byte redo (see undo method)
  88. redo bool
  89. redoCode int
  90. redoState func(*scanner, byte) int
  91. // total bytes consumed, updated by decoder.Decode
  92. bytes int64
  93. }
  94. // These values are returned by the state transition functions
  95. // assigned to scanner.state and the method scanner.eof.
  96. // They give details about the current state of the scan that
  97. // callers might be interested to know about.
  98. // It is okay to ignore the return value of any particular
  99. // call to scanner.state: if one call returns scanError,
  100. // every subsequent call will return scanError too.
  101. const (
  102. // Continue.
  103. scanContinue = iota // uninteresting byte
  104. scanBeginLiteral // end implied by next result != scanContinue
  105. scanBeginObject // begin object
  106. scanObjectKey // just finished object key (string)
  107. scanObjectValue // just finished non-last object value
  108. scanEndObject // end object (implies scanObjectValue if possible)
  109. scanBeginArray // begin array
  110. scanArrayValue // just finished array value
  111. scanEndArray // end array (implies scanArrayValue if possible)
  112. scanSkipSpace // space byte; can skip; known to be last "continue" result
  113. // Stop.
  114. scanEnd // top-level value ended *before* this byte; known to be first "stop" result
  115. scanError // hit an error, scanner.err.
  116. )
  117. // These values are stored in the parseState stack.
  118. // They give the current state of a composite value
  119. // being scanned. If the parser is inside a nested value
  120. // the parseState describes the nested state, outermost at entry 0.
  121. const (
  122. parseObjectKey = iota // parsing object key (before colon)
  123. parseObjectValue // parsing object value (after colon)
  124. parseArrayValue // parsing array value
  125. )
  126. // reset prepares the scanner for use.
  127. // It must be called before calling s.step.
  128. func (s *scanner) reset() {
  129. s.step = stateBeginValue
  130. s.parseState = s.parseState[0:0]
  131. s.err = nil
  132. s.redo = false
  133. s.endTop = false
  134. }
  135. // eof tells the scanner that the end of input has been reached.
  136. // It returns a scan status just as s.step does.
  137. func (s *scanner) eof() int {
  138. if s.err != nil {
  139. return scanError
  140. }
  141. if s.endTop {
  142. return scanEnd
  143. }
  144. s.step(s, ' ')
  145. if s.endTop {
  146. return scanEnd
  147. }
  148. if s.err == nil {
  149. s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
  150. }
  151. return scanError
  152. }
  153. // pushParseState pushes a new parse state p onto the parse stack.
  154. func (s *scanner) pushParseState(p int) {
  155. s.parseState = append(s.parseState, p)
  156. }
  157. // popParseState pops a parse state (already obtained) off the stack
  158. // and updates s.step accordingly.
  159. func (s *scanner) popParseState() {
  160. n := len(s.parseState) - 1
  161. s.parseState = s.parseState[0:n]
  162. s.redo = false
  163. if n == 0 {
  164. s.step = stateEndTop
  165. s.endTop = true
  166. } else {
  167. s.step = stateEndValue
  168. }
  169. }
  170. func isSpace(c byte) bool {
  171. return c == ' ' || c == '\t' || c == '\r' || c == '\n'
  172. }
  173. // stateBeginValueOrEmpty is the state after reading `[`.
  174. func stateBeginValueOrEmpty(s *scanner, c byte) int {
  175. if c <= ' ' && isSpace(c) {
  176. return scanSkipSpace
  177. }
  178. if c == ']' {
  179. return stateEndValue(s, c)
  180. }
  181. return stateBeginValue(s, c)
  182. }
  183. // stateBeginValue is the state at the beginning of the input.
  184. func stateBeginValue(s *scanner, c byte) int {
  185. if c <= ' ' && isSpace(c) {
  186. return scanSkipSpace
  187. }
  188. switch c {
  189. case '{':
  190. s.step = stateBeginStringOrEmpty
  191. s.pushParseState(parseObjectKey)
  192. return scanBeginObject
  193. case '[':
  194. s.step = stateBeginValueOrEmpty
  195. s.pushParseState(parseArrayValue)
  196. return scanBeginArray
  197. case '"':
  198. s.step = stateInString
  199. return scanBeginLiteral
  200. case '-':
  201. s.step = stateNeg
  202. return scanBeginLiteral
  203. case '0': // beginning of 0.123
  204. s.step = state0
  205. return scanBeginLiteral
  206. case 't': // beginning of true
  207. s.step = stateT
  208. return scanBeginLiteral
  209. case 'f': // beginning of false
  210. s.step = stateF
  211. return scanBeginLiteral
  212. case 'n': // beginning of null
  213. s.step = stateN
  214. return scanBeginLiteral
  215. }
  216. if '1' <= c && c <= '9' { // beginning of 1234.5
  217. s.step = state1
  218. return scanBeginLiteral
  219. }
  220. return s.error(c, "looking for beginning of value")
  221. }
  222. // stateBeginStringOrEmpty is the state after reading `{`.
  223. func stateBeginStringOrEmpty(s *scanner, c byte) int {
  224. if c <= ' ' && isSpace(c) {
  225. return scanSkipSpace
  226. }
  227. if c == '}' {
  228. n := len(s.parseState)
  229. s.parseState[n-1] = parseObjectValue
  230. return stateEndValue(s, c)
  231. }
  232. return stateBeginString(s, c)
  233. }
  234. // stateBeginString is the state after reading `{"key": value,`.
  235. func stateBeginString(s *scanner, c byte) int {
  236. if c <= ' ' && isSpace(c) {
  237. return scanSkipSpace
  238. }
  239. if c == '"' {
  240. s.step = stateInString
  241. return scanBeginLiteral
  242. }
  243. return s.error(c, "looking for beginning of object key string")
  244. }
  245. // stateEndValue is the state after completing a value,
  246. // such as after reading `{}` or `true` or `["x"`.
  247. func stateEndValue(s *scanner, c byte) int {
  248. n := len(s.parseState)
  249. if n == 0 {
  250. // Completed top-level before the current byte.
  251. s.step = stateEndTop
  252. s.endTop = true
  253. return stateEndTop(s, c)
  254. }
  255. if c <= ' ' && isSpace(c) {
  256. s.step = stateEndValue
  257. return scanSkipSpace
  258. }
  259. ps := s.parseState[n-1]
  260. switch ps {
  261. case parseObjectKey:
  262. if c == ':' {
  263. s.parseState[n-1] = parseObjectValue
  264. s.step = stateBeginValue
  265. return scanObjectKey
  266. }
  267. return s.error(c, "after object key")
  268. case parseObjectValue:
  269. if c == ',' {
  270. s.parseState[n-1] = parseObjectKey
  271. s.step = stateBeginString
  272. return scanObjectValue
  273. }
  274. if c == '}' {
  275. s.popParseState()
  276. return scanEndObject
  277. }
  278. return s.error(c, "after object key:value pair")
  279. case parseArrayValue:
  280. if c == ',' {
  281. s.step = stateBeginValue
  282. return scanArrayValue
  283. }
  284. if c == ']' {
  285. s.popParseState()
  286. return scanEndArray
  287. }
  288. return s.error(c, "after array element")
  289. }
  290. return s.error(c, "")
  291. }
  292. // stateEndTop is the state after finishing the top-level value,
  293. // such as after reading `{}` or `[1,2,3]`.
  294. // Only space characters should be seen now.
  295. func stateEndTop(s *scanner, c byte) int {
  296. if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
  297. // Complain about non-space byte on next call.
  298. s.error(c, "after top-level value")
  299. }
  300. return scanEnd
  301. }
  302. // stateInString is the state after reading `"`.
  303. func stateInString(s *scanner, c byte) int {
  304. if c == '"' {
  305. s.step = stateEndValue
  306. return scanContinue
  307. }
  308. if c == '\\' {
  309. s.step = stateInStringEsc
  310. return scanContinue
  311. }
  312. if c < 0x20 {
  313. return s.error(c, "in string literal")
  314. }
  315. return scanContinue
  316. }
  317. // stateInStringEsc is the state after reading `"\` during a quoted string.
  318. func stateInStringEsc(s *scanner, c byte) int {
  319. switch c {
  320. case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
  321. s.step = stateInString
  322. return scanContinue
  323. case 'u':
  324. s.step = stateInStringEscU
  325. return scanContinue
  326. }
  327. return s.error(c, "in string escape code")
  328. }
  329. // stateInStringEscU is the state after reading `"\u` during a quoted string.
  330. func stateInStringEscU(s *scanner, c byte) int {
  331. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  332. s.step = stateInStringEscU1
  333. return scanContinue
  334. }
  335. // numbers
  336. return s.error(c, "in \\u hexadecimal character escape")
  337. }
  338. // stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
  339. func stateInStringEscU1(s *scanner, c byte) int {
  340. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  341. s.step = stateInStringEscU12
  342. return scanContinue
  343. }
  344. // numbers
  345. return s.error(c, "in \\u hexadecimal character escape")
  346. }
  347. // stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
  348. func stateInStringEscU12(s *scanner, c byte) int {
  349. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  350. s.step = stateInStringEscU123
  351. return scanContinue
  352. }
  353. // numbers
  354. return s.error(c, "in \\u hexadecimal character escape")
  355. }
  356. // stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
  357. func stateInStringEscU123(s *scanner, c byte) int {
  358. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  359. s.step = stateInString
  360. return scanContinue
  361. }
  362. // numbers
  363. return s.error(c, "in \\u hexadecimal character escape")
  364. }
  365. // stateNeg is the state after reading `-` during a number.
  366. func stateNeg(s *scanner, c byte) int {
  367. if c == '0' {
  368. s.step = state0
  369. return scanContinue
  370. }
  371. if '1' <= c && c <= '9' {
  372. s.step = state1
  373. return scanContinue
  374. }
  375. return s.error(c, "in numeric literal")
  376. }
  377. // state1 is the state after reading a non-zero integer during a number,
  378. // such as after reading `1` or `100` but not `0`.
  379. func state1(s *scanner, c byte) int {
  380. if '0' <= c && c <= '9' {
  381. s.step = state1
  382. return scanContinue
  383. }
  384. return state0(s, c)
  385. }
  386. // state0 is the state after reading `0` during a number.
  387. func state0(s *scanner, c byte) int {
  388. if c == '.' {
  389. s.step = stateDot
  390. return scanContinue
  391. }
  392. if c == 'e' || c == 'E' {
  393. s.step = stateE
  394. return scanContinue
  395. }
  396. return stateEndValue(s, c)
  397. }
  398. // stateDot is the state after reading the integer and decimal point in a number,
  399. // such as after reading `1.`.
  400. func stateDot(s *scanner, c byte) int {
  401. if '0' <= c && c <= '9' {
  402. s.step = stateDot0
  403. return scanContinue
  404. }
  405. return s.error(c, "after decimal point in numeric literal")
  406. }
  407. // stateDot0 is the state after reading the integer, decimal point, and subsequent
  408. // digits of a number, such as after reading `3.14`.
  409. func stateDot0(s *scanner, c byte) int {
  410. if '0' <= c && c <= '9' {
  411. return scanContinue
  412. }
  413. if c == 'e' || c == 'E' {
  414. s.step = stateE
  415. return scanContinue
  416. }
  417. return stateEndValue(s, c)
  418. }
  419. // stateE is the state after reading the mantissa and e in a number,
  420. // such as after reading `314e` or `0.314e`.
  421. func stateE(s *scanner, c byte) int {
  422. if c == '+' || c == '-' {
  423. s.step = stateESign
  424. return scanContinue
  425. }
  426. return stateESign(s, c)
  427. }
  428. // stateESign is the state after reading the mantissa, e, and sign in a number,
  429. // such as after reading `314e-` or `0.314e+`.
  430. func stateESign(s *scanner, c byte) int {
  431. if '0' <= c && c <= '9' {
  432. s.step = stateE0
  433. return scanContinue
  434. }
  435. return s.error(c, "in exponent of numeric literal")
  436. }
  437. // stateE0 is the state after reading the mantissa, e, optional sign,
  438. // and at least one digit of the exponent in a number,
  439. // such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
  440. func stateE0(s *scanner, c byte) int {
  441. if '0' <= c && c <= '9' {
  442. return scanContinue
  443. }
  444. return stateEndValue(s, c)
  445. }
  446. // stateT is the state after reading `t`.
  447. func stateT(s *scanner, c byte) int {
  448. if c == 'r' {
  449. s.step = stateTr
  450. return scanContinue
  451. }
  452. return s.error(c, "in literal true (expecting 'r')")
  453. }
  454. // stateTr is the state after reading `tr`.
  455. func stateTr(s *scanner, c byte) int {
  456. if c == 'u' {
  457. s.step = stateTru
  458. return scanContinue
  459. }
  460. return s.error(c, "in literal true (expecting 'u')")
  461. }
  462. // stateTru is the state after reading `tru`.
  463. func stateTru(s *scanner, c byte) int {
  464. if c == 'e' {
  465. s.step = stateEndValue
  466. return scanContinue
  467. }
  468. return s.error(c, "in literal true (expecting 'e')")
  469. }
  470. // stateF is the state after reading `f`.
  471. func stateF(s *scanner, c byte) int {
  472. if c == 'a' {
  473. s.step = stateFa
  474. return scanContinue
  475. }
  476. return s.error(c, "in literal false (expecting 'a')")
  477. }
  478. // stateFa is the state after reading `fa`.
  479. func stateFa(s *scanner, c byte) int {
  480. if c == 'l' {
  481. s.step = stateFal
  482. return scanContinue
  483. }
  484. return s.error(c, "in literal false (expecting 'l')")
  485. }
  486. // stateFal is the state after reading `fal`.
  487. func stateFal(s *scanner, c byte) int {
  488. if c == 's' {
  489. s.step = stateFals
  490. return scanContinue
  491. }
  492. return s.error(c, "in literal false (expecting 's')")
  493. }
  494. // stateFals is the state after reading `fals`.
  495. func stateFals(s *scanner, c byte) int {
  496. if c == 'e' {
  497. s.step = stateEndValue
  498. return scanContinue
  499. }
  500. return s.error(c, "in literal false (expecting 'e')")
  501. }
  502. // stateN is the state after reading `n`.
  503. func stateN(s *scanner, c byte) int {
  504. if c == 'u' {
  505. s.step = stateNu
  506. return scanContinue
  507. }
  508. return s.error(c, "in literal null (expecting 'u')")
  509. }
  510. // stateNu is the state after reading `nu`.
  511. func stateNu(s *scanner, c byte) int {
  512. if c == 'l' {
  513. s.step = stateNul
  514. return scanContinue
  515. }
  516. return s.error(c, "in literal null (expecting 'l')")
  517. }
  518. // stateNul is the state after reading `nul`.
  519. func stateNul(s *scanner, c byte) int {
  520. if c == 'l' {
  521. s.step = stateEndValue
  522. return scanContinue
  523. }
  524. return s.error(c, "in literal null (expecting 'l')")
  525. }
  526. // stateError is the state after reaching a syntax error,
  527. // such as after reading `[1}` or `5.1.2`.
  528. func stateError(s *scanner, c byte) int {
  529. return scanError
  530. }
  531. // error records an error and switches to the error state.
  532. func (s *scanner) error(c byte, context string) int {
  533. s.step = stateError
  534. s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
  535. return scanError
  536. }
  537. // quoteChar formats c as a quoted character literal
  538. func quoteChar(c byte) string {
  539. // special cases - different from quoted strings
  540. if c == '\'' {
  541. return `'\''`
  542. }
  543. if c == '"' {
  544. return `'"'`
  545. }
  546. // use quoted string with different quotation marks
  547. s := strconv.Quote(string(c))
  548. return "'" + s[1:len(s)-1] + "'"
  549. }
  550. // undo causes the scanner to return scanCode from the next state transition.
  551. // This gives callers a simple 1-byte undo mechanism.
  552. func (s *scanner) undo(scanCode int) {
  553. if s.redo {
  554. panic("json: invalid use of scanner")
  555. }
  556. s.redoCode = scanCode
  557. s.redoState = s.step
  558. s.step = stateRedo
  559. s.redo = true
  560. }
  561. // stateRedo helps implement the scanner's 1-byte undo.
  562. func stateRedo(s *scanner, c byte) int {
  563. s.redo = false
  564. s.step = s.redoState
  565. return s.redoCode
  566. }