scanner.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698
  1. // Copyright 2010 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package json
  5. // JSON value parser state machine.
  6. // Just about at the limit of what is reasonable to write by hand.
  7. // Some parts are a bit tedious, but overall it nicely factors out the
  8. // otherwise common code from the multiple scanning functions
  9. // in this package (Compact, Indent, checkValid, nextValue, etc).
  10. //
  11. // This file starts with two simple examples using the scanner
  12. // before diving into the scanner itself.
  13. import "strconv"
  14. // checkValid verifies that data is valid JSON-encoded data.
  15. // scan is passed in for use by checkValid to avoid an allocation.
  16. func checkValid(data []byte, scan *scanner) error {
  17. scan.reset()
  18. for _, c := range data {
  19. scan.bytes++
  20. if scan.step(scan, c) == scanError {
  21. return scan.err
  22. }
  23. }
  24. if scan.eof() == scanError {
  25. return scan.err
  26. }
  27. return nil
  28. }
  29. // nextValue splits data after the next whole JSON value,
  30. // returning that value and the bytes that follow it as separate slices.
  31. // scan is passed in for use by nextValue to avoid an allocation.
  32. func nextValue(data []byte, scan *scanner) (value, rest []byte, err error) {
  33. scan.reset()
  34. for i, c := range data {
  35. v := scan.step(scan, c)
  36. if v >= scanEndObject {
  37. switch v {
  38. // probe the scanner with a space to determine whether we will
  39. // get scanEnd on the next character. Otherwise, if the next character
  40. // is not a space, scanEndTop allocates a needless error.
  41. case scanEndObject, scanEndArray, scanEndParams:
  42. if scan.step(scan, ' ') == scanEnd {
  43. return data[:i+1], data[i+1:], nil
  44. }
  45. case scanError:
  46. return nil, nil, scan.err
  47. case scanEnd:
  48. return data[:i], data[i:], nil
  49. }
  50. }
  51. }
  52. if scan.eof() == scanError {
  53. return nil, nil, scan.err
  54. }
  55. return data, nil, nil
  56. }
  57. // A SyntaxError is a description of a JSON syntax error.
  58. type SyntaxError struct {
  59. msg string // description of error
  60. Offset int64 // error occurred after reading Offset bytes
  61. }
  62. func (e *SyntaxError) Error() string { return e.msg }
  63. // A scanner is a JSON scanning state machine.
  64. // Callers call scan.reset() and then pass bytes in one at a time
  65. // by calling scan.step(&scan, c) for each byte.
  66. // The return value, referred to as an opcode, tells the
  67. // caller about significant parsing events like beginning
  68. // and ending literals, objects, and arrays, so that the
  69. // caller can follow along if it wishes.
  70. // The return value scanEnd indicates that a single top-level
  71. // JSON value has been completed, *before* the byte that
  72. // just got passed in. (The indication must be delayed in order
  73. // to recognize the end of numbers: is 123 a whole value or
  74. // the beginning of 12345e+6?).
  75. type scanner struct {
  76. // The step is a func to be called to execute the next transition.
  77. // Also tried using an integer constant and a single func
  78. // with a switch, but using the func directly was 10% faster
  79. // on a 64-bit Mac Mini, and it's nicer to read.
  80. step func(*scanner, byte) int
  81. // Reached end of top-level value.
  82. endTop bool
  83. // Stack of what we're in the middle of - array values, object keys, object values.
  84. parseState []int
  85. // Error that happened, if any.
  86. err error
  87. // 1-byte redo (see undo method)
  88. redo bool
  89. redoCode int
  90. redoState func(*scanner, byte) int
  91. // total bytes consumed, updated by decoder.Decode
  92. bytes int64
  93. }
  94. // These values are returned by the state transition functions
  95. // assigned to scanner.state and the method scanner.eof.
  96. // They give details about the current state of the scan that
  97. // callers might be interested to know about.
  98. // It is okay to ignore the return value of any particular
  99. // call to scanner.state: if one call returns scanError,
  100. // every subsequent call will return scanError too.
  101. const (
  102. // Continue.
  103. scanContinue = iota // uninteresting byte
  104. scanBeginLiteral // end implied by next result != scanContinue
  105. scanBeginObject // begin object
  106. scanObjectKey // just finished object key (string)
  107. scanObjectValue // just finished non-last object value
  108. scanEndObject // end object (implies scanObjectValue if possible)
  109. scanBeginArray // begin array
  110. scanArrayValue // just finished array value
  111. scanEndArray // end array (implies scanArrayValue if possible)
  112. scanBeginName // begin function call
  113. scanParam // begin function argument
  114. scanEndParams // end function call
  115. scanSkipSpace // space byte; can skip; known to be last "continue" result
  116. // Stop.
  117. scanEnd // top-level value ended *before* this byte; known to be first "stop" result
  118. scanError // hit an error, scanner.err.
  119. )
  120. // These values are stored in the parseState stack.
  121. // They give the current state of a composite value
  122. // being scanned. If the parser is inside a nested value
  123. // the parseState describes the nested state, outermost at entry 0.
  124. const (
  125. parseObjectKey = iota // parsing object key (before colon)
  126. parseObjectValue // parsing object value (after colon)
  127. parseArrayValue // parsing array value
  128. parseName // parsing unquoted name
  129. parseParam // parsing function argument value
  130. )
  131. // reset prepares the scanner for use.
  132. // It must be called before calling s.step.
  133. func (s *scanner) reset() {
  134. s.step = stateBeginValue
  135. s.parseState = s.parseState[0:0]
  136. s.err = nil
  137. s.redo = false
  138. s.endTop = false
  139. }
  140. // eof tells the scanner that the end of input has been reached.
  141. // It returns a scan status just as s.step does.
  142. func (s *scanner) eof() int {
  143. if s.err != nil {
  144. return scanError
  145. }
  146. if s.endTop {
  147. return scanEnd
  148. }
  149. s.step(s, ' ')
  150. if s.endTop {
  151. return scanEnd
  152. }
  153. if s.err == nil {
  154. s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
  155. }
  156. return scanError
  157. }
  158. // pushParseState pushes a new parse state p onto the parse stack.
  159. func (s *scanner) pushParseState(p int) {
  160. s.parseState = append(s.parseState, p)
  161. }
  162. // popParseState pops a parse state (already obtained) off the stack
  163. // and updates s.step accordingly.
  164. func (s *scanner) popParseState() {
  165. n := len(s.parseState) - 1
  166. s.parseState = s.parseState[0:n]
  167. s.redo = false
  168. if n == 0 {
  169. s.step = stateEndTop
  170. s.endTop = true
  171. } else {
  172. s.step = stateEndValue
  173. }
  174. }
  175. func isSpace(c byte) bool {
  176. return c == ' ' || c == '\t' || c == '\r' || c == '\n'
  177. }
  178. // stateBeginValueOrEmpty is the state after reading `[`.
  179. func stateBeginValueOrEmpty(s *scanner, c byte) int {
  180. if c <= ' ' && isSpace(c) {
  181. return scanSkipSpace
  182. }
  183. if c == ']' {
  184. return stateEndValue(s, c)
  185. }
  186. return stateBeginValue(s, c)
  187. }
  188. // stateBeginValue is the state at the beginning of the input.
  189. func stateBeginValue(s *scanner, c byte) int {
  190. if c <= ' ' && isSpace(c) {
  191. return scanSkipSpace
  192. }
  193. switch c {
  194. case '{':
  195. s.step = stateBeginStringOrEmpty
  196. s.pushParseState(parseObjectKey)
  197. return scanBeginObject
  198. case '[':
  199. s.step = stateBeginValueOrEmpty
  200. s.pushParseState(parseArrayValue)
  201. return scanBeginArray
  202. case '"':
  203. s.step = stateInString
  204. return scanBeginLiteral
  205. case '-':
  206. s.step = stateNeg
  207. return scanBeginLiteral
  208. case '0': // beginning of 0.123
  209. s.step = state0
  210. return scanBeginLiteral
  211. case 'n':
  212. s.step = stateNew0
  213. return scanBeginName
  214. }
  215. if '1' <= c && c <= '9' { // beginning of 1234.5
  216. s.step = state1
  217. return scanBeginLiteral
  218. }
  219. if isName(c) {
  220. s.step = stateName
  221. return scanBeginName
  222. }
  223. return s.error(c, "looking for beginning of value")
  224. }
  225. func isName(c byte) bool {
  226. return c == '$' || c == '_' || 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9'
  227. }
  228. // stateBeginStringOrEmpty is the state after reading `{`.
  229. func stateBeginStringOrEmpty(s *scanner, c byte) int {
  230. if c <= ' ' && isSpace(c) {
  231. return scanSkipSpace
  232. }
  233. if c == '}' {
  234. n := len(s.parseState)
  235. s.parseState[n-1] = parseObjectValue
  236. return stateEndValue(s, c)
  237. }
  238. return stateBeginString(s, c)
  239. }
  240. // stateBeginString is the state after reading `{"key": value,`.
  241. func stateBeginString(s *scanner, c byte) int {
  242. if c <= ' ' && isSpace(c) {
  243. return scanSkipSpace
  244. }
  245. if c == '"' {
  246. s.step = stateInString
  247. return scanBeginLiteral
  248. }
  249. if isName(c) {
  250. s.step = stateName
  251. return scanBeginName
  252. }
  253. return s.error(c, "looking for beginning of object key string")
  254. }
  255. // stateEndValue is the state after completing a value,
  256. // such as after reading `{}` or `true` or `["x"`.
  257. func stateEndValue(s *scanner, c byte) int {
  258. n := len(s.parseState)
  259. if n == 0 {
  260. // Completed top-level before the current byte.
  261. s.step = stateEndTop
  262. s.endTop = true
  263. return stateEndTop(s, c)
  264. }
  265. if c <= ' ' && isSpace(c) {
  266. s.step = stateEndValue
  267. return scanSkipSpace
  268. }
  269. ps := s.parseState[n-1]
  270. switch ps {
  271. case parseObjectKey:
  272. if c == ':' {
  273. s.parseState[n-1] = parseObjectValue
  274. s.step = stateBeginValue
  275. return scanObjectKey
  276. }
  277. return s.error(c, "after object key")
  278. case parseObjectValue:
  279. if c == ',' {
  280. s.parseState[n-1] = parseObjectKey
  281. s.step = stateBeginStringOrEmpty
  282. return scanObjectValue
  283. }
  284. if c == '}' {
  285. s.popParseState()
  286. return scanEndObject
  287. }
  288. return s.error(c, "after object key:value pair")
  289. case parseArrayValue:
  290. if c == ',' {
  291. s.step = stateBeginValueOrEmpty
  292. return scanArrayValue
  293. }
  294. if c == ']' {
  295. s.popParseState()
  296. return scanEndArray
  297. }
  298. return s.error(c, "after array element")
  299. case parseParam:
  300. if c == ',' {
  301. s.step = stateBeginValue
  302. return scanParam
  303. }
  304. if c == ')' {
  305. s.popParseState()
  306. return scanEndParams
  307. }
  308. return s.error(c, "after array element")
  309. }
  310. return s.error(c, "")
  311. }
  312. // stateEndTop is the state after finishing the top-level value,
  313. // such as after reading `{}` or `[1,2,3]`.
  314. // Only space characters should be seen now.
  315. func stateEndTop(s *scanner, c byte) int {
  316. if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
  317. // Complain about non-space byte on next call.
  318. s.error(c, "after top-level value")
  319. }
  320. return scanEnd
  321. }
  322. // stateInString is the state after reading `"`.
  323. func stateInString(s *scanner, c byte) int {
  324. if c == '"' {
  325. s.step = stateEndValue
  326. return scanContinue
  327. }
  328. if c == '\\' {
  329. s.step = stateInStringEsc
  330. return scanContinue
  331. }
  332. if c < 0x20 {
  333. return s.error(c, "in string literal")
  334. }
  335. return scanContinue
  336. }
  337. // stateInStringEsc is the state after reading `"\` during a quoted string.
  338. func stateInStringEsc(s *scanner, c byte) int {
  339. switch c {
  340. case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
  341. s.step = stateInString
  342. return scanContinue
  343. case 'u':
  344. s.step = stateInStringEscU
  345. return scanContinue
  346. }
  347. return s.error(c, "in string escape code")
  348. }
  349. // stateInStringEscU is the state after reading `"\u` during a quoted string.
  350. func stateInStringEscU(s *scanner, c byte) int {
  351. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  352. s.step = stateInStringEscU1
  353. return scanContinue
  354. }
  355. // numbers
  356. return s.error(c, "in \\u hexadecimal character escape")
  357. }
  358. // stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
  359. func stateInStringEscU1(s *scanner, c byte) int {
  360. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  361. s.step = stateInStringEscU12
  362. return scanContinue
  363. }
  364. // numbers
  365. return s.error(c, "in \\u hexadecimal character escape")
  366. }
  367. // stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
  368. func stateInStringEscU12(s *scanner, c byte) int {
  369. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  370. s.step = stateInStringEscU123
  371. return scanContinue
  372. }
  373. // numbers
  374. return s.error(c, "in \\u hexadecimal character escape")
  375. }
  376. // stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
  377. func stateInStringEscU123(s *scanner, c byte) int {
  378. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  379. s.step = stateInString
  380. return scanContinue
  381. }
  382. // numbers
  383. return s.error(c, "in \\u hexadecimal character escape")
  384. }
  385. // stateNeg is the state after reading `-` during a number.
  386. func stateNeg(s *scanner, c byte) int {
  387. if c == '0' {
  388. s.step = state0
  389. return scanContinue
  390. }
  391. if '1' <= c && c <= '9' {
  392. s.step = state1
  393. return scanContinue
  394. }
  395. return s.error(c, "in numeric literal")
  396. }
  397. // state1 is the state after reading a non-zero integer during a number,
  398. // such as after reading `1` or `100` but not `0`.
  399. func state1(s *scanner, c byte) int {
  400. if '0' <= c && c <= '9' {
  401. s.step = state1
  402. return scanContinue
  403. }
  404. return state0(s, c)
  405. }
  406. // state0 is the state after reading `0` during a number.
  407. func state0(s *scanner, c byte) int {
  408. if c == '.' {
  409. s.step = stateDot
  410. return scanContinue
  411. }
  412. if c == 'e' || c == 'E' {
  413. s.step = stateE
  414. return scanContinue
  415. }
  416. return stateEndValue(s, c)
  417. }
  418. // stateDot is the state after reading the integer and decimal point in a number,
  419. // such as after reading `1.`.
  420. func stateDot(s *scanner, c byte) int {
  421. if '0' <= c && c <= '9' {
  422. s.step = stateDot0
  423. return scanContinue
  424. }
  425. return s.error(c, "after decimal point in numeric literal")
  426. }
  427. // stateDot0 is the state after reading the integer, decimal point, and subsequent
  428. // digits of a number, such as after reading `3.14`.
  429. func stateDot0(s *scanner, c byte) int {
  430. if '0' <= c && c <= '9' {
  431. return scanContinue
  432. }
  433. if c == 'e' || c == 'E' {
  434. s.step = stateE
  435. return scanContinue
  436. }
  437. return stateEndValue(s, c)
  438. }
  439. // stateE is the state after reading the mantissa and e in a number,
  440. // such as after reading `314e` or `0.314e`.
  441. func stateE(s *scanner, c byte) int {
  442. if c == '+' || c == '-' {
  443. s.step = stateESign
  444. return scanContinue
  445. }
  446. return stateESign(s, c)
  447. }
  448. // stateESign is the state after reading the mantissa, e, and sign in a number,
  449. // such as after reading `314e-` or `0.314e+`.
  450. func stateESign(s *scanner, c byte) int {
  451. if '0' <= c && c <= '9' {
  452. s.step = stateE0
  453. return scanContinue
  454. }
  455. return s.error(c, "in exponent of numeric literal")
  456. }
  457. // stateE0 is the state after reading the mantissa, e, optional sign,
  458. // and at least one digit of the exponent in a number,
  459. // such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
  460. func stateE0(s *scanner, c byte) int {
  461. if '0' <= c && c <= '9' {
  462. return scanContinue
  463. }
  464. return stateEndValue(s, c)
  465. }
  466. // stateNew0 is the state after reading `n`.
  467. func stateNew0(s *scanner, c byte) int {
  468. if c == 'e' {
  469. s.step = stateNew1
  470. return scanContinue
  471. }
  472. s.step = stateName
  473. return stateName(s, c)
  474. }
  475. // stateNew1 is the state after reading `ne`.
  476. func stateNew1(s *scanner, c byte) int {
  477. if c == 'w' {
  478. s.step = stateNew2
  479. return scanContinue
  480. }
  481. s.step = stateName
  482. return stateName(s, c)
  483. }
  484. // stateNew2 is the state after reading `new`.
  485. func stateNew2(s *scanner, c byte) int {
  486. s.step = stateName
  487. if c == ' ' {
  488. return scanContinue
  489. }
  490. return stateName(s, c)
  491. }
  492. // stateName is the state while reading an unquoted function name.
  493. func stateName(s *scanner, c byte) int {
  494. if isName(c) {
  495. return scanContinue
  496. }
  497. if c == '(' {
  498. s.step = stateParamOrEmpty
  499. s.pushParseState(parseParam)
  500. return scanParam
  501. }
  502. return stateEndValue(s, c)
  503. }
  504. // stateParamOrEmpty is the state after reading `(`.
  505. func stateParamOrEmpty(s *scanner, c byte) int {
  506. if c <= ' ' && isSpace(c) {
  507. return scanSkipSpace
  508. }
  509. if c == ')' {
  510. return stateEndValue(s, c)
  511. }
  512. return stateBeginValue(s, c)
  513. }
  514. // stateT is the state after reading `t`.
  515. func stateT(s *scanner, c byte) int {
  516. if c == 'r' {
  517. s.step = stateTr
  518. return scanContinue
  519. }
  520. return s.error(c, "in literal true (expecting 'r')")
  521. }
  522. // stateTr is the state after reading `tr`.
  523. func stateTr(s *scanner, c byte) int {
  524. if c == 'u' {
  525. s.step = stateTru
  526. return scanContinue
  527. }
  528. return s.error(c, "in literal true (expecting 'u')")
  529. }
  530. // stateTru is the state after reading `tru`.
  531. func stateTru(s *scanner, c byte) int {
  532. if c == 'e' {
  533. s.step = stateEndValue
  534. return scanContinue
  535. }
  536. return s.error(c, "in literal true (expecting 'e')")
  537. }
  538. // stateF is the state after reading `f`.
  539. func stateF(s *scanner, c byte) int {
  540. if c == 'a' {
  541. s.step = stateFa
  542. return scanContinue
  543. }
  544. return s.error(c, "in literal false (expecting 'a')")
  545. }
  546. // stateFa is the state after reading `fa`.
  547. func stateFa(s *scanner, c byte) int {
  548. if c == 'l' {
  549. s.step = stateFal
  550. return scanContinue
  551. }
  552. return s.error(c, "in literal false (expecting 'l')")
  553. }
  554. // stateFal is the state after reading `fal`.
  555. func stateFal(s *scanner, c byte) int {
  556. if c == 's' {
  557. s.step = stateFals
  558. return scanContinue
  559. }
  560. return s.error(c, "in literal false (expecting 's')")
  561. }
  562. // stateFals is the state after reading `fals`.
  563. func stateFals(s *scanner, c byte) int {
  564. if c == 'e' {
  565. s.step = stateEndValue
  566. return scanContinue
  567. }
  568. return s.error(c, "in literal false (expecting 'e')")
  569. }
  570. // stateN is the state after reading `n`.
  571. func stateN(s *scanner, c byte) int {
  572. if c == 'u' {
  573. s.step = stateNu
  574. return scanContinue
  575. }
  576. return s.error(c, "in literal null (expecting 'u')")
  577. }
  578. // stateNu is the state after reading `nu`.
  579. func stateNu(s *scanner, c byte) int {
  580. if c == 'l' {
  581. s.step = stateNul
  582. return scanContinue
  583. }
  584. return s.error(c, "in literal null (expecting 'l')")
  585. }
  586. // stateNul is the state after reading `nul`.
  587. func stateNul(s *scanner, c byte) int {
  588. if c == 'l' {
  589. s.step = stateEndValue
  590. return scanContinue
  591. }
  592. return s.error(c, "in literal null (expecting 'l')")
  593. }
  594. // stateError is the state after reaching a syntax error,
  595. // such as after reading `[1}` or `5.1.2`.
  596. func stateError(s *scanner, c byte) int {
  597. return scanError
  598. }
  599. // error records an error and switches to the error state.
  600. func (s *scanner) error(c byte, context string) int {
  601. s.step = stateError
  602. s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
  603. return scanError
  604. }
  605. // quoteChar formats c as a quoted character literal
  606. func quoteChar(c byte) string {
  607. // special cases - different from quoted strings
  608. if c == '\'' {
  609. return `'\''`
  610. }
  611. if c == '"' {
  612. return `'"'`
  613. }
  614. // use quoted string with different quotation marks
  615. s := strconv.Quote(string(c))
  616. return "'" + s[1:len(s)-1] + "'"
  617. }
  618. // undo causes the scanner to return scanCode from the next state transition.
  619. // This gives callers a simple 1-byte undo mechanism.
  620. func (s *scanner) undo(scanCode int) {
  621. if s.redo {
  622. panic("json: invalid use of scanner")
  623. }
  624. s.redoCode = scanCode
  625. s.redoState = s.step
  626. s.step = stateRedo
  627. s.redo = true
  628. }
  629. // stateRedo helps implement the scanner's 1-byte undo.
  630. func stateRedo(s *scanner, c byte) int {
  631. s.redo = false
  632. s.step = s.redoState
  633. return s.redoCode
  634. }