lex.go 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410
  1. // Copyright 2016 Frank Schroeder. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //
// Parts of the lexer are from the text/template/parse package.
  6. // For these parts the following applies:
  7. //
  8. // Copyright 2011 The Go Authors. All rights reserved.
  9. // Use of this source code is governed by a BSD-style
  10. // license that can be found in the LICENSE file of the go 1.2
  11. // distribution.
  12. package properties
  13. import (
  14. "fmt"
  15. "strconv"
  16. "strings"
  17. "unicode/utf8"
  18. )
  19. // item represents a token or text string returned from the scanner.
  20. type item struct {
  21. typ itemType // The type of this item.
  22. pos int // The starting position, in bytes, of this item in the input string.
  23. val string // The value of this item.
  24. }
  25. func (i item) String() string {
  26. switch {
  27. case i.typ == itemEOF:
  28. return "EOF"
  29. case i.typ == itemError:
  30. return i.val
  31. case len(i.val) > 10:
  32. return fmt.Sprintf("%.10q...", i.val)
  33. }
  34. return fmt.Sprintf("%q", i.val)
  35. }
  36. // itemType identifies the type of lex items.
  37. type itemType int
  38. const (
  39. itemError itemType = iota // error occurred; value is text of error
  40. itemEOF
  41. itemKey // a key
  42. itemValue // a value
  43. itemComment // a comment
  44. )
  45. // defines a constant for EOF
  46. const eof = -1
  47. // permitted whitespace characters space, FF and TAB
  48. const whitespace = " \f\t"
  49. // stateFn represents the state of the scanner as a function that returns the next state.
  50. type stateFn func(*lexer) stateFn
  51. // lexer holds the state of the scanner.
  52. type lexer struct {
  53. input string // the string being scanned
  54. state stateFn // the next lexing function to enter
  55. pos int // current position in the input
  56. start int // start position of this item
  57. width int // width of last rune read from input
  58. lastPos int // position of most recent item returned by nextItem
  59. runes []rune // scanned runes for this item
  60. items chan item // channel of scanned items
  61. }
  62. // next returns the next rune in the input.
  63. func (l *lexer) next() rune {
  64. if int(l.pos) >= len(l.input) {
  65. l.width = 0
  66. return eof
  67. }
  68. r, w := utf8.DecodeRuneInString(l.input[l.pos:])
  69. l.width = w
  70. l.pos += l.width
  71. return r
  72. }
  73. // peek returns but does not consume the next rune in the input.
  74. func (l *lexer) peek() rune {
  75. r := l.next()
  76. l.backup()
  77. return r
  78. }
  79. // backup steps back one rune. Can only be called once per call of next.
  80. func (l *lexer) backup() {
  81. l.pos -= l.width
  82. }
  83. // emit passes an item back to the client.
  84. func (l *lexer) emit(t itemType) {
  85. item := item{t, l.start, string(l.runes)}
  86. l.items <- item
  87. l.start = l.pos
  88. l.runes = l.runes[:0]
  89. }
  90. // ignore skips over the pending input before this point.
  91. func (l *lexer) ignore() {
  92. l.start = l.pos
  93. }
  94. // appends the rune to the current value
  95. func (l *lexer) appendRune(r rune) {
  96. l.runes = append(l.runes, r)
  97. }
  98. // accept consumes the next rune if it's from the valid set.
  99. func (l *lexer) accept(valid string) bool {
  100. if strings.IndexRune(valid, l.next()) >= 0 {
  101. return true
  102. }
  103. l.backup()
  104. return false
  105. }
  106. // acceptRun consumes a run of runes from the valid set.
  107. func (l *lexer) acceptRun(valid string) {
  108. for strings.IndexRune(valid, l.next()) >= 0 {
  109. }
  110. l.backup()
  111. }
  112. // acceptRunUntil consumes a run of runes up to a terminator.
  113. func (l *lexer) acceptRunUntil(term rune) {
  114. for term != l.next() {
  115. }
  116. l.backup()
  117. }
  118. // hasText returns true if the current parsed text is not empty.
  119. func (l *lexer) isNotEmpty() bool {
  120. return l.pos > l.start
  121. }
  122. // lineNumber reports which line we're on, based on the position of
  123. // the previous item returned by nextItem. Doing it this way
  124. // means we don't have to worry about peek double counting.
  125. func (l *lexer) lineNumber() int {
  126. return 1 + strings.Count(l.input[:l.lastPos], "\n")
  127. }
  128. // errorf returns an error token and terminates the scan by passing
  129. // back a nil pointer that will be the next state, terminating l.nextItem.
  130. func (l *lexer) errorf(format string, args ...interface{}) stateFn {
  131. l.items <- item{itemError, l.start, fmt.Sprintf(format, args...)}
  132. return nil
  133. }
  134. // nextItem returns the next item from the input.
  135. func (l *lexer) nextItem() item {
  136. item := <-l.items
  137. l.lastPos = item.pos
  138. return item
  139. }
  140. // lex creates a new scanner for the input string.
  141. func lex(input string) *lexer {
  142. l := &lexer{
  143. input: input,
  144. items: make(chan item),
  145. runes: make([]rune, 0, 32),
  146. }
  147. go l.run()
  148. return l
  149. }
  150. // run runs the state machine for the lexer.
  151. func (l *lexer) run() {
  152. for l.state = lexBeforeKey(l); l.state != nil; {
  153. l.state = l.state(l)
  154. }
  155. }
  156. // state functions
  157. // lexBeforeKey scans until a key begins.
  158. func lexBeforeKey(l *lexer) stateFn {
  159. switch r := l.next(); {
  160. case isEOF(r):
  161. l.emit(itemEOF)
  162. return nil
  163. case isEOL(r):
  164. l.ignore()
  165. return lexBeforeKey
  166. case isComment(r):
  167. return lexComment
  168. case isWhitespace(r):
  169. l.acceptRun(whitespace)
  170. l.ignore()
  171. return lexKey
  172. default:
  173. l.backup()
  174. return lexKey
  175. }
  176. }
  177. // lexComment scans a comment line. The comment character has already been scanned.
  178. func lexComment(l *lexer) stateFn {
  179. l.acceptRun(whitespace)
  180. l.ignore()
  181. for {
  182. switch r := l.next(); {
  183. case isEOF(r):
  184. l.ignore()
  185. l.emit(itemEOF)
  186. return nil
  187. case isEOL(r):
  188. l.emit(itemComment)
  189. return lexBeforeKey
  190. default:
  191. l.appendRune(r)
  192. }
  193. }
  194. }
  195. // lexKey scans the key up to a delimiter
  196. func lexKey(l *lexer) stateFn {
  197. var r rune
  198. Loop:
  199. for {
  200. switch r = l.next(); {
  201. case isEscape(r):
  202. err := l.scanEscapeSequence()
  203. if err != nil {
  204. return l.errorf(err.Error())
  205. }
  206. case isEndOfKey(r):
  207. l.backup()
  208. break Loop
  209. case isEOF(r):
  210. break Loop
  211. default:
  212. l.appendRune(r)
  213. }
  214. }
  215. if len(l.runes) > 0 {
  216. l.emit(itemKey)
  217. }
  218. if isEOF(r) {
  219. l.emit(itemEOF)
  220. return nil
  221. }
  222. return lexBeforeValue
  223. }
  224. // lexBeforeValue scans the delimiter between key and value.
  225. // Leading and trailing whitespace is ignored.
  226. // We expect to be just after the key.
  227. func lexBeforeValue(l *lexer) stateFn {
  228. l.acceptRun(whitespace)
  229. l.accept(":=")
  230. l.acceptRun(whitespace)
  231. l.ignore()
  232. return lexValue
  233. }
  234. // lexValue scans text until the end of the line. We expect to be just after the delimiter.
  235. func lexValue(l *lexer) stateFn {
  236. for {
  237. switch r := l.next(); {
  238. case isEscape(r):
  239. r := l.peek()
  240. if isEOL(r) {
  241. l.next()
  242. l.acceptRun(whitespace)
  243. } else {
  244. err := l.scanEscapeSequence()
  245. if err != nil {
  246. return l.errorf(err.Error())
  247. }
  248. }
  249. case isEOL(r):
  250. l.emit(itemValue)
  251. l.ignore()
  252. return lexBeforeKey
  253. case isEOF(r):
  254. l.emit(itemValue)
  255. l.emit(itemEOF)
  256. return nil
  257. default:
  258. l.appendRune(r)
  259. }
  260. }
  261. }
  262. // scanEscapeSequence scans either one of the escaped characters
  263. // or a unicode literal. We expect to be after the escape character.
  264. func (l *lexer) scanEscapeSequence() error {
  265. switch r := l.next(); {
  266. case isEscapedCharacter(r):
  267. l.appendRune(decodeEscapedCharacter(r))
  268. return nil
  269. case atUnicodeLiteral(r):
  270. return l.scanUnicodeLiteral()
  271. case isEOF(r):
  272. return fmt.Errorf("premature EOF")
  273. // silently drop the escape character and append the rune as is
  274. default:
  275. l.appendRune(r)
  276. return nil
  277. }
  278. }
  279. // scans a unicode literal in the form \uXXXX. We expect to be after the \u.
  280. func (l *lexer) scanUnicodeLiteral() error {
  281. // scan the digits
  282. d := make([]rune, 4)
  283. for i := 0; i < 4; i++ {
  284. d[i] = l.next()
  285. if d[i] == eof || !strings.ContainsRune("0123456789abcdefABCDEF", d[i]) {
  286. return fmt.Errorf("invalid unicode literal")
  287. }
  288. }
  289. // decode the digits into a rune
  290. r, err := strconv.ParseInt(string(d), 16, 0)
  291. if err != nil {
  292. return err
  293. }
  294. l.appendRune(rune(r))
  295. return nil
  296. }
  297. // decodeEscapedCharacter returns the unescaped rune. We expect to be after the escape character.
  298. func decodeEscapedCharacter(r rune) rune {
  299. switch r {
  300. case 'f':
  301. return '\f'
  302. case 'n':
  303. return '\n'
  304. case 'r':
  305. return '\r'
  306. case 't':
  307. return '\t'
  308. default:
  309. return r
  310. }
  311. }
  312. // atUnicodeLiteral reports whether we are at a unicode literal.
  313. // The escape character has already been consumed.
  314. func atUnicodeLiteral(r rune) bool {
  315. return r == 'u'
  316. }
  317. // isComment reports whether we are at the start of a comment.
  318. func isComment(r rune) bool {
  319. return r == '#' || r == '!'
  320. }
  321. // isEndOfKey reports whether the rune terminates the current key.
  322. func isEndOfKey(r rune) bool {
  323. return strings.ContainsRune(" \f\t\r\n:=", r)
  324. }
  325. // isEOF reports whether we are at EOF.
  326. func isEOF(r rune) bool {
  327. return r == eof
  328. }
  329. // isEOL reports whether we are at a new line character.
  330. func isEOL(r rune) bool {
  331. return r == '\n' || r == '\r'
  332. }
  333. // isEscape reports whether the rune is the escape character which
  334. // prefixes unicode literals and other escaped characters.
  335. func isEscape(r rune) bool {
  336. return r == '\\'
  337. }
  338. // isEscapedCharacter reports whether we are at one of the characters that need escaping.
  339. // The escape character has already been consumed.
  340. func isEscapedCharacter(r rune) bool {
  341. return strings.ContainsRune(" :=fnrt", r)
  342. }
  343. // isWhitespace reports whether the rune is a whitespace character.
  344. func isWhitespace(r rune) bool {
  345. return strings.ContainsRune(whitespace, r)
  346. }