stringreplacer.go 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package misspell
  5. import (
  6. "io"
  7. // "log"
  8. "strings"
  9. )
  10. // StringReplacer replaces a list of strings with replacements.
  11. // It is safe for concurrent use by multiple goroutines.
  12. type StringReplacer struct {
  13. r replacer
  14. }
  15. // replacer is the interface that a replacement algorithm needs to implement.
  16. type replacer interface {
  17. Replace(s string) string
  18. WriteString(w io.Writer, s string) (n int, err error)
  19. }
  20. // NewStringReplacer returns a new Replacer from a list of old, new string pairs.
  21. // Replacements are performed in order, without overlapping matches.
  22. func NewStringReplacer(oldnew ...string) *StringReplacer {
  23. if len(oldnew)%2 == 1 {
  24. panic("strings.NewReplacer: odd argument count")
  25. }
  26. return &StringReplacer{r: makeGenericReplacer(oldnew)}
  27. }
  28. // Replace returns a copy of s with all replacements performed.
  29. func (r *StringReplacer) Replace(s string) string {
  30. return r.r.Replace(s)
  31. }
  32. // WriteString writes s to w with all replacements performed.
  33. func (r *StringReplacer) WriteString(w io.Writer, s string) (n int, err error) {
  34. return r.r.WriteString(w, s)
  35. }
  36. // trieNode is a node in a lookup trie for prioritized key/value pairs. Keys
  37. // and values may be empty. For example, the trie containing keys "ax", "ay",
  38. // "bcbc", "x" and "xy" could have eight nodes:
  39. //
  40. // n0 -
  41. // n1 a-
  42. // n2 .x+
  43. // n3 .y+
  44. // n4 b-
  45. // n5 .cbc+
  46. // n6 x+
  47. // n7 .y+
  48. //
  49. // n0 is the root node, and its children are n1, n4 and n6; n1's children are
  50. // n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked
  51. // with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7
  52. // (marked with a trailing "+") are complete keys.
  53. type trieNode struct {
  54. // value is the value of the trie node's key/value pair. It is empty if
  55. // this node is not a complete key.
  56. value string
  57. // priority is the priority (higher is more important) of the trie node's
  58. // key/value pair; keys are not necessarily matched shortest- or longest-
  59. // first. Priority is positive if this node is a complete key, and zero
  60. // otherwise. In the example above, positive/zero priorities are marked
  61. // with a trailing "+" or "-".
  62. priority int
  63. // A trie node may have zero, one or more child nodes:
  64. // * if the remaining fields are zero, there are no children.
  65. // * if prefix and next are non-zero, there is one child in next.
  66. // * if table is non-zero, it defines all the children.
  67. //
  68. // Prefixes are preferred over tables when there is one child, but the
  69. // root node always uses a table for lookup efficiency.
  70. // prefix is the difference in keys between this trie node and the next.
  71. // In the example above, node n4 has prefix "cbc" and n4's next node is n5.
  72. // Node n5 has no children and so has zero prefix, next and table fields.
  73. prefix string
  74. next *trieNode
  75. // table is a lookup table indexed by the next byte in the key, after
  76. // remapping that byte through genericReplacer.mapping to create a dense
  77. // index. In the example above, the keys only use 'a', 'b', 'c', 'x' and
  78. // 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and
  79. // genericReplacer.tableSize will be 5. Node n0's table will be
  80. // []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remapped
  81. // 'a', 'b' and 'x'.
  82. table []*trieNode
  83. }
  84. func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
  85. if key == "" {
  86. if t.priority == 0 {
  87. t.value = val
  88. t.priority = priority
  89. }
  90. return
  91. }
  92. if t.prefix != "" {
  93. // Need to split the prefix among multiple nodes.
  94. var n int // length of the longest common prefix
  95. for ; n < len(t.prefix) && n < len(key); n++ {
  96. if t.prefix[n] != key[n] {
  97. break
  98. }
  99. }
  100. if n == len(t.prefix) {
  101. t.next.add(key[n:], val, priority, r)
  102. } else if n == 0 {
  103. // First byte differs, start a new lookup table here. Looking up
  104. // what is currently t.prefix[0] will lead to prefixNode, and
  105. // looking up key[0] will lead to keyNode.
  106. var prefixNode *trieNode
  107. if len(t.prefix) == 1 {
  108. prefixNode = t.next
  109. } else {
  110. prefixNode = &trieNode{
  111. prefix: t.prefix[1:],
  112. next: t.next,
  113. }
  114. }
  115. keyNode := new(trieNode)
  116. t.table = make([]*trieNode, r.tableSize)
  117. t.table[r.mapping[t.prefix[0]]] = prefixNode
  118. t.table[r.mapping[key[0]]] = keyNode
  119. t.prefix = ""
  120. t.next = nil
  121. keyNode.add(key[1:], val, priority, r)
  122. } else {
  123. // Insert new node after the common section of the prefix.
  124. next := &trieNode{
  125. prefix: t.prefix[n:],
  126. next: t.next,
  127. }
  128. t.prefix = t.prefix[:n]
  129. t.next = next
  130. next.add(key[n:], val, priority, r)
  131. }
  132. } else if t.table != nil {
  133. // Insert into existing table.
  134. m := r.mapping[key[0]]
  135. if t.table[m] == nil {
  136. t.table[m] = new(trieNode)
  137. }
  138. t.table[m].add(key[1:], val, priority, r)
  139. } else {
  140. t.prefix = key
  141. t.next = new(trieNode)
  142. t.next.add("", val, priority, r)
  143. }
  144. }
  145. func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
  146. // Iterate down the trie to the end, and grab the value and keylen with
  147. // the highest priority.
  148. bestPriority := 0
  149. node := &r.root
  150. n := 0
  151. for node != nil {
  152. if node.priority > bestPriority && !(ignoreRoot && node == &r.root) {
  153. bestPriority = node.priority
  154. val = node.value
  155. keylen = n
  156. found = true
  157. }
  158. if s == "" {
  159. break
  160. }
  161. if node.table != nil {
  162. index := r.mapping[ByteToLower(s[0])]
  163. if int(index) == r.tableSize {
  164. break
  165. }
  166. node = node.table[index]
  167. s = s[1:]
  168. n++
  169. } else if node.prefix != "" && StringHasPrefixFold(s, node.prefix) {
  170. n += len(node.prefix)
  171. s = s[len(node.prefix):]
  172. node = node.next
  173. } else {
  174. break
  175. }
  176. }
  177. return
  178. }
  179. // genericReplacer is the fully generic algorithm.
  180. // It's used as a fallback when nothing faster can be used.
  181. type genericReplacer struct {
  182. root trieNode
  183. // tableSize is the size of a trie node's lookup table. It is the number
  184. // of unique key bytes.
  185. tableSize int
  186. // mapping maps from key bytes to a dense index for trieNode.table.
  187. mapping [256]byte
  188. }
  189. func makeGenericReplacer(oldnew []string) *genericReplacer {
  190. r := new(genericReplacer)
  191. // Find each byte used, then assign them each an index.
  192. for i := 0; i < len(oldnew); i += 2 {
  193. key := strings.ToLower(oldnew[i])
  194. for j := 0; j < len(key); j++ {
  195. r.mapping[key[j]] = 1
  196. }
  197. }
  198. for _, b := range r.mapping {
  199. r.tableSize += int(b)
  200. }
  201. var index byte
  202. for i, b := range r.mapping {
  203. if b == 0 {
  204. r.mapping[i] = byte(r.tableSize)
  205. } else {
  206. r.mapping[i] = index
  207. index++
  208. }
  209. }
  210. // Ensure root node uses a lookup table (for performance).
  211. r.root.table = make([]*trieNode, r.tableSize)
  212. for i := 0; i < len(oldnew); i += 2 {
  213. r.root.add(strings.ToLower(oldnew[i]), oldnew[i+1], len(oldnew)-i, r)
  214. }
  215. return r
  216. }
  217. type appendSliceWriter []byte
  218. // Write writes to the buffer to satisfy io.Writer.
  219. func (w *appendSliceWriter) Write(p []byte) (int, error) {
  220. *w = append(*w, p...)
  221. return len(p), nil
  222. }
  223. // WriteString writes to the buffer without string->[]byte->string allocations.
  224. func (w *appendSliceWriter) WriteString(s string) (int, error) {
  225. *w = append(*w, s...)
  226. return len(s), nil
  227. }
  228. type stringWriterIface interface {
  229. WriteString(string) (int, error)
  230. }
  231. type stringWriter struct {
  232. w io.Writer
  233. }
  234. func (w stringWriter) WriteString(s string) (int, error) {
  235. return w.w.Write([]byte(s))
  236. }
  237. func getStringWriter(w io.Writer) stringWriterIface {
  238. sw, ok := w.(stringWriterIface)
  239. if !ok {
  240. sw = stringWriter{w}
  241. }
  242. return sw
  243. }
  244. func (r *genericReplacer) Replace(s string) string {
  245. buf := make(appendSliceWriter, 0, len(s))
  246. r.WriteString(&buf, s)
  247. return string(buf)
  248. }
  249. func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
  250. sw := getStringWriter(w)
  251. var last, wn int
  252. var prevMatchEmpty bool
  253. for i := 0; i <= len(s); {
  254. // Fast path: s[i] is not a prefix of any pattern.
  255. if i != len(s) && r.root.priority == 0 {
  256. index := int(r.mapping[ByteToLower(s[i])])
  257. if index == r.tableSize || r.root.table[index] == nil {
  258. i++
  259. continue
  260. }
  261. }
  262. // Ignore the empty match iff the previous loop found the empty match.
  263. val, keylen, match := r.lookup(s[i:], prevMatchEmpty)
  264. prevMatchEmpty = match && keylen == 0
  265. if match {
  266. orig := s[i : i+keylen]
  267. switch CaseStyle(orig) {
  268. case CaseUnknown:
  269. // pretend we didn't match
  270. // i++
  271. // continue
  272. case CaseUpper:
  273. val = strings.ToUpper(val)
  274. case CaseLower:
  275. val = strings.ToLower(val)
  276. case CaseTitle:
  277. if len(val) < 2 {
  278. val = strings.ToUpper(val)
  279. } else {
  280. val = strings.ToUpper(val[:1]) + strings.ToLower(val[1:])
  281. }
  282. }
  283. wn, err = sw.WriteString(s[last:i])
  284. n += wn
  285. if err != nil {
  286. return
  287. }
  288. //log.Printf("%d: Going to correct %q with %q", i, s[i:i+keylen], val)
  289. wn, err = sw.WriteString(val)
  290. n += wn
  291. if err != nil {
  292. return
  293. }
  294. i += keylen
  295. last = i
  296. continue
  297. }
  298. i++
  299. }
  300. if last != len(s) {
  301. wn, err = sw.WriteString(s[last:])
  302. n += wn
  303. }
  304. return
  305. }