replace.go 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. package misspell
  2. import (
  3. "bufio"
  4. "bytes"
  5. "io"
  6. "regexp"
  7. "strings"
  8. "text/scanner"
  9. )
  10. func max(x, y int) int {
  11. if x > y {
  12. return x
  13. }
  14. return y
  15. }
  16. func inArray(haystack []string, needle string) bool {
  17. for _, word := range haystack {
  18. if needle == word {
  19. return true
  20. }
  21. }
  22. return false
  23. }
  24. var wordRegexp = regexp.MustCompile(`[a-zA-Z0-9']+`)
  25. // Diff is datastructure showing what changed in a single line
  26. type Diff struct {
  27. Filename string
  28. FullLine string
  29. Line int
  30. Column int
  31. Original string
  32. Corrected string
  33. }
  34. // Replacer is the main struct for spelling correction
  35. type Replacer struct {
  36. Replacements []string
  37. Debug bool
  38. engine *StringReplacer
  39. corrected map[string]string
  40. }
  41. // New creates a new default Replacer using the main rule list
  42. func New() *Replacer {
  43. r := Replacer{
  44. Replacements: DictMain,
  45. }
  46. r.Compile()
  47. return &r
  48. }
  49. // RemoveRule deletes existings rules.
  50. // TODO: make inplace to save memory
  51. func (r *Replacer) RemoveRule(ignore []string) {
  52. newwords := make([]string, 0, len(r.Replacements))
  53. for i := 0; i < len(r.Replacements); i += 2 {
  54. if inArray(ignore, r.Replacements[i]) {
  55. continue
  56. }
  57. newwords = append(newwords, r.Replacements[i:i+2]...)
  58. }
  59. r.engine = nil
  60. r.Replacements = newwords
  61. }
  62. // AddRuleList appends new rules.
  63. // Input is in the same form as Strings.Replacer: [ old1, new1, old2, new2, ....]
  64. // Note: does not check for duplictes
  65. func (r *Replacer) AddRuleList(additions []string) {
  66. r.engine = nil
  67. r.Replacements = append(r.Replacements, additions...)
  68. }
  69. // Compile compiles the rules. Required before using the Replace functions
  70. func (r *Replacer) Compile() {
  71. r.corrected = make(map[string]string, len(r.Replacements)/2)
  72. for i := 0; i < len(r.Replacements); i += 2 {
  73. r.corrected[r.Replacements[i]] = r.Replacements[i+1]
  74. }
  75. r.engine = NewStringReplacer(r.Replacements...)
  76. }
  77. /*
  78. line1 and line2 are different
  79. extract words from each line1
  80. replace word -> newword
  81. if word == new-word
  82. continue
  83. if new-word in list of replacements
  84. continue
  85. new word not original, and not in list of replacements
  86. some substring got mixed up. UNdo
  87. */
  88. func (r *Replacer) recheckLine(s string, lineNum int, buf io.Writer, next func(Diff)) {
  89. first := 0
  90. redacted := RemoveNotWords(s)
  91. idx := wordRegexp.FindAllStringIndex(redacted, -1)
  92. for _, ab := range idx {
  93. word := s[ab[0]:ab[1]]
  94. newword := r.engine.Replace(word)
  95. if newword == word {
  96. // no replacement done
  97. continue
  98. }
  99. // ignore camelCase words
  100. // https://github.com/client9/misspell/issues/113
  101. if CaseStyle(word) == CaseUnknown {
  102. continue
  103. }
  104. if StringEqualFold(r.corrected[strings.ToLower(word)], newword) {
  105. // word got corrected into something we know
  106. io.WriteString(buf, s[first:ab[0]])
  107. io.WriteString(buf, newword)
  108. first = ab[1]
  109. next(Diff{
  110. FullLine: s,
  111. Line: lineNum,
  112. Original: word,
  113. Corrected: newword,
  114. Column: ab[0],
  115. })
  116. continue
  117. }
  118. // Word got corrected into something unknown. Ignore it
  119. }
  120. io.WriteString(buf, s[first:])
  121. }
  122. // ReplaceGo is a specialized routine for correcting Golang source
  123. // files. Currently only checks comments, not identifiers for
  124. // spelling.
  125. func (r *Replacer) ReplaceGo(input string) (string, []Diff) {
  126. var s scanner.Scanner
  127. s.Init(strings.NewReader(input))
  128. s.Mode = scanner.ScanIdents | scanner.ScanFloats | scanner.ScanChars | scanner.ScanStrings | scanner.ScanRawStrings | scanner.ScanComments
  129. lastPos := 0
  130. output := ""
  131. Loop:
  132. for {
  133. switch s.Scan() {
  134. case scanner.Comment:
  135. origComment := s.TokenText()
  136. newComment := r.engine.Replace(origComment)
  137. if origComment != newComment {
  138. // s.Pos().Offset is the end of the current token
  139. // subtract len(origComment) to get the start of the token
  140. offset := s.Pos().Offset
  141. output = output + input[lastPos:offset-len(origComment)] + newComment
  142. lastPos = offset
  143. }
  144. case scanner.EOF:
  145. break Loop
  146. }
  147. }
  148. if lastPos == 0 {
  149. // no changes, no copies
  150. return input, nil
  151. }
  152. if lastPos < len(input) {
  153. output = output + input[lastPos:]
  154. }
  155. diffs := make([]Diff, 0, 8)
  156. buf := bytes.NewBuffer(make([]byte, 0, max(len(input), len(output))+100))
  157. // faster that making a bytes.Buffer and bufio.ReadString
  158. outlines := strings.SplitAfter(output, "\n")
  159. inlines := strings.SplitAfter(input, "\n")
  160. for i := 0; i < len(inlines); i++ {
  161. if inlines[i] == outlines[i] {
  162. buf.WriteString(outlines[i])
  163. continue
  164. }
  165. r.recheckLine(inlines[i], i+1, buf, func(d Diff) {
  166. diffs = append(diffs, d)
  167. })
  168. }
  169. return buf.String(), diffs
  170. }
  171. // Replace is corrects misspellings in input, returning corrected version
  172. // along with a list of diffs.
  173. func (r *Replacer) Replace(input string) (string, []Diff) {
  174. output := r.engine.Replace(input)
  175. if input == output {
  176. return input, nil
  177. }
  178. diffs := make([]Diff, 0, 8)
  179. buf := bytes.NewBuffer(make([]byte, 0, max(len(input), len(output))+100))
  180. // faster that making a bytes.Buffer and bufio.ReadString
  181. outlines := strings.SplitAfter(output, "\n")
  182. inlines := strings.SplitAfter(input, "\n")
  183. for i := 0; i < len(inlines); i++ {
  184. if inlines[i] == outlines[i] {
  185. buf.WriteString(outlines[i])
  186. continue
  187. }
  188. r.recheckLine(inlines[i], i+1, buf, func(d Diff) {
  189. diffs = append(diffs, d)
  190. })
  191. }
  192. return buf.String(), diffs
  193. }
  194. // ReplaceReader applies spelling corrections to a reader stream. Diffs are
  195. // emitted through a callback.
  196. func (r *Replacer) ReplaceReader(raw io.Reader, w io.Writer, next func(Diff)) error {
  197. var (
  198. err error
  199. line string
  200. lineNum int
  201. )
  202. reader := bufio.NewReader(raw)
  203. for err == nil {
  204. lineNum++
  205. line, err = reader.ReadString('\n')
  206. // if it's EOF, then line has the last line
  207. // don't like the check of err here and
  208. // in for loop
  209. if err != nil && err != io.EOF {
  210. return err
  211. }
  212. // easily 5x faster than regexp+map
  213. if line == r.engine.Replace(line) {
  214. io.WriteString(w, line)
  215. continue
  216. }
  217. // but it can be inaccurate, so we need to double check
  218. r.recheckLine(line, lineNum, w, next)
  219. }
  220. return nil
  221. }