notwords.go 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. package misspell
  2. import (
  3. "bytes"
  4. "regexp"
  5. "strings"
  6. )
  7. var (
  8. reEmail = regexp.MustCompile(`[a-zA-Z0-9_.%+-]+@[a-zA-Z0-9-.]+\.[a-zA-Z]{2,6}[^a-zA-Z]`)
  9. reHost = regexp.MustCompile(`[a-zA-Z0-9-.]+\.[a-zA-Z]+`)
  10. reBackslash = regexp.MustCompile(`\\[a-z]`)
  11. )
  12. // RemovePath attempts to strip away embedded file system paths, e.g.
  13. // /foo/bar or /static/myimg.png
  14. //
  15. // TODO: windows style
  16. //
  17. func RemovePath(s string) string {
  18. out := bytes.Buffer{}
  19. var idx int
  20. for len(s) > 0 {
  21. if idx = strings.IndexByte(s, '/'); idx == -1 {
  22. out.WriteString(s)
  23. break
  24. }
  25. if idx > 0 {
  26. idx--
  27. }
  28. var chclass string
  29. switch s[idx] {
  30. case '/', ' ', '\n', '\t', '\r':
  31. chclass = " \n\r\t"
  32. case '[':
  33. chclass = "]\n"
  34. case '(':
  35. chclass = ")\n"
  36. default:
  37. out.WriteString(s[:idx+2])
  38. s = s[idx+2:]
  39. continue
  40. }
  41. endx := strings.IndexAny(s[idx+1:], chclass)
  42. if endx != -1 {
  43. out.WriteString(s[:idx+1])
  44. out.Write(bytes.Repeat([]byte{' '}, endx))
  45. s = s[idx+endx+1:]
  46. } else {
  47. out.WriteString(s)
  48. break
  49. }
  50. }
  51. return out.String()
  52. }
  53. // replaceWithBlanks returns a string with the same number of spaces as the input
  54. func replaceWithBlanks(s string) string {
  55. return strings.Repeat(" ", len(s))
  56. }
  57. // RemoveEmail remove email-like strings, e.g. "nickg+junk@xfoobar.com", "nickg@xyz.abc123.biz"
  58. func RemoveEmail(s string) string {
  59. return reEmail.ReplaceAllStringFunc(s, replaceWithBlanks)
  60. }
  61. // RemoveHost removes host-like strings "foobar.com" "abc123.fo1231.biz"
  62. func RemoveHost(s string) string {
  63. return reHost.ReplaceAllStringFunc(s, replaceWithBlanks)
  64. }
  65. // RemoveBackslashEscapes removes characters that are preceeded by a backslash
  66. // commonly found in printf format stringd "\nto"
  67. func removeBackslashEscapes(s string) string {
  68. return reBackslash.ReplaceAllStringFunc(s, replaceWithBlanks)
  69. }
  70. // RemoveNotWords blanks out all the not words
  71. func RemoveNotWords(s string) string {
  72. // do most selective/specific first
  73. return removeBackslashEscapes(RemoveHost(RemoveEmail(RemovePath(StripURL(s)))))
  74. }