mime.go 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. package misspell
  2. import (
  3. "bytes"
  4. "fmt"
  5. "io"
  6. "io/ioutil"
  7. "net/http"
  8. "os"
  9. "path/filepath"
  10. "strings"
  11. )
  12. // The number of possible binary formats is very large
  13. // items that might be checked into a repo or be an
  14. // artifact of a build. Additions welcome.
  15. //
  16. // Golang's internal table is very small and can't be
  17. // relied on. Even then things like ".js" have a mime
  18. // type of "application/javascipt" which isn't very helpful.
  19. // "[x]" means we have sniff test and suffix test should be eliminated
  20. var binary = map[string]bool{
  21. ".a": true, // [ ] archive
  22. ".bin": true, // [ ] binary
  23. ".bz2": true, // [ ] compression
  24. ".class": true, // [x] Java class file
  25. ".dll": true, // [ ] shared library
  26. ".exe": true, // [ ] binary
  27. ".gif": true, // [ ] image
  28. ".gpg": true, // [x] text, but really all base64
  29. ".gz": true, // [ ] compression
  30. ".ico": true, // [ ] image
  31. ".jar": true, // [x] archive
  32. ".jpeg": true, // [ ] image
  33. ".jpg": true, // [ ] image
  34. ".mp3": true, // [ ] audio
  35. ".mp4": true, // [ ] video
  36. ".mpeg": true, // [ ] video
  37. ".o": true, // [ ] object file
  38. ".pdf": true, // [x] pdf
  39. ".png": true, // [x] image
  40. ".pyc": true, // [ ] Python bytecode
  41. ".pyo": true, // [ ] Python bytecode
  42. ".so": true, // [x] shared library
  43. ".swp": true, // [ ] vim swap file
  44. ".tar": true, // [ ] archive
  45. ".tiff": true, // [ ] image
  46. ".woff": true, // [ ] font
  47. ".woff2": true, // [ ] font
  48. ".xz": true, // [ ] compression
  49. ".z": true, // [ ] compression
  50. ".zip": true, // [x] archive
  51. }
  52. // isBinaryFilename returns true if the file is likely to be binary
  53. //
  54. // Better heuristics could be done here, in particular a binary
  55. // file is unlikely to be UTF-8 encoded. However this is cheap
  56. // and will solve the immediate need of making sure common
  57. // binary formats are not corrupted by mistake.
  58. func isBinaryFilename(s string) bool {
  59. return binary[strings.ToLower(filepath.Ext(s))]
  60. }
  61. var scm = map[string]bool{
  62. ".bzr": true,
  63. ".git": true,
  64. ".hg": true,
  65. ".svn": true,
  66. "CVS": true,
  67. }
  68. // isSCMPath returns true if the path is likely part of a (private) SCM
  69. // directory. E.g. ./git/something = true
  70. func isSCMPath(s string) bool {
  71. // hack for .git/COMMIT_EDITMSG and .git/TAG_EDITMSG
  72. // normally we don't look at anything in .git
  73. // but COMMIT_EDITMSG and TAG_EDITMSG are used as
  74. // temp files for git commits. Allowing misspell to inspect
  75. // these files allows for commit-msg hooks
  76. // https://git-scm.com/book/en/v2/Customizing-Git-Git-Hooks
  77. if strings.Contains(filepath.Base(s), "EDITMSG") {
  78. return false
  79. }
  80. parts := strings.Split(filepath.Clean(s), string(filepath.Separator))
  81. for _, dir := range parts {
  82. if scm[dir] {
  83. return true
  84. }
  85. }
  86. return false
  87. }
  88. var magicHeaders = [][]byte{
  89. // Issue #68
  90. // PGP messages and signatures are "text" but really just
  91. // blobs of base64-text and should not be misspell-checked
  92. []byte("-----BEGIN PGP MESSAGE-----"),
  93. []byte("-----BEGIN PGP SIGNATURE-----"),
  94. // ELF
  95. {0x7f, 0x45, 0x4c, 0x46},
  96. // Postscript
  97. {0x25, 0x21, 0x50, 0x53},
  98. // PDF
  99. {0x25, 0x50, 0x44, 0x46},
  100. // Java class file
  101. // https://en.wikipedia.org/wiki/Java_class_file
  102. {0xCA, 0xFE, 0xBA, 0xBE},
  103. // PNG
  104. // https://en.wikipedia.org/wiki/Portable_Network_Graphics
  105. {0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a},
  106. // ZIP, JAR, ODF, OOXML
  107. {0x50, 0x4B, 0x03, 0x04},
  108. {0x50, 0x4B, 0x05, 0x06},
  109. {0x50, 0x4B, 0x07, 0x08},
  110. }
  111. func isTextFile(raw []byte) bool {
  112. for _, magic := range magicHeaders {
  113. if bytes.HasPrefix(raw, magic) {
  114. return false
  115. }
  116. }
  117. // allow any text/ type with utf-8 encoding
  118. // DetectContentType sometimes returns charset=utf-16 for XML stuff
  119. // in which case ignore.
  120. mime := http.DetectContentType(raw)
  121. return strings.HasPrefix(mime, "text/") && strings.HasSuffix(mime, "charset=utf-8")
  122. }
  123. // ReadTextFile returns the contents of a file, first testing if it is a text file
  124. // returns ("", nil) if not a text file
  125. // returns ("", error) if error
  126. // returns (string, nil) if text
  127. //
  128. // unfortunately, in worse case, this does
  129. // 1 stat
  130. // 1 open,read,close of 512 bytes
  131. // 1 more stat,open, read everything, close (via ioutil.ReadAll)
  132. // This could be kinder to the filesystem.
  133. //
  134. // This uses some heuristics of the file's extension (e.g. .zip, .txt) and
  135. // uses a sniffer to determine if the file is text or not.
  136. // Using file extensions isn't great, but probably
  137. // good enough for real-world use.
  138. // Golang's built in sniffer is problematic for differnet reasons. It's
  139. // optimized for HTML, and is very limited in detection. It would be good
  140. // to explicitly add some tests for ELF/DWARF formats to make sure we never
  141. // corrupt binary files.
  142. func ReadTextFile(filename string) (string, error) {
  143. if isBinaryFilename(filename) {
  144. return "", nil
  145. }
  146. if isSCMPath(filename) {
  147. return "", nil
  148. }
  149. fstat, err := os.Stat(filename)
  150. if err != nil {
  151. return "", fmt.Errorf("Unable to stat %q: %s", filename, err)
  152. }
  153. // directory: nothing to do.
  154. if fstat.IsDir() {
  155. return "", nil
  156. }
  157. // avoid reading in multi-gig files
  158. // if input is large, read the first 512 bytes to sniff type
  159. // if not-text, then exit
  160. isText := false
  161. if fstat.Size() > 50000 {
  162. fin, err := os.Open(filename)
  163. if err != nil {
  164. return "", fmt.Errorf("Unable to open large file %q: %s", filename, err)
  165. }
  166. defer fin.Close()
  167. buf := make([]byte, 512)
  168. _, err = io.ReadFull(fin, buf)
  169. if err != nil {
  170. return "", fmt.Errorf("Unable to read 512 bytes from %q: %s", filename, err)
  171. }
  172. if !isTextFile(buf) {
  173. return "", nil
  174. }
  175. // set so we don't double check this file
  176. isText = true
  177. }
  178. // read in whole file
  179. raw, err := ioutil.ReadFile(filename)
  180. if err != nil {
  181. return "", fmt.Errorf("Unable to read all %q: %s", filename, err)
  182. }
  183. if !isText && !isTextFile(raw) {
  184. return "", nil
  185. }
  186. return string(raw), nil
  187. }