123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211 |
- package misspell
- import (
- "bytes"
- "fmt"
- "io"
- "io/ioutil"
- "net/http"
- "os"
- "path/filepath"
- "strings"
- )
// binary is the set of lower-case file extensions that are treated as
// binary content and therefore never spell-checked.
//
// The universe of binary formats that may be checked into a repo, or
// left behind as a build artifact, is huge; this table only covers the
// common ones. Additions welcome.
//
// Go's built-in MIME table is too small to rely on, and even where it
// has an entry the answer is not always useful (".js" maps to
// "application/javascript", for example).
//
// Marker legend: "[x]" means a content sniff test also exists for the
// format, so the suffix test should eventually be removed; "[ ]" means
// the suffix is the only check.
var binary = map[string]bool{
	// archives and compressed data
	".a":   true, // [ ] archive
	".bz2": true, // [ ] compression
	".gz":  true, // [ ] compression
	".jar": true, // [x] archive
	".tar": true, // [ ] archive
	".xz":  true, // [ ] compression
	".z":   true, // [ ] compression
	".zip": true, // [x] archive

	// executables, libraries and compiled code
	".bin":   true, // [ ] binary
	".class": true, // [x] Java class file
	".dll":   true, // [ ] shared library
	".exe":   true, // [ ] binary
	".o":     true, // [ ] object file
	".pyc":   true, // [ ] Python bytecode
	".pyo":   true, // [ ] Python bytecode
	".so":    true, // [x] shared library

	// images
	".gif":  true, // [ ] image
	".ico":  true, // [ ] image
	".jpeg": true, // [ ] image
	".jpg":  true, // [ ] image
	".png":  true, // [x] image
	".tiff": true, // [ ] image

	// audio and video
	".mp3":  true, // [ ] audio
	".mp4":  true, // [ ] video
	".mpeg": true, // [ ] video

	// fonts
	".woff":  true, // [ ] font
	".woff2": true, // [ ] font

	// everything else
	".gpg": true, // [x] text, but really all base64
	".pdf": true, // [x] pdf
	".swp": true, // [ ] vim swap file
}
- // isBinaryFilename returns true if the file is likely to be binary
- //
- // Better heuristics could be done here, in particular a binary
- // file is unlikely to be UTF-8 encoded. However this is cheap
- // and will solve the immediate need of making sure common
- // binary formats are not corrupted by mistake.
- func isBinaryFilename(s string) bool {
- return binary[strings.ToLower(filepath.Ext(s))]
- }
// scm is the set of directory names that mark a source-control
// metadata directory. Any path containing one of these names as a
// component is skipped.
var scm = map[string]bool{
	"CVS":  true, // CVS
	".svn": true, // Subversion
	".hg":  true, // Mercurial
	".git": true, // Git
	".bzr": true, // Bazaar
}
- // isSCMPath returns true if the path is likely part of a (private) SCM
- // directory. E.g. ./git/something = true
- func isSCMPath(s string) bool {
- // hack for .git/COMMIT_EDITMSG and .git/TAG_EDITMSG
- // normally we don't look at anything in .git
- // but COMMIT_EDITMSG and TAG_EDITMSG are used as
- // temp files for git commits. Allowing misspell to inspect
- // these files allows for commit-msg hooks
- // https://git-scm.com/book/en/v2/Customizing-Git-Git-Hooks
- if strings.Contains(filepath.Base(s), "EDITMSG") {
- return false
- }
- parts := strings.Split(filepath.Clean(s), string(filepath.Separator))
- for _, dir := range parts {
- if scm[dir] {
- return true
- }
- }
- return false
- }
// magicHeaders lists leading byte sequences ("magic numbers") that
// identify content which must not be spell-checked, even when it would
// otherwise pass for text.
var magicHeaders = [][]byte{
	// Issue #68: PGP messages and signatures are nominally "text", but
	// are really just blobs of base64 and should not be checked.
	[]byte("-----BEGIN PGP MESSAGE-----"),
	[]byte("-----BEGIN PGP SIGNATURE-----"),

	// ELF: 0x7f 'E' 'L' 'F'
	[]byte("\x7fELF"),

	// Postscript: "%!PS"
	[]byte("%!PS"),

	// PDF: "%PDF"
	[]byte("%PDF"),

	// Java class file: 0xCA 0xFE 0xBA 0xBE
	// https://en.wikipedia.org/wiki/Java_class_file
	[]byte("\xCA\xFE\xBA\xBE"),

	// PNG: 0x89 'P' 'N' 'G' CR LF 0x1a LF
	// https://en.wikipedia.org/wiki/Portable_Network_Graphics
	[]byte("\x89PNG\r\n\x1a\n"),

	// ZIP, JAR, ODF, OOXML: 'P' 'K' followed by a two-byte record type
	[]byte("PK\x03\x04"),
	[]byte("PK\x05\x06"),
	[]byte("PK\x07\x08"),
}
- func isTextFile(raw []byte) bool {
- for _, magic := range magicHeaders {
- if bytes.HasPrefix(raw, magic) {
- return false
- }
- }
- // allow any text/ type with utf-8 encoding
- // DetectContentType sometimes returns charset=utf-16 for XML stuff
- // in which case ignore.
- mime := http.DetectContentType(raw)
- return strings.HasPrefix(mime, "text/") && strings.HasSuffix(mime, "charset=utf-8")
- }
- // ReadTextFile returns the contents of a file, first testing if it is a text file
- // returns ("", nil) if not a text file
- // returns ("", error) if error
- // returns (string, nil) if text
- //
- // unfortunately, in worse case, this does
- // 1 stat
- // 1 open,read,close of 512 bytes
- // 1 more stat,open, read everything, close (via ioutil.ReadAll)
- // This could be kinder to the filesystem.
- //
- // This uses some heuristics of the file's extension (e.g. .zip, .txt) and
- // uses a sniffer to determine if the file is text or not.
- // Using file extensions isn't great, but probably
- // good enough for real-world use.
- // Golang's built in sniffer is problematic for differnet reasons. It's
- // optimized for HTML, and is very limited in detection. It would be good
- // to explicitly add some tests for ELF/DWARF formats to make sure we never
- // corrupt binary files.
- func ReadTextFile(filename string) (string, error) {
- if isBinaryFilename(filename) {
- return "", nil
- }
- if isSCMPath(filename) {
- return "", nil
- }
- fstat, err := os.Stat(filename)
- if err != nil {
- return "", fmt.Errorf("Unable to stat %q: %s", filename, err)
- }
- // directory: nothing to do.
- if fstat.IsDir() {
- return "", nil
- }
- // avoid reading in multi-gig files
- // if input is large, read the first 512 bytes to sniff type
- // if not-text, then exit
- isText := false
- if fstat.Size() > 50000 {
- fin, err := os.Open(filename)
- if err != nil {
- return "", fmt.Errorf("Unable to open large file %q: %s", filename, err)
- }
- defer fin.Close()
- buf := make([]byte, 512)
- _, err = io.ReadFull(fin, buf)
- if err != nil {
- return "", fmt.Errorf("Unable to read 512 bytes from %q: %s", filename, err)
- }
- if !isTextFile(buf) {
- return "", nil
- }
- // set so we don't double check this file
- isText = true
- }
- // read in whole file
- raw, err := ioutil.ReadFile(filename)
- if err != nil {
- return "", fmt.Errorf("Unable to read all %q: %s", filename, err)
- }
- if !isText && !isTextFile(raw) {
- return "", nil
- }
- return string(raw), nil
- }
|