kvstore.go

// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mvcc

import (
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"hash/crc32"
	"math"
	"sync"
	"sync/atomic"
	"time"

	"go.etcd.io/etcd/lease"
	"go.etcd.io/etcd/mvcc/backend"
	"go.etcd.io/etcd/mvcc/mvccpb"
	"go.etcd.io/etcd/pkg/schedule"
	"go.etcd.io/etcd/pkg/traceutil"

	"github.com/coreos/pkg/capnslog"
	"go.uber.org/zap"
)

var (
	keyBucketName  = []byte("key")
	metaBucketName = []byte("meta")

	consistentIndexKeyName  = []byte("consistent_index")
	scheduledCompactKeyName = []byte("scheduledCompactRev")
	finishedCompactKeyName  = []byte("finishedCompactRev")

	ErrCompacted = errors.New("mvcc: required revision has been compacted")
	ErrFutureRev = errors.New("mvcc: required revision is a future revision")
	ErrCanceled  = errors.New("mvcc: watcher is canceled")
	ErrClosed    = errors.New("mvcc: closed")

	plog = capnslog.NewPackageLogger("go.etcd.io/etcd", "mvcc")
)

const (
	// markedRevBytesLen is the byte length of a marked revision.
	// The first `revBytesLen` bytes represent a normal revision. The last
	// byte is the mark.
	markedRevBytesLen      = revBytesLen + 1
	markBytePosition       = markedRevBytesLen - 1
	markTombstone     byte = 't'
)

var restoreChunkKeys = 10000 // non-const for testing
var defaultCompactBatchLimit = 1000

// ConsistentIndexGetter is an interface that wraps the ConsistentIndex method.
// Consistent index is the offset of an entry in a consistent replicated log.
type ConsistentIndexGetter interface {
	// ConsistentIndex returns the consistent index of the currently executing entry.
	ConsistentIndex() uint64
}

type StoreConfig struct {
	CompactionBatchLimit int
}

type store struct {
	ReadView
	WriteView

	// consistentIndex caches the "consistent_index" key's value. Accessed
	// through atomics so it must be 64-bit aligned.
	consistentIndex uint64

	cfg StoreConfig

	// mu is read-locked for txns and write-locked for non-txn store changes.
	mu sync.RWMutex

	ig ConsistentIndexGetter

	b       backend.Backend
	kvindex index

	le lease.Lessor

	// revMu protects currentRev and compactMainRev.
	// Locked at the end of a write txn and released after the write txn unlocks.
	// Locked before locking the read txn and released after locking.
	revMu sync.RWMutex
	// currentRev is the revision of the last completed transaction.
	currentRev int64
	// compactMainRev is the main revision of the last compaction.
	compactMainRev int64

	// bytesBuf8 is a byte slice of length 8
	// to avoid a repetitive allocation in saveIndex.
	bytesBuf8 []byte

	fifoSched schedule.Scheduler

	stopc chan struct{}

	lg *zap.Logger
}

// NewStore returns a new store. It is useful for creating a store inside the
// mvcc package. Externally it should be used only for testing.
func NewStore(lg *zap.Logger, b backend.Backend, le lease.Lessor, ig ConsistentIndexGetter, cfg StoreConfig) *store {
	if cfg.CompactionBatchLimit == 0 {
		cfg.CompactionBatchLimit = defaultCompactBatchLimit
	}
	s := &store{
		cfg:     cfg,
		b:       b,
		ig:      ig,
		kvindex: newTreeIndex(lg),
		le:      le,

		currentRev:     1,
		compactMainRev: -1,

		bytesBuf8: make([]byte, 8),
		fifoSched: schedule.NewFIFOScheduler(),

		stopc: make(chan struct{}),

		lg: lg,
	}
	s.ReadView = &readView{s}
	s.WriteView = &writeView{s}
	if s.le != nil {
		s.le.SetRangeDeleter(func() lease.TxnDelete { return s.Write(traceutil.TODO()) })
	}

	tx := s.b.BatchTx()
	tx.Lock()
	tx.UnsafeCreateBucket(keyBucketName)
	tx.UnsafeCreateBucket(metaBucketName)
	tx.Unlock()
	s.b.ForceCommit()

	s.mu.Lock()
	defer s.mu.Unlock()
	if err := s.restore(); err != nil {
		// TODO: return the error instead of panicking here?
		panic("failed to recover store from backend")
	}

	return s
}
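
// compactBarrier closes ch to signal that the compaction it guards has
// settled. If the context is nil or already canceled, the barrier is
// re-scheduled on the FIFO scheduler (unless the store is stopping) so that
// ch is closed by a later run instead.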
func (s *store) compactBarrier(ctx context.Context, ch chan struct{}) {
	if ctx == nil || ctx.Err() != nil {
		s.mu.Lock()
		select {
		case <-s.stopc:
		default:
			f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
			s.fifoSched.Schedule(f)
		}
		s.mu.Unlock()
		return
	}
	close(ch)
}
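
// Hash force-commits any pending writes and returns a hash of the backend
// contents (excluding the keys listed in DefaultIgnores) together with the
// store's current revision.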
func (s *store) Hash() (hash uint32, revision int64, err error) {
	start := time.Now()

	s.b.ForceCommit()
	h, err := s.b.Hash(DefaultIgnores)

	hashSec.Observe(time.Since(start).Seconds())
	return h, s.currentRev, err
}
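
// HashByRev hashes the key bucket as of the given revision, skipping revisions
// newer than rev and revisions removed by compaction (except those still kept
// by the index). A rev of 0 means the current revision. It also returns the
// current and compact revisions observed when the hash was computed.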
func (s *store) HashByRev(rev int64) (hash uint32, currentRev int64, compactRev int64, err error) {
	start := time.Now()

	s.mu.RLock()
	s.revMu.RLock()
	compactRev, currentRev = s.compactMainRev, s.currentRev
	s.revMu.RUnlock()

	if rev > 0 && rev <= compactRev {
		s.mu.RUnlock()
		return 0, 0, compactRev, ErrCompacted
	} else if rev > 0 && rev > currentRev {
		s.mu.RUnlock()
		return 0, currentRev, 0, ErrFutureRev
	}

	if rev == 0 {
		rev = currentRev
	}
	keep := s.kvindex.Keep(rev)

	tx := s.b.ReadTx()
	tx.RLock()
	defer tx.RUnlock()
	s.mu.RUnlock()

	upper := revision{main: rev + 1}
	lower := revision{main: compactRev + 1}
	h := crc32.New(crc32.MakeTable(crc32.Castagnoli))

	h.Write(keyBucketName)
	err = tx.UnsafeForEach(keyBucketName, func(k, v []byte) error {
		kr := bytesToRev(k)
		if !upper.GreaterThan(kr) {
			return nil
		}
		// skip revisions that are scheduled for deletion
		// due to compacting; don't skip if there isn't one.
		if lower.GreaterThan(kr) && len(keep) > 0 {
			if _, ok := keep[kr]; !ok {
				return nil
			}
		}
		h.Write(k)
		h.Write(v)
		return nil
	})
	hash = h.Sum32()

	hashRevSec.Observe(time.Since(start).Seconds())
	return hash, currentRev, compactRev, err
}
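
// updateCompactRev validates rev against the compacted and current revisions,
// records it as the new compactMainRev, and persists the scheduled compact
// revision to the meta bucket. If rev has already been compacted, it returns
// ErrCompacted along with a barrier channel scheduled on the FIFO scheduler.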
func (s *store) updateCompactRev(rev int64) (<-chan struct{}, error) {
	s.revMu.Lock()
	if rev <= s.compactMainRev {
		ch := make(chan struct{})
		f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
		s.fifoSched.Schedule(f)
		s.revMu.Unlock()
		return ch, ErrCompacted
	}
	if rev > s.currentRev {
		s.revMu.Unlock()
		return nil, ErrFutureRev
	}

	s.compactMainRev = rev

	rbytes := newRevBytes()
	revToBytes(revision{main: rev}, rbytes)

	tx := s.b.BatchTx()
	tx.Lock()
	tx.UnsafePut(metaBucketName, scheduledCompactKeyName, rbytes)
	tx.Unlock()
	// ensure that desired compaction is persisted
	s.b.ForceCommit()

	s.revMu.Unlock()

	return nil, nil
}
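
// compact compacts the in-memory index up to rev and schedules the backend
// compaction on the FIFO scheduler. The returned channel is closed once the
// scheduled backend compaction has completed.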
func (s *store) compact(trace *traceutil.Trace, rev int64) (<-chan struct{}, error) {
	start := time.Now()
	keep := s.kvindex.Compact(rev)
	trace.Step("compact in-memory index tree")
	ch := make(chan struct{})
	var j = func(ctx context.Context) {
		if ctx.Err() != nil {
			s.compactBarrier(ctx, ch)
			return
		}
		if !s.scheduleCompaction(rev, keep) {
			s.compactBarrier(nil, ch)
			return
		}
		close(ch)
	}

	s.fifoSched.Schedule(j)

	indexCompactionPauseMs.Observe(float64(time.Since(start) / time.Millisecond))
	trace.Step("schedule compaction")
	return ch, nil
}
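
// compactLockfree performs a compaction without taking the store lock; it is
// used from restore, where the caller already holds s.mu.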
func (s *store) compactLockfree(rev int64) (<-chan struct{}, error) {
	ch, err := s.updateCompactRev(rev)
	if err != nil {
		return ch, err
	}

	return s.compact(traceutil.TODO(), rev)
}
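
// Compact updates the compact revision under the store lock and then schedules
// compaction of the in-memory index and backend up to rev. The returned
// channel is closed when the compaction completes.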
func (s *store) Compact(trace *traceutil.Trace, rev int64) (<-chan struct{}, error) {
	s.mu.Lock()

	ch, err := s.updateCompactRev(rev)
	trace.Step("check and update compact revision")
	if err != nil {
		s.mu.Unlock()
		return ch, err
	}
	s.mu.Unlock()

	return s.compact(trace, rev)
}

// DefaultIgnores is a map of keys to ignore in hash checking.
var DefaultIgnores map[backend.IgnoreKey]struct{}

func init() {
	DefaultIgnores = map[backend.IgnoreKey]struct{}{
		// consistent index might be changed due to v2 internal sync, which
		// is not controllable by the user.
		{Bucket: string(metaBucketName), Key: string(consistentIndexKeyName)}: {},
	}
}
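
// Commit saves the current consistent index and force-commits all pending
// backend writes to disk.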
func (s *store) Commit() {
	s.mu.Lock()
	defer s.mu.Unlock()

	tx := s.b.BatchTx()
	tx.Lock()
	s.saveIndex(tx)
	tx.Unlock()
	s.b.ForceCommit()
}
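
// Restore replaces the store's backend with b and rebuilds the in-memory state
// (tree index, revisions, lease attachments) from it, resetting the cached
// consistent index and recreating the FIFO scheduler.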
func (s *store) Restore(b backend.Backend) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	close(s.stopc)
	s.fifoSched.Stop()

	atomic.StoreUint64(&s.consistentIndex, 0)
	s.b = b
	s.kvindex = newTreeIndex(s.lg)
	s.currentRev = 1
	s.compactMainRev = -1
	s.fifoSched = schedule.NewFIFOScheduler()
	s.stopc = make(chan struct{})

	return s.restore()
}
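
// restore rebuilds the tree index and revision state from the key bucket,
// re-attaches keys to their leases, and resumes any compaction that was
// scheduled but not finished before the restart.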
func (s *store) restore() error {
	s.setupMetricsReporter()

	min, max := newRevBytes(), newRevBytes()
	revToBytes(revision{main: 1}, min)
	revToBytes(revision{main: math.MaxInt64, sub: math.MaxInt64}, max)

	keyToLease := make(map[string]lease.LeaseID)

	// restore index
	tx := s.b.BatchTx()
	tx.Lock()

	_, finishedCompactBytes := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0)
	if len(finishedCompactBytes) != 0 {
		s.compactMainRev = bytesToRev(finishedCompactBytes[0]).main

		if s.lg != nil {
			s.lg.Info(
				"restored last compact revision",
				zap.String("meta-bucket-name", string(metaBucketName)),
				zap.String("meta-bucket-name-key", string(finishedCompactKeyName)),
				zap.Int64("restored-compact-revision", s.compactMainRev),
			)
		} else {
			plog.Printf("restore compact to %d", s.compactMainRev)
		}
	}
	_, scheduledCompactBytes := tx.UnsafeRange(metaBucketName, scheduledCompactKeyName, nil, 0)
	scheduledCompact := int64(0)
	if len(scheduledCompactBytes) != 0 {
		scheduledCompact = bytesToRev(scheduledCompactBytes[0]).main
	}

	// index keys concurrently as they're loaded in from tx
	keysGauge.Set(0)
	rkvc, revc := restoreIntoIndex(s.lg, s.kvindex)
	for {
		keys, vals := tx.UnsafeRange(keyBucketName, min, max, int64(restoreChunkKeys))
		if len(keys) == 0 {
			break
		}
		// rkvc blocks if the total pending keys exceeds the restore
		// chunk size to keep keys from consuming too much memory.
		restoreChunk(s.lg, rkvc, keys, vals, keyToLease)
		if len(keys) < restoreChunkKeys {
			// partial set implies final set
			break
		}
		// next set begins after where this one ended
		newMin := bytesToRev(keys[len(keys)-1][:revBytesLen])
		newMin.sub++
		revToBytes(newMin, min)
	}
	close(rkvc)
	s.currentRev = <-revc

	// keys in the range [compacted revision - N, compacted revision] might all have been
	// deleted due to compaction. In that case the current revision should be set to the
	// compaction revision, not the largest revision we have seen.
	if s.currentRev < s.compactMainRev {
		s.currentRev = s.compactMainRev
	}
	if scheduledCompact <= s.compactMainRev {
		scheduledCompact = 0
	}

	for key, lid := range keyToLease {
		if s.le == nil {
			panic("no lessor to attach lease")
		}
		err := s.le.Attach(lid, []lease.LeaseItem{{Key: key}})
		if err != nil {
			if s.lg != nil {
				s.lg.Warn(
					"failed to attach a lease",
					zap.String("lease-id", fmt.Sprintf("%016x", lid)),
					zap.Error(err),
				)
			} else {
				plog.Errorf("unexpected Attach error: %v", err)
			}
		}
	}

	tx.Unlock()

	if scheduledCompact != 0 {
		s.compactLockfree(scheduledCompact)

		if s.lg != nil {
			s.lg.Info(
				"resume scheduled compaction",
				zap.String("meta-bucket-name", string(metaBucketName)),
				zap.String("meta-bucket-name-key", string(scheduledCompactKeyName)),
				zap.Int64("scheduled-compact-revision", scheduledCompact),
			)
		} else {
			plog.Printf("resume scheduled compaction at %d", scheduledCompact)
		}
	}

	return nil
}
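
// revKeyValue carries one backend entry through the restore pipeline: the raw
// revision key, the decoded KeyValue, and the key as a string for map lookups.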
type revKeyValue struct {
	key  []byte
	kv   mvccpb.KeyValue
	kstr string
}
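
// restoreIntoIndex starts a goroutine that consumes revKeyValues and rebuilds
// the tree index, using a bounded keyIndex cache to limit memory use. It
// returns the send channel and a channel that yields the highest main revision
// seen once the send channel has been closed.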
func restoreIntoIndex(lg *zap.Logger, idx index) (chan<- revKeyValue, <-chan int64) {
	rkvc, revc := make(chan revKeyValue, restoreChunkKeys), make(chan int64, 1)
	go func() {
		currentRev := int64(1)
		defer func() { revc <- currentRev }()
		// restore the tree index from streaming the unordered index.
		kiCache := make(map[string]*keyIndex, restoreChunkKeys)
		for rkv := range rkvc {
			ki, ok := kiCache[rkv.kstr]
			// purge kiCache if many keys but still missing in the cache
			if !ok && len(kiCache) >= restoreChunkKeys {
				i := 10
				for k := range kiCache {
					delete(kiCache, k)
					if i--; i == 0 {
						break
					}
				}
			}
			// cache miss, fetch from tree index if there
			if !ok {
				ki = &keyIndex{key: rkv.kv.Key}
				if idxKey := idx.KeyIndex(ki); idxKey != nil {
					kiCache[rkv.kstr], ki = idxKey, idxKey
					ok = true
				}
			}
			rev := bytesToRev(rkv.key)
			currentRev = rev.main
			if ok {
				if isTombstone(rkv.key) {
					ki.tombstone(lg, rev.main, rev.sub)
					continue
				}
				ki.put(lg, rev.main, rev.sub)
			} else if !isTombstone(rkv.key) {
				ki.restore(lg, revision{rkv.kv.CreateRevision, 0}, rev, rkv.kv.Version)
				idx.Insert(ki)
				kiCache[rkv.kstr] = ki
			}
		}
	}()
	return rkvc, revc
}
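
// restoreChunk unmarshals one chunk of backend keys and values, records which
// keys are attached to leases, and feeds the entries to the index-restore
// goroutine via kvc.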
func restoreChunk(lg *zap.Logger, kvc chan<- revKeyValue, keys, vals [][]byte, keyToLease map[string]lease.LeaseID) {
	for i, key := range keys {
		rkv := revKeyValue{key: key}
		if err := rkv.kv.Unmarshal(vals[i]); err != nil {
			if lg != nil {
				lg.Fatal("failed to unmarshal mvccpb.KeyValue", zap.Error(err))
			} else {
				plog.Fatalf("cannot unmarshal event: %v", err)
			}
		}
		rkv.kstr = string(rkv.kv.Key)
		if isTombstone(key) {
			delete(keyToLease, rkv.kstr)
		} else if lid := lease.LeaseID(rkv.kv.Lease); lid != lease.NoLease {
			keyToLease[rkv.kstr] = lid
		} else {
			delete(keyToLease, rkv.kstr)
		}
		kvc <- rkv
	}
}

func (s *store) Close() error {
	close(s.stopc)
	s.fifoSched.Stop()
	return nil
}
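
// saveIndex writes the current consistent index to the meta bucket and updates
// the cached copy; the caller must already hold the batch tx lock.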
func (s *store) saveIndex(tx backend.BatchTx) {
	if s.ig == nil {
		return
	}
	bs := s.bytesBuf8
	ci := s.ig.ConsistentIndex()
	binary.BigEndian.PutUint64(bs, ci)
	// put the index into the underlying backend
	// tx has been locked in TxnBegin, so there is no need to lock it again
	tx.UnsafePut(metaBucketName, consistentIndexKeyName, bs)
	atomic.StoreUint64(&s.consistentIndex, ci)
}
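
// ConsistentIndex returns the cached consistent index, falling back to the
// value persisted in the meta bucket when the cache has not been populated.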
func (s *store) ConsistentIndex() uint64 {
	if ci := atomic.LoadUint64(&s.consistentIndex); ci > 0 {
		return ci
	}
	tx := s.b.BatchTx()
	tx.Lock()
	defer tx.Unlock()
	_, vs := tx.UnsafeRange(metaBucketName, consistentIndexKeyName, nil, 0)
	if len(vs) == 0 {
		return 0
	}
	v := binary.BigEndian.Uint64(vs[0])
	atomic.StoreUint64(&s.consistentIndex, v)
	return v
}
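
// setupMetricsReporter wires the package-level metrics reporters to this
// store's backend size, open read-txn count, and current/compact revisions.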
func (s *store) setupMetricsReporter() {
	b := s.b
	reportDbTotalSizeInBytesMu.Lock()
	reportDbTotalSizeInBytes = func() float64 { return float64(b.Size()) }
	reportDbTotalSizeInBytesMu.Unlock()
	reportDbTotalSizeInBytesDebugMu.Lock()
	reportDbTotalSizeInBytesDebug = func() float64 { return float64(b.Size()) }
	reportDbTotalSizeInBytesDebugMu.Unlock()
	reportDbTotalSizeInUseInBytesMu.Lock()
	reportDbTotalSizeInUseInBytes = func() float64 { return float64(b.SizeInUse()) }
	reportDbTotalSizeInUseInBytesMu.Unlock()
	reportDbOpenReadTxNMu.Lock()
	reportDbOpenReadTxN = func() float64 { return float64(b.OpenReadTxN()) }
	reportDbOpenReadTxNMu.Unlock()
	reportCurrentRevMu.Lock()
	reportCurrentRev = func() float64 {
		s.revMu.RLock()
		defer s.revMu.RUnlock()
		return float64(s.currentRev)
	}
	reportCurrentRevMu.Unlock()
	reportCompactRevMu.Lock()
	reportCompactRev = func() float64 {
		s.revMu.RLock()
		defer s.revMu.RUnlock()
		return float64(s.compactMainRev)
	}
	reportCompactRevMu.Unlock()
}

// appendMarkTombstone appends a tombstone mark to normal revision bytes.
func appendMarkTombstone(lg *zap.Logger, b []byte) []byte {
	if len(b) != revBytesLen {
		if lg != nil {
			lg.Panic(
				"cannot append tombstone mark to non-normal revision bytes",
				zap.Int("expected-revision-bytes-size", revBytesLen),
				zap.Int("given-revision-bytes-size", len(b)),
			)
		} else {
			plog.Panicf("cannot append mark to non normal revision bytes")
		}
	}
	return append(b, markTombstone)
}

// isTombstone checks whether the revision bytes are a tombstone.
func isTombstone(b []byte) bool {
	return len(b) == markedRevBytesLen && b[markBytePosition] == markTombstone
}