http.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578
  1. // Copyright 2015 The etcd Authors
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. package rafthttp
import (
	"context"
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	"path"
	"strings"
	"sync"
	"time"

	"go.etcd.io/etcd/etcdserver/api/snap"
	pioutil "go.etcd.io/etcd/pkg/ioutil"
	"go.etcd.io/etcd/pkg/types"
	"go.etcd.io/etcd/raft/raftpb"
	"go.etcd.io/etcd/version"

	humanize "github.com/dustin/go-humanize"
	"go.uber.org/zap"
)
  32. const (
  33. // connReadLimitByte limits the number of bytes
  34. // a single read can read out.
  35. //
  36. // 64KB should be large enough for not causing
  37. // throughput bottleneck as well as small enough
  38. // for not causing a read timeout.
  39. connReadLimitByte = 64 * 1024
  40. )
  41. var (
  42. RaftPrefix = "/raft"
  43. ProbingPrefix = path.Join(RaftPrefix, "probing")
  44. RaftStreamPrefix = path.Join(RaftPrefix, "stream")
  45. RaftSnapshotPrefix = path.Join(RaftPrefix, "snapshot")
  46. errIncompatibleVersion = errors.New("incompatible version")
  47. errClusterIDMismatch = errors.New("cluster ID mismatch")
  48. )
  49. type peerGetter interface {
  50. Get(id types.ID) Peer
  51. }
  52. type writerToResponse interface {
  53. WriteTo(w http.ResponseWriter)
  54. }
  55. type pipelineHandler struct {
  56. lg *zap.Logger
  57. localID types.ID
  58. tr Transporter
  59. r Raft
  60. cid types.ID
  61. }
  62. // newPipelineHandler returns a handler for handling raft messages
  63. // from pipeline for RaftPrefix.
  64. //
  65. // The handler reads out the raft message from request body,
  66. // and forwards it to the given raft state machine for processing.
  67. func newPipelineHandler(t *Transport, r Raft, cid types.ID) http.Handler {
  68. return &pipelineHandler{
  69. lg: t.Logger,
  70. localID: t.ID,
  71. tr: t,
  72. r: r,
  73. cid: cid,
  74. }
  75. }
  76. func (h *pipelineHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
  77. if r.Method != "POST" {
  78. w.Header().Set("Allow", "POST")
  79. http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
  80. return
  81. }
  82. w.Header().Set("X-Etcd-Cluster-ID", h.cid.String())
  83. if err := checkClusterCompatibilityFromHeader(h.lg, h.localID, r.Header, h.cid); err != nil {
  84. http.Error(w, err.Error(), http.StatusPreconditionFailed)
  85. return
  86. }
  87. addRemoteFromRequest(h.tr, r)
  88. // Limit the data size that could be read from the request body, which ensures that read from
  89. // connection will not time out accidentally due to possible blocking in underlying implementation.
  90. limitedr := pioutil.NewLimitedBufferReader(r.Body, connReadLimitByte)
  91. b, err := ioutil.ReadAll(limitedr)
  92. if err != nil {
  93. if h.lg != nil {
  94. h.lg.Warn(
  95. "failed to read Raft message",
  96. zap.String("local-member-id", h.localID.String()),
  97. zap.Error(err),
  98. )
  99. } else {
  100. plog.Errorf("failed to read raft message (%v)", err)
  101. }
  102. http.Error(w, "error reading raft message", http.StatusBadRequest)
  103. recvFailures.WithLabelValues(r.RemoteAddr).Inc()
  104. return
  105. }
  106. var m raftpb.Message
  107. if err := m.Unmarshal(b); err != nil {
  108. if h.lg != nil {
  109. h.lg.Warn(
  110. "failed to unmarshal Raft message",
  111. zap.String("local-member-id", h.localID.String()),
  112. zap.Error(err),
  113. )
  114. } else {
  115. plog.Errorf("failed to unmarshal raft message (%v)", err)
  116. }
  117. http.Error(w, "error unmarshalling raft message", http.StatusBadRequest)
  118. recvFailures.WithLabelValues(r.RemoteAddr).Inc()
  119. return
  120. }
  121. receivedBytes.WithLabelValues(types.ID(m.From).String()).Add(float64(len(b)))
  122. if err := h.r.Process(context.TODO(), m); err != nil {
  123. switch v := err.(type) {
  124. case writerToResponse:
  125. v.WriteTo(w)
  126. default:
  127. if h.lg != nil {
  128. h.lg.Warn(
  129. "failed to process Raft message",
  130. zap.String("local-member-id", h.localID.String()),
  131. zap.Error(err),
  132. )
  133. } else {
  134. plog.Warningf("failed to process raft message (%v)", err)
  135. }
  136. http.Error(w, "error processing raft message", http.StatusInternalServerError)
  137. w.(http.Flusher).Flush()
  138. // disconnect the http stream
  139. panic(err)
  140. }
  141. return
  142. }
  143. // Write StatusNoContent header after the message has been processed by
  144. // raft, which facilitates the client to report MsgSnap status.
  145. w.WriteHeader(http.StatusNoContent)
  146. }
  147. type snapshotHandler struct {
  148. lg *zap.Logger
  149. tr Transporter
  150. r Raft
  151. snapshotter *snap.Snapshotter
  152. localID types.ID
  153. cid types.ID
  154. }
  155. func newSnapshotHandler(t *Transport, r Raft, snapshotter *snap.Snapshotter, cid types.ID) http.Handler {
  156. return &snapshotHandler{
  157. lg: t.Logger,
  158. tr: t,
  159. r: r,
  160. snapshotter: snapshotter,
  161. localID: t.ID,
  162. cid: cid,
  163. }
  164. }
  165. const unknownSnapshotSender = "UNKNOWN_SNAPSHOT_SENDER"
  166. // ServeHTTP serves HTTP request to receive and process snapshot message.
  167. //
  168. // If request sender dies without closing underlying TCP connection,
  169. // the handler will keep waiting for the request body until TCP keepalive
  170. // finds out that the connection is broken after several minutes.
  171. // This is acceptable because
  172. // 1. snapshot messages sent through other TCP connections could still be
  173. // received and processed.
  174. // 2. this case should happen rarely, so no further optimization is done.
  175. func (h *snapshotHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
  176. start := time.Now()
  177. if r.Method != "POST" {
  178. w.Header().Set("Allow", "POST")
  179. http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
  180. snapshotReceiveFailures.WithLabelValues(unknownSnapshotSender).Inc()
  181. return
  182. }
  183. w.Header().Set("X-Etcd-Cluster-ID", h.cid.String())
  184. if err := checkClusterCompatibilityFromHeader(h.lg, h.localID, r.Header, h.cid); err != nil {
  185. http.Error(w, err.Error(), http.StatusPreconditionFailed)
  186. snapshotReceiveFailures.WithLabelValues(unknownSnapshotSender).Inc()
  187. return
  188. }
  189. addRemoteFromRequest(h.tr, r)
  190. dec := &messageDecoder{r: r.Body}
  191. // let snapshots be very large since they can exceed 512MB for large installations
  192. m, err := dec.decodeLimit(uint64(1 << 63))
  193. from := types.ID(m.From).String()
  194. if err != nil {
  195. msg := fmt.Sprintf("failed to decode raft message (%v)", err)
  196. if h.lg != nil {
  197. h.lg.Warn(
  198. "failed to decode Raft message",
  199. zap.String("local-member-id", h.localID.String()),
  200. zap.String("remote-snapshot-sender-id", from),
  201. zap.Error(err),
  202. )
  203. } else {
  204. plog.Error(msg)
  205. }
  206. http.Error(w, msg, http.StatusBadRequest)
  207. recvFailures.WithLabelValues(r.RemoteAddr).Inc()
  208. snapshotReceiveFailures.WithLabelValues(from).Inc()
  209. return
  210. }
  211. msgSize := m.Size()
  212. receivedBytes.WithLabelValues(from).Add(float64(msgSize))
  213. if m.Type != raftpb.MsgSnap {
  214. if h.lg != nil {
  215. h.lg.Warn(
  216. "unexpected Raft message type",
  217. zap.String("local-member-id", h.localID.String()),
  218. zap.String("remote-snapshot-sender-id", from),
  219. zap.String("message-type", m.Type.String()),
  220. )
  221. } else {
  222. plog.Errorf("unexpected raft message type %s on snapshot path", m.Type)
  223. }
  224. http.Error(w, "wrong raft message type", http.StatusBadRequest)
  225. snapshotReceiveFailures.WithLabelValues(from).Inc()
  226. return
  227. }
  228. snapshotReceiveInflights.WithLabelValues(from).Inc()
  229. defer func() {
  230. snapshotReceiveInflights.WithLabelValues(from).Dec()
  231. }()
  232. if h.lg != nil {
  233. h.lg.Info(
  234. "receiving database snapshot",
  235. zap.String("local-member-id", h.localID.String()),
  236. zap.String("remote-snapshot-sender-id", from),
  237. zap.Uint64("incoming-snapshot-index", m.Snapshot.Metadata.Index),
  238. zap.Int("incoming-snapshot-message-size-bytes", msgSize),
  239. zap.String("incoming-snapshot-message-size", humanize.Bytes(uint64(msgSize))),
  240. )
  241. } else {
  242. plog.Infof("receiving database snapshot [index:%d, from %s] ...", m.Snapshot.Metadata.Index, types.ID(m.From))
  243. }
  244. // save incoming database snapshot.
  245. n, err := h.snapshotter.SaveDBFrom(r.Body, m.Snapshot.Metadata.Index)
  246. if err != nil {
  247. msg := fmt.Sprintf("failed to save KV snapshot (%v)", err)
  248. if h.lg != nil {
  249. h.lg.Warn(
  250. "failed to save incoming database snapshot",
  251. zap.String("local-member-id", h.localID.String()),
  252. zap.String("remote-snapshot-sender-id", from),
  253. zap.Uint64("incoming-snapshot-index", m.Snapshot.Metadata.Index),
  254. zap.Error(err),
  255. )
  256. } else {
  257. plog.Error(msg)
  258. }
  259. http.Error(w, msg, http.StatusInternalServerError)
  260. snapshotReceiveFailures.WithLabelValues(from).Inc()
  261. return
  262. }
  263. receivedBytes.WithLabelValues(from).Add(float64(n))
  264. if h.lg != nil {
  265. h.lg.Info(
  266. "received and saved database snapshot",
  267. zap.String("local-member-id", h.localID.String()),
  268. zap.String("remote-snapshot-sender-id", from),
  269. zap.Uint64("incoming-snapshot-index", m.Snapshot.Metadata.Index),
  270. zap.Int64("incoming-snapshot-size-bytes", n),
  271. zap.String("incoming-snapshot-size", humanize.Bytes(uint64(n))),
  272. )
  273. } else {
  274. plog.Infof("received and saved database snapshot [index: %d, from: %s] successfully", m.Snapshot.Metadata.Index, types.ID(m.From))
  275. }
  276. if err := h.r.Process(context.TODO(), m); err != nil {
  277. switch v := err.(type) {
  278. // Process may return writerToResponse error when doing some
  279. // additional checks before calling raft.Node.Step.
  280. case writerToResponse:
  281. v.WriteTo(w)
  282. default:
  283. msg := fmt.Sprintf("failed to process raft message (%v)", err)
  284. if h.lg != nil {
  285. h.lg.Warn(
  286. "failed to process Raft message",
  287. zap.String("local-member-id", h.localID.String()),
  288. zap.String("remote-snapshot-sender-id", from),
  289. zap.Error(err),
  290. )
  291. } else {
  292. plog.Error(msg)
  293. }
  294. http.Error(w, msg, http.StatusInternalServerError)
  295. snapshotReceiveFailures.WithLabelValues(from).Inc()
  296. }
  297. return
  298. }
  299. // Write StatusNoContent header after the message has been processed by
  300. // raft, which facilitates the client to report MsgSnap status.
  301. w.WriteHeader(http.StatusNoContent)
  302. snapshotReceive.WithLabelValues(from).Inc()
  303. snapshotReceiveSeconds.WithLabelValues(from).Observe(time.Since(start).Seconds())
  304. }
  305. type streamHandler struct {
  306. lg *zap.Logger
  307. tr *Transport
  308. peerGetter peerGetter
  309. r Raft
  310. id types.ID
  311. cid types.ID
  312. }
  313. func newStreamHandler(t *Transport, pg peerGetter, r Raft, id, cid types.ID) http.Handler {
  314. return &streamHandler{
  315. lg: t.Logger,
  316. tr: t,
  317. peerGetter: pg,
  318. r: r,
  319. id: id,
  320. cid: cid,
  321. }
  322. }
  323. func (h *streamHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
  324. if r.Method != "GET" {
  325. w.Header().Set("Allow", "GET")
  326. http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
  327. return
  328. }
  329. w.Header().Set("X-Server-Version", version.Version)
  330. w.Header().Set("X-Etcd-Cluster-ID", h.cid.String())
  331. if err := checkClusterCompatibilityFromHeader(h.lg, h.tr.ID, r.Header, h.cid); err != nil {
  332. http.Error(w, err.Error(), http.StatusPreconditionFailed)
  333. return
  334. }
  335. var t streamType
  336. switch path.Dir(r.URL.Path) {
  337. case streamTypeMsgAppV2.endpoint():
  338. t = streamTypeMsgAppV2
  339. case streamTypeMessage.endpoint():
  340. t = streamTypeMessage
  341. default:
  342. if h.lg != nil {
  343. h.lg.Debug(
  344. "ignored unexpected streaming request path",
  345. zap.String("local-member-id", h.tr.ID.String()),
  346. zap.String("remote-peer-id-stream-handler", h.id.String()),
  347. zap.String("path", r.URL.Path),
  348. )
  349. } else {
  350. plog.Debugf("ignored unexpected streaming request path %s", r.URL.Path)
  351. }
  352. http.Error(w, "invalid path", http.StatusNotFound)
  353. return
  354. }
  355. fromStr := path.Base(r.URL.Path)
  356. from, err := types.IDFromString(fromStr)
  357. if err != nil {
  358. if h.lg != nil {
  359. h.lg.Warn(
  360. "failed to parse path into ID",
  361. zap.String("local-member-id", h.tr.ID.String()),
  362. zap.String("remote-peer-id-stream-handler", h.id.String()),
  363. zap.String("path", fromStr),
  364. zap.Error(err),
  365. )
  366. } else {
  367. plog.Errorf("failed to parse from %s into ID (%v)", fromStr, err)
  368. }
  369. http.Error(w, "invalid from", http.StatusNotFound)
  370. return
  371. }
  372. if h.r.IsIDRemoved(uint64(from)) {
  373. if h.lg != nil {
  374. h.lg.Warn(
  375. "rejected stream from remote peer because it was removed",
  376. zap.String("local-member-id", h.tr.ID.String()),
  377. zap.String("remote-peer-id-stream-handler", h.id.String()),
  378. zap.String("remote-peer-id-from", from.String()),
  379. )
  380. } else {
  381. plog.Warningf("rejected the stream from peer %s since it was removed", from)
  382. }
  383. http.Error(w, "removed member", http.StatusGone)
  384. return
  385. }
  386. p := h.peerGetter.Get(from)
  387. if p == nil {
  388. // This may happen in following cases:
  389. // 1. user starts a remote peer that belongs to a different cluster
  390. // with the same cluster ID.
  391. // 2. local etcd falls behind of the cluster, and cannot recognize
  392. // the members that joined after its current progress.
  393. if urls := r.Header.Get("X-PeerURLs"); urls != "" {
  394. h.tr.AddRemote(from, strings.Split(urls, ","))
  395. }
  396. if h.lg != nil {
  397. h.lg.Warn(
  398. "failed to find remote peer in cluster",
  399. zap.String("local-member-id", h.tr.ID.String()),
  400. zap.String("remote-peer-id-stream-handler", h.id.String()),
  401. zap.String("remote-peer-id-from", from.String()),
  402. zap.String("cluster-id", h.cid.String()),
  403. )
  404. } else {
  405. plog.Errorf("failed to find member %s in cluster %s", from, h.cid)
  406. }
  407. http.Error(w, "error sender not found", http.StatusNotFound)
  408. return
  409. }
  410. wto := h.id.String()
  411. if gto := r.Header.Get("X-Raft-To"); gto != wto {
  412. if h.lg != nil {
  413. h.lg.Warn(
  414. "ignored streaming request; ID mismatch",
  415. zap.String("local-member-id", h.tr.ID.String()),
  416. zap.String("remote-peer-id-stream-handler", h.id.String()),
  417. zap.String("remote-peer-id-header", gto),
  418. zap.String("remote-peer-id-from", from.String()),
  419. zap.String("cluster-id", h.cid.String()),
  420. )
  421. } else {
  422. plog.Errorf("streaming request ignored (ID mismatch got %s want %s)", gto, wto)
  423. }
  424. http.Error(w, "to field mismatch", http.StatusPreconditionFailed)
  425. return
  426. }
  427. w.WriteHeader(http.StatusOK)
  428. w.(http.Flusher).Flush()
  429. c := newCloseNotifier()
  430. conn := &outgoingConn{
  431. t: t,
  432. Writer: w,
  433. Flusher: w.(http.Flusher),
  434. Closer: c,
  435. localID: h.tr.ID,
  436. peerID: h.id,
  437. }
  438. p.attachOutgoingConn(conn)
  439. <-c.closeNotify()
  440. }
  441. // checkClusterCompatibilityFromHeader checks the cluster compatibility of
  442. // the local member from the given header.
  443. // It checks whether the version of local member is compatible with
  444. // the versions in the header, and whether the cluster ID of local member
  445. // matches the one in the header.
  446. func checkClusterCompatibilityFromHeader(lg *zap.Logger, localID types.ID, header http.Header, cid types.ID) error {
  447. remoteName := header.Get("X-Server-From")
  448. remoteServer := serverVersion(header)
  449. remoteVs := ""
  450. if remoteServer != nil {
  451. remoteVs = remoteServer.String()
  452. }
  453. remoteMinClusterVer := minClusterVersion(header)
  454. remoteMinClusterVs := ""
  455. if remoteMinClusterVer != nil {
  456. remoteMinClusterVs = remoteMinClusterVer.String()
  457. }
  458. localServer, localMinCluster, err := checkVersionCompatibility(remoteName, remoteServer, remoteMinClusterVer)
  459. localVs := ""
  460. if localServer != nil {
  461. localVs = localServer.String()
  462. }
  463. localMinClusterVs := ""
  464. if localMinCluster != nil {
  465. localMinClusterVs = localMinCluster.String()
  466. }
  467. if err != nil {
  468. if lg != nil {
  469. lg.Warn(
  470. "failed to check version compatibility",
  471. zap.String("local-member-id", localID.String()),
  472. zap.String("local-member-cluster-id", cid.String()),
  473. zap.String("local-member-server-version", localVs),
  474. zap.String("local-member-server-minimum-cluster-version", localMinClusterVs),
  475. zap.String("remote-peer-server-name", remoteName),
  476. zap.String("remote-peer-server-version", remoteVs),
  477. zap.String("remote-peer-server-minimum-cluster-version", remoteMinClusterVs),
  478. zap.Error(err),
  479. )
  480. } else {
  481. plog.Errorf("request version incompatibility (%v)", err)
  482. }
  483. return errIncompatibleVersion
  484. }
  485. if gcid := header.Get("X-Etcd-Cluster-ID"); gcid != cid.String() {
  486. if lg != nil {
  487. lg.Warn(
  488. "request cluster ID mismatch",
  489. zap.String("local-member-id", localID.String()),
  490. zap.String("local-member-cluster-id", cid.String()),
  491. zap.String("local-member-server-version", localVs),
  492. zap.String("local-member-server-minimum-cluster-version", localMinClusterVs),
  493. zap.String("remote-peer-server-name", remoteName),
  494. zap.String("remote-peer-server-version", remoteVs),
  495. zap.String("remote-peer-server-minimum-cluster-version", remoteMinClusterVs),
  496. zap.String("remote-peer-cluster-id", gcid),
  497. )
  498. } else {
  499. plog.Errorf("request cluster ID mismatch (got %s want %s)", gcid, cid)
  500. }
  501. return errClusterIDMismatch
  502. }
  503. return nil
  504. }
  505. type closeNotifier struct {
  506. done chan struct{}
  507. }
  508. func newCloseNotifier() *closeNotifier {
  509. return &closeNotifier{
  510. done: make(chan struct{}),
  511. }
  512. }
  513. func (n *closeNotifier) Close() error {
  514. close(n.done)
  515. return nil
  516. }
  517. func (n *closeNotifier) closeNotify() <-chan struct{} { return n.done }