remote_runtime.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513
  1. /*
  2. Copyright 2016 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package remote
  14. import (
  15. "context"
  16. "errors"
  17. "fmt"
  18. "strings"
  19. "time"
  20. "google.golang.org/grpc"
  21. "k8s.io/klog"
  22. internalapi "k8s.io/cri-api/pkg/apis"
  23. runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1alpha2"
  24. "k8s.io/kubernetes/pkg/kubelet/util"
  25. "k8s.io/kubernetes/pkg/kubelet/util/logreduction"
  26. utilexec "k8s.io/utils/exec"
  27. )
  28. // RemoteRuntimeService is a gRPC implementation of internalapi.RuntimeService.
  29. type RemoteRuntimeService struct {
  30. timeout time.Duration
  31. runtimeClient runtimeapi.RuntimeServiceClient
  32. // Cache last per-container error message to reduce log spam
  33. logReduction *logreduction.LogReduction
  34. }
  // identicalErrorDelay is how often an identical error may be reported; it is
  // the interval handed to logreduction.NewLogReduction.
  const identicalErrorDelay = time.Minute
  39. // NewRemoteRuntimeService creates a new internalapi.RuntimeService.
  40. func NewRemoteRuntimeService(endpoint string, connectionTimeout time.Duration) (internalapi.RuntimeService, error) {
  41. klog.V(3).Infof("Connecting to runtime service %s", endpoint)
  42. addr, dialer, err := util.GetAddressAndDialer(endpoint)
  43. if err != nil {
  44. return nil, err
  45. }
  46. ctx, cancel := context.WithTimeout(context.Background(), connectionTimeout)
  47. defer cancel()
  48. conn, err := grpc.DialContext(ctx, addr, grpc.WithInsecure(), grpc.WithContextDialer(dialer), grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(maxMsgSize)))
  49. if err != nil {
  50. klog.Errorf("Connect remote runtime %s failed: %v", addr, err)
  51. return nil, err
  52. }
  53. return &RemoteRuntimeService{
  54. timeout: connectionTimeout,
  55. runtimeClient: runtimeapi.NewRuntimeServiceClient(conn),
  56. logReduction: logreduction.NewLogReduction(identicalErrorDelay),
  57. }, nil
  58. }
  59. // Version returns the runtime name, runtime version and runtime API version.
  60. func (r *RemoteRuntimeService) Version(apiVersion string) (*runtimeapi.VersionResponse, error) {
  61. ctx, cancel := getContextWithTimeout(r.timeout)
  62. defer cancel()
  63. typedVersion, err := r.runtimeClient.Version(ctx, &runtimeapi.VersionRequest{
  64. Version: apiVersion,
  65. })
  66. if err != nil {
  67. klog.Errorf("Version from runtime service failed: %v", err)
  68. return nil, err
  69. }
  70. if typedVersion.Version == "" || typedVersion.RuntimeName == "" || typedVersion.RuntimeApiVersion == "" || typedVersion.RuntimeVersion == "" {
  71. return nil, fmt.Errorf("not all fields are set in VersionResponse (%q)", *typedVersion)
  72. }
  73. return typedVersion, err
  74. }
  75. // RunPodSandbox creates and starts a pod-level sandbox. Runtimes should ensure
  76. // the sandbox is in ready state.
  77. func (r *RemoteRuntimeService) RunPodSandbox(config *runtimeapi.PodSandboxConfig, runtimeHandler string) (string, error) {
  78. // Use 2 times longer timeout for sandbox operation (4 mins by default)
  79. // TODO: Make the pod sandbox timeout configurable.
  80. ctx, cancel := getContextWithTimeout(r.timeout * 2)
  81. defer cancel()
  82. resp, err := r.runtimeClient.RunPodSandbox(ctx, &runtimeapi.RunPodSandboxRequest{
  83. Config: config,
  84. RuntimeHandler: runtimeHandler,
  85. })
  86. if err != nil {
  87. klog.Errorf("RunPodSandbox from runtime service failed: %v", err)
  88. return "", err
  89. }
  90. if resp.PodSandboxId == "" {
  91. errorMessage := fmt.Sprintf("PodSandboxId is not set for sandbox %q", config.GetMetadata())
  92. klog.Errorf("RunPodSandbox failed: %s", errorMessage)
  93. return "", errors.New(errorMessage)
  94. }
  95. return resp.PodSandboxId, nil
  96. }
  97. // StopPodSandbox stops the sandbox. If there are any running containers in the
  98. // sandbox, they should be forced to termination.
  99. func (r *RemoteRuntimeService) StopPodSandbox(podSandBoxID string) error {
  100. ctx, cancel := getContextWithTimeout(r.timeout)
  101. defer cancel()
  102. _, err := r.runtimeClient.StopPodSandbox(ctx, &runtimeapi.StopPodSandboxRequest{
  103. PodSandboxId: podSandBoxID,
  104. })
  105. if err != nil {
  106. klog.Errorf("StopPodSandbox %q from runtime service failed: %v", podSandBoxID, err)
  107. return err
  108. }
  109. return nil
  110. }
  111. // RemovePodSandbox removes the sandbox. If there are any containers in the
  112. // sandbox, they should be forcibly removed.
  113. func (r *RemoteRuntimeService) RemovePodSandbox(podSandBoxID string) error {
  114. ctx, cancel := getContextWithTimeout(r.timeout)
  115. defer cancel()
  116. _, err := r.runtimeClient.RemovePodSandbox(ctx, &runtimeapi.RemovePodSandboxRequest{
  117. PodSandboxId: podSandBoxID,
  118. })
  119. if err != nil {
  120. klog.Errorf("RemovePodSandbox %q from runtime service failed: %v", podSandBoxID, err)
  121. return err
  122. }
  123. return nil
  124. }
  125. // PodSandboxStatus returns the status of the PodSandbox.
  126. func (r *RemoteRuntimeService) PodSandboxStatus(podSandBoxID string) (*runtimeapi.PodSandboxStatus, error) {
  127. ctx, cancel := getContextWithTimeout(r.timeout)
  128. defer cancel()
  129. resp, err := r.runtimeClient.PodSandboxStatus(ctx, &runtimeapi.PodSandboxStatusRequest{
  130. PodSandboxId: podSandBoxID,
  131. })
  132. if err != nil {
  133. return nil, err
  134. }
  135. if resp.Status != nil {
  136. if err := verifySandboxStatus(resp.Status); err != nil {
  137. return nil, err
  138. }
  139. }
  140. return resp.Status, nil
  141. }
  142. // ListPodSandbox returns a list of PodSandboxes.
  143. func (r *RemoteRuntimeService) ListPodSandbox(filter *runtimeapi.PodSandboxFilter) ([]*runtimeapi.PodSandbox, error) {
  144. ctx, cancel := getContextWithTimeout(r.timeout)
  145. defer cancel()
  146. resp, err := r.runtimeClient.ListPodSandbox(ctx, &runtimeapi.ListPodSandboxRequest{
  147. Filter: filter,
  148. })
  149. if err != nil {
  150. klog.Errorf("ListPodSandbox with filter %+v from runtime service failed: %v", filter, err)
  151. return nil, err
  152. }
  153. return resp.Items, nil
  154. }
  155. // CreateContainer creates a new container in the specified PodSandbox.
  156. func (r *RemoteRuntimeService) CreateContainer(podSandBoxID string, config *runtimeapi.ContainerConfig, sandboxConfig *runtimeapi.PodSandboxConfig) (string, error) {
  157. ctx, cancel := getContextWithTimeout(r.timeout)
  158. defer cancel()
  159. resp, err := r.runtimeClient.CreateContainer(ctx, &runtimeapi.CreateContainerRequest{
  160. PodSandboxId: podSandBoxID,
  161. Config: config,
  162. SandboxConfig: sandboxConfig,
  163. })
  164. if err != nil {
  165. klog.Errorf("CreateContainer in sandbox %q from runtime service failed: %v", podSandBoxID, err)
  166. return "", err
  167. }
  168. if resp.ContainerId == "" {
  169. errorMessage := fmt.Sprintf("ContainerId is not set for container %q", config.GetMetadata())
  170. klog.Errorf("CreateContainer failed: %s", errorMessage)
  171. return "", errors.New(errorMessage)
  172. }
  173. return resp.ContainerId, nil
  174. }
  175. // StartContainer starts the container.
  176. func (r *RemoteRuntimeService) StartContainer(containerID string) error {
  177. ctx, cancel := getContextWithTimeout(r.timeout)
  178. defer cancel()
  179. _, err := r.runtimeClient.StartContainer(ctx, &runtimeapi.StartContainerRequest{
  180. ContainerId: containerID,
  181. })
  182. if err != nil {
  183. klog.Errorf("StartContainer %q from runtime service failed: %v", containerID, err)
  184. return err
  185. }
  186. return nil
  187. }
  188. // StopContainer stops a running container with a grace period (i.e., timeout).
  189. func (r *RemoteRuntimeService) StopContainer(containerID string, timeout int64) error {
  190. // Use timeout + default timeout (2 minutes) as timeout to leave extra time
  191. // for SIGKILL container and request latency.
  192. t := r.timeout + time.Duration(timeout)*time.Second
  193. ctx, cancel := getContextWithTimeout(t)
  194. defer cancel()
  195. r.logReduction.ClearID(containerID)
  196. _, err := r.runtimeClient.StopContainer(ctx, &runtimeapi.StopContainerRequest{
  197. ContainerId: containerID,
  198. Timeout: timeout,
  199. })
  200. if err != nil {
  201. klog.Errorf("StopContainer %q from runtime service failed: %v", containerID, err)
  202. return err
  203. }
  204. return nil
  205. }
  206. // RemoveContainer removes the container. If the container is running, the container
  207. // should be forced to removal.
  208. func (r *RemoteRuntimeService) RemoveContainer(containerID string) error {
  209. ctx, cancel := getContextWithTimeout(r.timeout)
  210. defer cancel()
  211. r.logReduction.ClearID(containerID)
  212. _, err := r.runtimeClient.RemoveContainer(ctx, &runtimeapi.RemoveContainerRequest{
  213. ContainerId: containerID,
  214. })
  215. if err != nil {
  216. klog.Errorf("RemoveContainer %q from runtime service failed: %v", containerID, err)
  217. return err
  218. }
  219. return nil
  220. }
  221. // ListContainers lists containers by filters.
  222. func (r *RemoteRuntimeService) ListContainers(filter *runtimeapi.ContainerFilter) ([]*runtimeapi.Container, error) {
  223. ctx, cancel := getContextWithTimeout(r.timeout)
  224. defer cancel()
  225. resp, err := r.runtimeClient.ListContainers(ctx, &runtimeapi.ListContainersRequest{
  226. Filter: filter,
  227. })
  228. if err != nil {
  229. klog.Errorf("ListContainers with filter %+v from runtime service failed: %v", filter, err)
  230. return nil, err
  231. }
  232. return resp.Containers, nil
  233. }
  234. // ContainerStatus returns the container status.
  235. func (r *RemoteRuntimeService) ContainerStatus(containerID string) (*runtimeapi.ContainerStatus, error) {
  236. ctx, cancel := getContextWithTimeout(r.timeout)
  237. defer cancel()
  238. resp, err := r.runtimeClient.ContainerStatus(ctx, &runtimeapi.ContainerStatusRequest{
  239. ContainerId: containerID,
  240. })
  241. if err != nil {
  242. // Don't spam the log with endless messages about the same failure.
  243. if r.logReduction.ShouldMessageBePrinted(err.Error(), containerID) {
  244. klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
  245. }
  246. return nil, err
  247. }
  248. r.logReduction.ClearID(containerID)
  249. if resp.Status != nil {
  250. if err := verifyContainerStatus(resp.Status); err != nil {
  251. klog.Errorf("ContainerStatus of %q failed: %v", containerID, err)
  252. return nil, err
  253. }
  254. }
  255. return resp.Status, nil
  256. }
  257. // UpdateContainerResources updates a containers resource config
  258. func (r *RemoteRuntimeService) UpdateContainerResources(containerID string, resources *runtimeapi.LinuxContainerResources) error {
  259. ctx, cancel := getContextWithTimeout(r.timeout)
  260. defer cancel()
  261. _, err := r.runtimeClient.UpdateContainerResources(ctx, &runtimeapi.UpdateContainerResourcesRequest{
  262. ContainerId: containerID,
  263. Linux: resources,
  264. })
  265. if err != nil {
  266. klog.Errorf("UpdateContainerResources %q from runtime service failed: %v", containerID, err)
  267. return err
  268. }
  269. return nil
  270. }
  271. // ExecSync executes a command in the container, and returns the stdout output.
  272. // If command exits with a non-zero exit code, an error is returned.
  273. func (r *RemoteRuntimeService) ExecSync(containerID string, cmd []string, timeout time.Duration) (stdout []byte, stderr []byte, err error) {
  274. // Do not set timeout when timeout is 0.
  275. var ctx context.Context
  276. var cancel context.CancelFunc
  277. if timeout != 0 {
  278. // Use timeout + default timeout (2 minutes) as timeout to leave some time for
  279. // the runtime to do cleanup.
  280. ctx, cancel = getContextWithTimeout(r.timeout + timeout)
  281. } else {
  282. ctx, cancel = getContextWithCancel()
  283. }
  284. defer cancel()
  285. timeoutSeconds := int64(timeout.Seconds())
  286. req := &runtimeapi.ExecSyncRequest{
  287. ContainerId: containerID,
  288. Cmd: cmd,
  289. Timeout: timeoutSeconds,
  290. }
  291. resp, err := r.runtimeClient.ExecSync(ctx, req)
  292. if err != nil {
  293. klog.Errorf("ExecSync %s '%s' from runtime service failed: %v", containerID, strings.Join(cmd, " "), err)
  294. return nil, nil, err
  295. }
  296. err = nil
  297. if resp.ExitCode != 0 {
  298. err = utilexec.CodeExitError{
  299. Err: fmt.Errorf("command '%s' exited with %d: %s", strings.Join(cmd, " "), resp.ExitCode, resp.Stderr),
  300. Code: int(resp.ExitCode),
  301. }
  302. }
  303. return resp.Stdout, resp.Stderr, err
  304. }
  305. // Exec prepares a streaming endpoint to execute a command in the container, and returns the address.
  306. func (r *RemoteRuntimeService) Exec(req *runtimeapi.ExecRequest) (*runtimeapi.ExecResponse, error) {
  307. ctx, cancel := getContextWithTimeout(r.timeout)
  308. defer cancel()
  309. resp, err := r.runtimeClient.Exec(ctx, req)
  310. if err != nil {
  311. klog.Errorf("Exec %s '%s' from runtime service failed: %v", req.ContainerId, strings.Join(req.Cmd, " "), err)
  312. return nil, err
  313. }
  314. if resp.Url == "" {
  315. errorMessage := "URL is not set"
  316. klog.Errorf("Exec failed: %s", errorMessage)
  317. return nil, errors.New(errorMessage)
  318. }
  319. return resp, nil
  320. }
  321. // Attach prepares a streaming endpoint to attach to a running container, and returns the address.
  322. func (r *RemoteRuntimeService) Attach(req *runtimeapi.AttachRequest) (*runtimeapi.AttachResponse, error) {
  323. ctx, cancel := getContextWithTimeout(r.timeout)
  324. defer cancel()
  325. resp, err := r.runtimeClient.Attach(ctx, req)
  326. if err != nil {
  327. klog.Errorf("Attach %s from runtime service failed: %v", req.ContainerId, err)
  328. return nil, err
  329. }
  330. if resp.Url == "" {
  331. errorMessage := "URL is not set"
  332. klog.Errorf("Exec failed: %s", errorMessage)
  333. return nil, errors.New(errorMessage)
  334. }
  335. return resp, nil
  336. }
  337. // PortForward prepares a streaming endpoint to forward ports from a PodSandbox, and returns the address.
  338. func (r *RemoteRuntimeService) PortForward(req *runtimeapi.PortForwardRequest) (*runtimeapi.PortForwardResponse, error) {
  339. ctx, cancel := getContextWithTimeout(r.timeout)
  340. defer cancel()
  341. resp, err := r.runtimeClient.PortForward(ctx, req)
  342. if err != nil {
  343. klog.Errorf("PortForward %s from runtime service failed: %v", req.PodSandboxId, err)
  344. return nil, err
  345. }
  346. if resp.Url == "" {
  347. errorMessage := "URL is not set"
  348. klog.Errorf("Exec failed: %s", errorMessage)
  349. return nil, errors.New(errorMessage)
  350. }
  351. return resp, nil
  352. }
  353. // UpdateRuntimeConfig updates the config of a runtime service. The only
  354. // update payload currently supported is the pod CIDR assigned to a node,
  355. // and the runtime service just proxies it down to the network plugin.
  356. func (r *RemoteRuntimeService) UpdateRuntimeConfig(runtimeConfig *runtimeapi.RuntimeConfig) error {
  357. ctx, cancel := getContextWithTimeout(r.timeout)
  358. defer cancel()
  359. // Response doesn't contain anything of interest. This translates to an
  360. // Event notification to the network plugin, which can't fail, so we're
  361. // really looking to surface destination unreachable.
  362. _, err := r.runtimeClient.UpdateRuntimeConfig(ctx, &runtimeapi.UpdateRuntimeConfigRequest{
  363. RuntimeConfig: runtimeConfig,
  364. })
  365. if err != nil {
  366. return err
  367. }
  368. return nil
  369. }
  370. // Status returns the status of the runtime.
  371. func (r *RemoteRuntimeService) Status() (*runtimeapi.RuntimeStatus, error) {
  372. ctx, cancel := getContextWithTimeout(r.timeout)
  373. defer cancel()
  374. resp, err := r.runtimeClient.Status(ctx, &runtimeapi.StatusRequest{})
  375. if err != nil {
  376. klog.Errorf("Status from runtime service failed: %v", err)
  377. return nil, err
  378. }
  379. if resp.Status == nil || len(resp.Status.Conditions) < 2 {
  380. errorMessage := "RuntimeReady or NetworkReady condition are not set"
  381. klog.Errorf("Status failed: %s", errorMessage)
  382. return nil, errors.New(errorMessage)
  383. }
  384. return resp.Status, nil
  385. }
  386. // ContainerStats returns the stats of the container.
  387. func (r *RemoteRuntimeService) ContainerStats(containerID string) (*runtimeapi.ContainerStats, error) {
  388. ctx, cancel := getContextWithTimeout(r.timeout)
  389. defer cancel()
  390. resp, err := r.runtimeClient.ContainerStats(ctx, &runtimeapi.ContainerStatsRequest{
  391. ContainerId: containerID,
  392. })
  393. if err != nil {
  394. if r.logReduction.ShouldMessageBePrinted(err.Error(), containerID) {
  395. klog.Errorf("ContainerStatus %q from runtime service failed: %v", containerID, err)
  396. }
  397. return nil, err
  398. }
  399. r.logReduction.ClearID(containerID)
  400. return resp.GetStats(), nil
  401. }
  402. func (r *RemoteRuntimeService) ListContainerStats(filter *runtimeapi.ContainerStatsFilter) ([]*runtimeapi.ContainerStats, error) {
  403. // Do not set timeout, because writable layer stats collection takes time.
  404. // TODO(random-liu): Should we assume runtime should cache the result, and set timeout here?
  405. ctx, cancel := getContextWithCancel()
  406. defer cancel()
  407. resp, err := r.runtimeClient.ListContainerStats(ctx, &runtimeapi.ListContainerStatsRequest{
  408. Filter: filter,
  409. })
  410. if err != nil {
  411. klog.Errorf("ListContainerStats with filter %+v from runtime service failed: %v", filter, err)
  412. return nil, err
  413. }
  414. return resp.GetStats(), nil
  415. }
  416. func (r *RemoteRuntimeService) ReopenContainerLog(containerID string) error {
  417. ctx, cancel := getContextWithTimeout(r.timeout)
  418. defer cancel()
  419. _, err := r.runtimeClient.ReopenContainerLog(ctx, &runtimeapi.ReopenContainerLogRequest{ContainerId: containerID})
  420. if err != nil {
  421. klog.Errorf("ReopenContainerLog %q from runtime service failed: %v", containerID, err)
  422. return err
  423. }
  424. return nil
  425. }