numa_alignment.go 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. /*
  2. Copyright 2020 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package e2enode
  14. import (
  15. "fmt"
  16. "io/ioutil"
  17. "os"
  18. "path/filepath"
  19. "sort"
  20. "strconv"
  21. "strings"
  22. v1 "k8s.io/api/core/v1"
  23. "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
  24. "k8s.io/kubernetes/test/e2e/framework"
  25. )
  // numaPodResources records, for the resources observed in a container, the
  // NUMA node each CPU and each PCI device belongs to.
  type numaPodResources struct {
  	// CPUToNUMANode maps a CPU ID to the ID of the NUMA node it sits on.
  	CPUToNUMANode map[int]int
  	// PCIDevsToNUMANode maps a PCI device address to the ID of its NUMA node.
  	PCIDevsToNUMANode map[string]int
  }

  // CheckAlignment reports whether every CPU and every PCI device recorded in
  // r is attached to one and the same NUMA node. A set without any resource is
  // considered aligned.
  func (r *numaPodResources) CheckAlignment() bool {
  	// Track "no node seen yet" with a flag rather than a sentinel node ID, so
  	// that the reference node can also be seeded by a device when no CPU is
  	// recorded, and so that no real node value is mistaken for "unset".
  	refNode, seen := 0, false
  	onRefNode := func(node int) bool {
  		if !seen {
  			refNode, seen = node, true
  		}
  		return node == refNode
  	}

  	for _, cpuNode := range r.CPUToNUMANode {
  		if !onRefNode(cpuNode) {
  			return false
  		}
  	}
  	for _, devNode := range r.PCIDevsToNUMANode {
  		if !onRefNode(devNode) {
  			return false
  		}
  	}
  	return true
  }

  // String renders the recorded resources one per line: first the CPUs in
  // ascending CPU ID order, then the PCI devices in ascending address order.
  func (r *numaPodResources) String() string {
  	var b strings.Builder

  	// Map iteration order is random; sort the keys for a stable output.
  	cpuKeys := make([]int, 0, len(r.CPUToNUMANode))
  	for cpu := range r.CPUToNUMANode {
  		cpuKeys = append(cpuKeys, cpu)
  	}
  	sort.Ints(cpuKeys)
  	for _, cpu := range cpuKeys {
  		fmt.Fprintf(&b, "CPU cpu#%03d=%02d\n", cpu, r.CPUToNUMANode[cpu])
  	}

  	pciKeys := make([]string, 0, len(r.PCIDevsToNUMANode))
  	for dev := range r.PCIDevsToNUMANode {
  		pciKeys = append(pciKeys, dev)
  	}
  	sort.Strings(pciKeys)
  	for _, dev := range pciKeys {
  		fmt.Fprintf(&b, "PCI %s=%02d\n", dev, r.PCIDevsToNUMANode[dev])
  	}

  	return b.String()
  }
  69. func getCPUsPerNUMANode(nodeNum int) ([]int, error) {
  70. nodeCPUList, err := ioutil.ReadFile(fmt.Sprintf("/sys/devices/system/node/node%d/cpulist", nodeNum))
  71. if err != nil {
  72. return nil, err
  73. }
  74. cpus, err := cpuset.Parse(strings.TrimSpace(string(nodeCPUList)))
  75. if err != nil {
  76. return nil, err
  77. }
  78. return cpus.ToSlice(), nil
  79. }
  80. func getCPUToNUMANodeMapFromEnv(f *framework.Framework, pod *v1.Pod, cnt *v1.Container, environ map[string]string, numaNodes int) (map[int]int, error) {
  81. var cpuIDs []int
  82. cpuListAllowedEnvVar := "CPULIST_ALLOWED"
  83. for name, value := range environ {
  84. if name == cpuListAllowedEnvVar {
  85. cpus, err := cpuset.Parse(value)
  86. if err != nil {
  87. return nil, err
  88. }
  89. cpuIDs = cpus.ToSlice()
  90. }
  91. }
  92. if len(cpuIDs) == 0 {
  93. return nil, fmt.Errorf("variable %q not found in environ", cpuListAllowedEnvVar)
  94. }
  95. cpusPerNUMA := make(map[int][]int)
  96. for numaNode := 0; numaNode < numaNodes; numaNode++ {
  97. nodeCPUList := f.ExecCommandInContainer(pod.Name, cnt.Name,
  98. "/bin/cat", fmt.Sprintf("/sys/devices/system/node/node%d/cpulist", numaNode))
  99. cpus, err := cpuset.Parse(nodeCPUList)
  100. if err != nil {
  101. return nil, err
  102. }
  103. cpusPerNUMA[numaNode] = cpus.ToSlice()
  104. }
  105. // CPU IDs -> NUMA Node ID
  106. CPUToNUMANode := make(map[int]int)
  107. for nodeNum, cpus := range cpusPerNUMA {
  108. for _, cpu := range cpus {
  109. CPUToNUMANode[cpu] = nodeNum
  110. }
  111. }
  112. // filter out only the allowed CPUs
  113. CPUMap := make(map[int]int)
  114. for _, cpuID := range cpuIDs {
  115. _, ok := CPUToNUMANode[cpuID]
  116. if !ok {
  117. return nil, fmt.Errorf("CPU %d not found on NUMA map: %v", cpuID, CPUToNUMANode)
  118. }
  119. CPUMap[cpuID] = CPUToNUMANode[cpuID]
  120. }
  121. return CPUMap, nil
  122. }
  123. func getPCIDeviceToNumaNodeMapFromEnv(f *framework.Framework, pod *v1.Pod, cnt *v1.Container, environ map[string]string) (map[string]int, error) {
  124. pciDevPrefix := "PCIDEVICE_"
  125. // at this point we don't care which plugin selected the device,
  126. // we only need to know which devices were assigned to the POD.
  127. // Hence, do prefix search for the variable and fetch the device(s).
  128. NUMAPerDev := make(map[string]int)
  129. for name, value := range environ {
  130. if !strings.HasPrefix(name, pciDevPrefix) {
  131. continue
  132. }
  133. // a single plugin can allocate more than a single device
  134. pciDevs := strings.Split(value, ",")
  135. for _, pciDev := range pciDevs {
  136. pciDevNUMANode := f.ExecCommandInContainer(pod.Name, cnt.Name,
  137. "/bin/cat", fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", pciDev))
  138. NUMAPerDev[pciDev] = numaNodeFromSysFsEntry(pciDevNUMANode)
  139. }
  140. }
  141. return NUMAPerDev, nil
  142. }
  // makeEnvMap turns text made of newline-separated NAME=VALUE lines into a
  // map from NAME to VALUE. Empty lines are skipped and only the first "="
  // separates name from value, so values may themselves contain "=". A
  // non-empty line without any "=" makes the whole call fail.
  func makeEnvMap(logs string) (map[string]string, error) {
  	envMap := map[string]string{}
  	for _, line := range strings.Split(logs, "\n") {
  		if line == "" {
  			continue
  		}
  		sep := strings.Index(line, "=")
  		if sep < 0 {
  			return nil, fmt.Errorf("unable to split %q", line)
  		}
  		envMap[line[:sep]] = line[sep+1:]
  	}
  	return envMap, nil
  }
  // testEnvInfo bundles the environment parameters the NUMA alignment helpers
  // need.
  type testEnvInfo struct {
  	// numaNodes is the number of NUMA nodes to inspect; node IDs
  	// 0..numaNodes-1 are queried.
  	numaNodes int
  	// sriovResourceName is the resource name looked up in a container's
  	// resource requests to decide whether the container asked for devices.
  	sriovResourceName string
  	// policy is not read by the helpers in this part of the file.
  	// NOTE(review): presumably the topology manager policy under test — confirm.
  	policy string
  }
  163. func containerWantsDevices(cnt *v1.Container, envInfo *testEnvInfo) bool {
  164. _, found := cnt.Resources.Requests[v1.ResourceName(envInfo.sriovResourceName)]
  165. return found
  166. }
  167. func checkNUMAAlignment(f *framework.Framework, pod *v1.Pod, cnt *v1.Container, logs string, envInfo *testEnvInfo) (*numaPodResources, error) {
  168. var err error
  169. podEnv, err := makeEnvMap(logs)
  170. if err != nil {
  171. return nil, err
  172. }
  173. CPUToNUMANode, err := getCPUToNUMANodeMapFromEnv(f, pod, cnt, podEnv, envInfo.numaNodes)
  174. if err != nil {
  175. return nil, err
  176. }
  177. PCIDevsToNUMANode, err := getPCIDeviceToNumaNodeMapFromEnv(f, pod, cnt, podEnv)
  178. if err != nil {
  179. return nil, err
  180. }
  181. if containerWantsDevices(cnt, envInfo) && len(PCIDevsToNUMANode) == 0 {
  182. return nil, fmt.Errorf("no PCI devices found in environ")
  183. }
  184. numaRes := numaPodResources{
  185. CPUToNUMANode: CPUToNUMANode,
  186. PCIDevsToNUMANode: PCIDevsToNUMANode,
  187. }
  188. aligned := numaRes.CheckAlignment()
  189. if !aligned {
  190. err = fmt.Errorf("NUMA resources not aligned")
  191. }
  192. return &numaRes, err
  193. }
  // pciDeviceInfo describes one PCI device found while scanning a sysfs PCI
  // devices directory.
  type pciDeviceInfo struct {
  	// Address is the name of the device's directory entry.
  	Address string
  	// NUMANode is the value read from the device's numa_node file.
  	NUMANode int
  	// IsPhysFn is true when the device directory has a sriov_numvfs entry.
  	IsPhysFn bool
  	// IsVFn is true when the device directory has a physfn entry.
  	IsVFn bool
  }
  200. func getPCIDeviceInfo(sysPCIDir string) ([]pciDeviceInfo, error) {
  201. var pciDevs []pciDeviceInfo
  202. entries, err := ioutil.ReadDir(sysPCIDir)
  203. if err != nil {
  204. return nil, err
  205. }
  206. for _, entry := range entries {
  207. isPhysFn := false
  208. isVFn := false
  209. if _, err := os.Stat(filepath.Join(sysPCIDir, entry.Name(), "sriov_numvfs")); err == nil {
  210. isPhysFn = true
  211. } else if !os.IsNotExist(err) {
  212. // unexpected error. Bail out
  213. return nil, err
  214. }
  215. if _, err := os.Stat(filepath.Join(sysPCIDir, entry.Name(), "physfn")); err == nil {
  216. isVFn = true
  217. } else if !os.IsNotExist(err) {
  218. // unexpected error. Bail out
  219. return nil, err
  220. }
  221. content, err := ioutil.ReadFile(filepath.Join(sysPCIDir, entry.Name(), "numa_node"))
  222. if err != nil {
  223. return nil, err
  224. }
  225. pciDevs = append(pciDevs, pciDeviceInfo{
  226. Address: entry.Name(),
  227. NUMANode: numaNodeFromSysFsEntry(string(content)),
  228. IsPhysFn: isPhysFn,
  229. IsVFn: isVFn,
  230. })
  231. }
  232. return pciDevs, nil
  233. }
  234. func numaNodeFromSysFsEntry(content string) int {
  235. nodeNum, err := strconv.Atoi(strings.TrimSpace(content))
  236. framework.ExpectNoError(err, "error detecting the device numa_node from sysfs: %v", err)
  237. return nodeNum
  238. }