From bdd72c8cf7946b5fab0ad36cbb683d38ab4180ee Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Wed, 9 Mar 2022 11:28:46 +0200 Subject: [PATCH] gpu: Add numa node mapping label for GPUs Signed-off-by: Tuomas Katila --- cmd/gpu_nfdhook/README.md | 3 + cmd/gpu_nfdhook/labeler.go | 66 +++++++++++++++ cmd/gpu_nfdhook/labeler_test.go | 145 ++++++++++++++++++++++++++++++++ 3 files changed, 214 insertions(+) diff --git a/cmd/gpu_nfdhook/README.md b/cmd/gpu_nfdhook/README.md index 0461ffed..1a4d562a 100644 --- a/cmd/gpu_nfdhook/README.md +++ b/cmd/gpu_nfdhook/README.md @@ -34,12 +34,15 @@ name | type | description| |`gpu.intel.com/cards`| string | list of card names separated by '`.`'. The names match host `card*`-folders under `/sys/class/drm/`. Deprecated, use `gpu-numbers`. |`gpu.intel.com/gpu-numbers`| string | list of numbers separated by '`.`'. The numbers correspond to device file numbers for the primary nodes of given GPUs in kernel DRI subsystem, listed as `/dev/dri/card` in devfs, and `/sys/class/drm/card` in sysfs. |`gpu.intel.com/tiles`| number | sum of all detected GPU tiles in the system. +|`gpu.intel.com/numa-gpu-map`| string | list of NUMA node to GPU mappings. If the value of the `gpu-numbers` label would not fit into the 63 character length limit, you will also get labels `gpu-numbers2`, `gpu-numbers3`... until all the gpu numbers have been labeled. The tile count `gpu.intel.com/tiles` describes the total amount of tiles on the system. System is expected to be homogeneous, and thus the number of tiles per GPU can be calculated by dividing the tile count with GPU count. +The `numa-gpu-map` label is a list of NUMA node to GPU mapping items separated by '`_`'. Each list item consists of a NUMA node id, a '`-`', and a list of GPU numbers separated by '`.`'. For example, `0-1.2.3` means that NUMA node 0 has GPUs 1, 2 and 3. A more complex example is `0-0.1_1-3.4`, where NUMA node 0 has GPUs 0 and 1, and NUMA node 1 has GPUs 3 and 4. 
As with `gpu-numbers`, this label will be extended to multiple labels if the length of the value exceeds the max label length. + ## PCI-groups (optional) GPUs which share the same pci paths under `/sys/devices/pci*` can be grouped into a label. GPU nums are separated by '`.`' and diff --git a/cmd/gpu_nfdhook/labeler.go b/cmd/gpu_nfdhook/labeler.go index 44068f79..be78f3ea 100644 --- a/cmd/gpu_nfdhook/labeler.go +++ b/cmd/gpu_nfdhook/labeler.go @@ -37,6 +37,7 @@ const ( millicoreLabelName = "millicores" pciGroupLabelName = "pci-groups" tilesLabelName = "tiles" + numaMappingName = "numa-gpu-map" millicoresPerGPU = 1000 memoryOverrideEnv = "GPU_MEMORY_OVERRIDE" memoryReservedEnv = "GPU_MEMORY_RESERVED" @@ -190,6 +191,25 @@ func (l *labeler) getTileCount(gpuName string) (numTiles uint64) { return uint64(len(files)) } +// getNumaNode reads the cards numa node. +func (l *labeler) getNumaNode(gpuName string) int { + filePath := filepath.Join(l.sysfsDRMDir, gpuName, "device/numa_node") + + data, err := os.ReadFile(filePath) + if err != nil { + klog.Warning("Can't read file: ", err) + return -1 + } + + numa, err := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64) + if err != nil { + klog.Warning("Can't convert numa_node: ", err) + return -1 + } + + return int(numa) +} + // addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value. func (lm labelMap) addNumericLabel(labelName string, valueToAdd int64) { value := int64(0) @@ -353,6 +373,8 @@ func (l *labeler) createLabels() error { gpuNumList := []string{} tileCount := 0 + numaMapping := make(map[int][]string) + for _, gpuName := range gpuNameList { gpuNum := "" // extract gpu number as a string. 
// createNumaNodeMappingLabel builds the value of the numa-gpu-map label from
// a mapping of NUMA node id to the GPU numbers attached to that node.
//
// Each node is rendered as "<node>-<gpu>.<gpu>..." and the nodes are joined
// with "_" in ascending node id order, e.g. "0-0.1_2-3.4". The GPU numbers of
// a node are emitted in the order they appear in the mapping's slice. An
// empty (or nil) mapping yields an empty string.
func createNumaNodeMappingLabel(mapping map[int][]string) string {
	// Map iteration order is not deterministic, so collect and sort the node
	// ids to get a stable label value.
	numas := make([]int, 0, len(mapping))
	for numaNode := range mapping {
		numas = append(numas, numaNode)
	}

	sort.Ints(numas)

	parts := make([]string, 0, len(numas))

	// Iterate over the sorted node ids themselves, not the slice indices:
	// node ids are not guaranteed to be contiguous (e.g. {0, 2}), so an index
	// is not a valid key into mapping.
	for _, numaNode := range numas {
		gpusString := strings.Join(mapping[numaNode], ".")

		parts = append(parts, strconv.Itoa(numaNode)+"-"+gpusString)
	}

	return strings.Join(parts, "_")
}
b/cmd/gpu_nfdhook/labeler_test.go index d0244b60..374af850 100644 --- a/cmd/gpu_nfdhook/labeler_test.go +++ b/cmd/gpu_nfdhook/labeler_test.go @@ -493,6 +493,151 @@ func getTestCases() []testcase { "gpu.intel.com/tiles": "3", }, }, + { + sysfsdirs: []string{ + "card0/device/drm/card0", + "card0/gt/gt0", + "card1/device/drm/card1", + "card1/gt/gt0", + }, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + "card0/device/numa_node": []byte("0"), + "card0/lmem_total_bytes": []byte("8000"), + "card1/device/vendor": []byte("0x8086"), + "card1/lmem_total_bytes": []byte("8000"), + "card1/device/numa_node": []byte("1"), + }, + name: "successful labeling with two cards and numa node info", + memoryOverride: 16000000000, + capabilityFile: map[string][]byte{ + "0/i915_capabilities": []byte( + "platform: new\n" + + "gen: 9"), + "1/i915_capabilities": []byte( + "platform: newnew\n" + + "gen: 9"), + }, + expectedRetval: nil, + expectedLabels: labelMap{ + "gpu.intel.com/graphics_version": "9", + "gpu.intel.com/media_version": "9", + "gpu.intel.com/millicores": "2000", + "gpu.intel.com/memory.max": "16000", + "gpu.intel.com/platform_new.count": "1", + "gpu.intel.com/platform_new.present": "true", + "gpu.intel.com/platform_new.tiles": "1", + "gpu.intel.com/platform_newnew.count": "1", + "gpu.intel.com/platform_newnew.present": "true", + "gpu.intel.com/platform_newnew.tiles": "1", + "gpu.intel.com/platform_gen": "9", + "gpu.intel.com/gpu-numbers": "0.1", + "gpu.intel.com/cards": "card0.card1", + "gpu.intel.com/tiles": "2", + "gpu.intel.com/numa-gpu-map": "0-0_1-1", + }, + }, + { + sysfsdirs: []string{ + "card0/device/drm/card0", + "card1/device/drm/card1", + "card2/device/drm/card2", + "card3/device/drm/card3", + "card4/device/drm/card4", + "card5/device/drm/card5", + "card6/device/drm/card6", + "card7/device/drm/card7", + "card8/device/drm/card8", + "card9/device/drm/card9", + "card10/device/drm/card10", + "card11/device/drm/card11", + 
"card12/device/drm/card12", + "card13/device/drm/card13", + "card14/device/drm/card14", + "card15/device/drm/card15", + "card16/device/drm/card16", + "card17/device/drm/card17", + "card18/device/drm/card18", + "card19/device/drm/card19", + "card20/device/drm/card20", + "card21/device/drm/card21", + "card22/device/drm/card22", + "card23/device/drm/card23", + "card24/device/drm/card24", + "card25/device/drm/card25", + "card26/device/drm/card26", + }, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + "card0/device/numa_node": []byte("0"), + "card1/device/vendor": []byte("0x8086"), + "card1/device/numa_node": []byte("0"), + "card2/device/vendor": []byte("0x8086"), + "card2/device/numa_node": []byte("0"), + "card3/device/vendor": []byte("0x8086"), + "card3/device/numa_node": []byte("0"), + "card4/device/vendor": []byte("0x8086"), + "card4/device/numa_node": []byte("0"), + "card5/device/vendor": []byte("0x8086"), + "card5/device/numa_node": []byte("0"), + "card6/device/vendor": []byte("0x8086"), + "card6/device/numa_node": []byte("0"), + "card7/device/vendor": []byte("0x8086"), + "card7/device/numa_node": []byte("0"), + "card8/device/vendor": []byte("0x8086"), + "card8/device/numa_node": []byte("0"), + "card9/device/vendor": []byte("0x8086"), + "card9/device/numa_node": []byte("2"), + "card10/device/vendor": []byte("0x8086"), + "card10/device/numa_node": []byte("2"), + "card11/device/vendor": []byte("0x8086"), + "card11/device/numa_node": []byte("2"), + "card12/device/vendor": []byte("0x8086"), + "card12/device/numa_node": []byte("2"), + "card13/device/vendor": []byte("0x8086"), + "card13/device/numa_node": []byte("1"), + "card14/device/vendor": []byte("0x8086"), + "card14/device/numa_node": []byte("1"), + "card15/device/vendor": []byte("0x8086"), + "card15/device/numa_node": []byte("1"), + "card16/device/vendor": []byte("0x8086"), + "card16/device/numa_node": []byte("1"), + "card17/device/vendor": []byte("0x8086"), + 
"card17/device/numa_node": []byte("1"), + "card18/device/vendor": []byte("0x8086"), + "card18/device/numa_node": []byte("1"), + "card19/device/vendor": []byte("0x8086"), + "card19/device/numa_node": []byte("1"), + "card20/device/vendor": []byte("0x8086"), + "card20/device/numa_node": []byte("1"), + "card21/device/vendor": []byte("0x8086"), + "card21/device/numa_node": []byte("1"), + "card22/device/vendor": []byte("0x8086"), + "card22/device/numa_node": []byte("3"), + "card23/device/vendor": []byte("0x8086"), + "card23/device/numa_node": []byte("3"), + "card24/device/vendor": []byte("0x8086"), + "card24/device/numa_node": []byte("3"), + "card25/device/vendor": []byte("0x8086"), + "card25/device/numa_node": []byte("3"), + "card26/device/vendor": []byte("0x8086"), + "card26/device/numa_node": []byte("3"), + }, + name: "successful labeling with two cards and numa node info", + memoryOverride: 16000000000, + capabilityFile: map[string][]byte{}, + expectedRetval: nil, + expectedLabels: labelMap{ + "gpu.intel.com/cards": "card0.card1.card10.card11.card12.card13.card14.card15.card16.ca", + "gpu.intel.com/gpu-numbers": "0.1.10.11.12.13.14.15.16.17.18.19.2.20.21.22.23.24.25.26.3.4.5.", + "gpu.intel.com/gpu-numbers2": "6.7.8.9", + "gpu.intel.com/memory.max": "432000000000", + "gpu.intel.com/millicores": "27000", + "gpu.intel.com/numa-gpu-map": "0-0.1.2.3.4.5.6.7.8_1-13.14.15.16.17.18.19.20.21_2-10.11.12.9_3", + "gpu.intel.com/numa-gpu-map2": "-22.23.24.25.26", + "gpu.intel.com/tiles": "27", + }, + }, } }