From 6f57c55ef8b7653f778fd381b0dfd70dc890b9fe Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Fri, 10 Dec 2021 15:19:33 +0200 Subject: [PATCH] Add a total tile count to node's labels Unlike the platform-specific tile count, this label does not depend on debugfs. Signed-off-by: Tuomas Katila --- cmd/gpu_nfdhook/README.md | 3 + cmd/gpu_nfdhook/labeler.go | 7 +++ cmd/gpu_nfdhook/labeler_test.go | 101 +++++++++++++++++++++++++++++++- 3 files changed, 109 insertions(+), 2 deletions(-) diff --git a/cmd/gpu_nfdhook/README.md b/cmd/gpu_nfdhook/README.md index 7defe665..0461ffed 100644 --- a/cmd/gpu_nfdhook/README.md +++ b/cmd/gpu_nfdhook/README.md @@ -33,10 +33,13 @@ name | type | description| |`gpu.intel.com/memory.max`| number | sum of detected [GPU memory amounts](#GPU-memory) in bytes OR environment variable value * GPU count |`gpu.intel.com/cards`| string | list of card names separated by '`.`'. The names match host `card*`-folders under `/sys/class/drm/`. Deprecated, use `gpu-numbers`. |`gpu.intel.com/gpu-numbers`| string | list of numbers separated by '`.`'. The numbers correspond to device file numbers for the primary nodes of given GPUs in kernel DRI subsystem, listed as `/dev/dri/card` in devfs, and `/sys/class/drm/card` in sysfs. +|`gpu.intel.com/tiles`| number | sum of all detected GPU tiles in the system. If the value of the `gpu-numbers` label would not fit into the 63 character length limit, you will also get labels `gpu-numbers2`, `gpu-numbers3`... until all the gpu numbers have been labeled. +The tile count `gpu.intel.com/tiles` describes the total number of tiles in the system. The system is expected to be homogeneous, and thus the number of tiles per GPU can be calculated by dividing the tile count by the GPU count. + ## PCI-groups (optional) GPUs which share the same pci paths under `/sys/devices/pci*` can be grouped into a label. 
GPU nums are separated by '`.`' and diff --git a/cmd/gpu_nfdhook/labeler.go b/cmd/gpu_nfdhook/labeler.go index 43237a90..44068f79 100644 --- a/cmd/gpu_nfdhook/labeler.go +++ b/cmd/gpu_nfdhook/labeler.go @@ -36,6 +36,7 @@ const ( gpuNumListLabelName = "gpu-numbers" millicoreLabelName = "millicores" pciGroupLabelName = "pci-groups" + tilesLabelName = "tiles" millicoresPerGPU = 1000 memoryOverrideEnv = "GPU_MEMORY_OVERRIDE" memoryReservedEnv = "GPU_MEMORY_RESERVED" @@ -350,6 +351,7 @@ func (l *labeler) createLabels() error { } gpuNumList := []string{} + tileCount := 0 for _, gpuName := range gpuNameList { gpuNum := "" @@ -360,6 +362,8 @@ func (l *labeler) createLabels() error { } numTiles := l.getTileCount(gpuName) + tileCount += int(numTiles) + memoryAmount := l.getMemoryAmount(gpuName, numTiles) gpuNumList = append(gpuNumList, gpuName[4:]) @@ -370,6 +374,9 @@ func (l *labeler) createLabels() error { } gpuCount := len(gpuNumList) + + l.labels.addNumericLabel(labelNamespace+tilesLabelName, int64(tileCount)) + if gpuCount > 0 { // add gpu list label (example: "card0.card1.card2") - deprecated l.labels[labelNamespace+gpuListLabelName] = split(strings.Join(gpuNameList, "."), labelMaxLength)[0] diff --git a/cmd/gpu_nfdhook/labeler_test.go b/cmd/gpu_nfdhook/labeler_test.go index 6f49066d..d0244b60 100644 --- a/cmd/gpu_nfdhook/labeler_test.go +++ b/cmd/gpu_nfdhook/labeler_test.go @@ -64,6 +64,7 @@ func getTestCases() []testcase { "gpu.intel.com/platform_gen": "9", "gpu.intel.com/cards": "card0", "gpu.intel.com/gpu-numbers": "0", + "gpu.intel.com/tiles": "1", }, }, { @@ -82,7 +83,9 @@ func getTestCases() []testcase { "gen: 9"), }, expectedRetval: nil, - expectedLabels: labelMap{}, + expectedLabels: labelMap{ + "gpu.intel.com/tiles": "0", + }, }, { sysfsdirs: []string{ @@ -113,6 +116,7 @@ func getTestCases() []testcase { "gpu.intel.com/platform_gen": "9", "gpu.intel.com/cards": "card0", "gpu.intel.com/gpu-numbers": "0", + "gpu.intel.com/tiles": "2", }, }, { @@ -144,6 +148,7 
@@ func getTestCases() []testcase { "gpu.intel.com/platform_gen": "9", "gpu.intel.com/cards": "card0", "gpu.intel.com/gpu-numbers": "0", + "gpu.intel.com/tiles": "1", }, }, { @@ -172,6 +177,7 @@ func getTestCases() []testcase { "gpu.intel.com/platform_gen": "9", "gpu.intel.com/cards": "card0", "gpu.intel.com/gpu-numbers": "0", + "gpu.intel.com/tiles": "1", }, }, { @@ -196,6 +202,7 @@ func getTestCases() []testcase { "gpu.intel.com/platform_new.tiles": "1", "gpu.intel.com/cards": "card0", "gpu.intel.com/gpu-numbers": "0", + "gpu.intel.com/tiles": "1", }, }, { @@ -225,6 +232,7 @@ func getTestCases() []testcase { "gpu.intel.com/platform_gen": "12", "gpu.intel.com/cards": "card0", "gpu.intel.com/gpu-numbers": "0", + "gpu.intel.com/tiles": "1", }, }, { @@ -252,6 +260,7 @@ func getTestCases() []testcase { "gpu.intel.com/platform_gen": "12", "gpu.intel.com/cards": "card0", "gpu.intel.com/gpu-numbers": "0", + "gpu.intel.com/tiles": "1", }, }, { @@ -276,6 +285,7 @@ func getTestCases() []testcase { "gpu.intel.com/memory.max": "32000000000", "gpu.intel.com/cards": "card0.card1", "gpu.intel.com/gpu-numbers": "0.1", + "gpu.intel.com/tiles": "2", }, }, { @@ -301,6 +311,7 @@ func getTestCases() []testcase { "gpu.intel.com/cards": "card0.card1", "gpu.intel.com/gpu-numbers": "0.1", "gpu.intel.com/pci-groups": "0.1", + "gpu.intel.com/tiles": "2", }, pciGroupLevel: 2, }, @@ -327,6 +338,7 @@ func getTestCases() []testcase { "gpu.intel.com/cards": "card0.card1", "gpu.intel.com/gpu-numbers": "0.1", "gpu.intel.com/pci-groups": "0_1", + "gpu.intel.com/tiles": "2", }, pciGroupLevel: 4, }, @@ -403,6 +415,82 @@ func getTestCases() []testcase { "gpu.intel.com/cards": "card0.card1.card10.card11.card12.card13.card14.card15.card16.ca", "gpu.intel.com/gpu-numbers": "0.1.10.11.12.13.14.15.16.17.18.19.2.20.21.22.23.24.25.26.3.4.5.", "gpu.intel.com/gpu-numbers2": "6.7.8.9", + "gpu.intel.com/tiles": "27", + }, + }, + { + sysfsdirs: []string{ + "card0/device/drm/card0", + "card0/gt/gt0", + 
"card0/gt/gt1", + "card0/gt/gt3", + }, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + "card0/lmem_total_bytes": []byte("8000"), + }, + name: "successful labeling via card0/lmem_total_bytes and three tiles", + memoryOverride: 16000000000, + capabilityFile: map[string][]byte{ + "0/i915_capabilities": []byte( + "platform: new\n" + + "gen: 9"), + }, + expectedRetval: nil, + expectedLabels: labelMap{ + "gpu.intel.com/graphics_version": "9", + "gpu.intel.com/media_version": "9", + "gpu.intel.com/millicores": "1000", + "gpu.intel.com/memory.max": "24000", + "gpu.intel.com/platform_new.count": "1", + "gpu.intel.com/platform_new.present": "true", + "gpu.intel.com/platform_new.tiles": "3", + "gpu.intel.com/platform_gen": "9", + "gpu.intel.com/cards": "card0", + "gpu.intel.com/gpu-numbers": "0", + "gpu.intel.com/tiles": "3", + }, + }, + { + sysfsdirs: []string{ + "card0/device/drm/card0", + "card0/gt/gt0", + "card0/gt/gt1", + "card1/device/drm/card1", + "card1/gt/gt0", + }, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + "card0/lmem_total_bytes": []byte("8000"), + "card1/device/vendor": []byte("0x8086"), + "card1/lmem_total_bytes": []byte("8000"), + }, + name: "successful labeling with two cards and total three tiles", + memoryOverride: 16000000000, + capabilityFile: map[string][]byte{ + "0/i915_capabilities": []byte( + "platform: new\n" + + "gen: 9"), + "1/i915_capabilities": []byte( + "platform: newnew\n" + + "gen: 9"), + }, + expectedRetval: nil, + expectedLabels: labelMap{ + "gpu.intel.com/graphics_version": "9", + "gpu.intel.com/media_version": "9", + "gpu.intel.com/millicores": "2000", + "gpu.intel.com/memory.max": "24000", + "gpu.intel.com/platform_new.count": "1", + "gpu.intel.com/platform_new.present": "true", + "gpu.intel.com/platform_new.tiles": "2", + "gpu.intel.com/platform_newnew.count": "1", + "gpu.intel.com/platform_newnew.present": "true", + "gpu.intel.com/platform_newnew.tiles": "1", + 
"gpu.intel.com/platform_gen": "9", + "gpu.intel.com/gpu-numbers": "0.1", + "gpu.intel.com/cards": "card0.card1", + "gpu.intel.com/tiles": "3", }, }, } @@ -410,8 +498,17 @@ func getTestCases() []testcase { func (tc *testcase) createFiles(t *testing.T, sysfs, root string) { var err error + for filename, body := range tc.capabilityFile { - if err = os.WriteFile(path.Join(root, filename), body, 0600); err != nil { + filePath := path.Join(root, filename) + dirOnly := path.Dir(filePath) + + err = os.MkdirAll(dirOnly, 0750) + if err != nil { + t.Fatalf("Failed to create base directories: %+v", err) + } + + if err = os.WriteFile(filePath, body, 0600); err != nil { t.Fatalf("Failed to create fake capability file: %+v", err) } }