Add a total tile count to node's labels

Unlike the platform-specific tile count, this label does not
depend on debugfs.

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
This commit is contained in:
Tuomas Katila 2021-12-10 15:19:33 +02:00
parent 4ffbfeba29
commit 6f57c55ef8
3 changed files with 109 additions and 2 deletions

View File

@ -33,10 +33,13 @@ name | type | description|
|`gpu.intel.com/memory.max`| number | sum of detected [GPU memory amounts](#GPU-memory) in bytes OR environment variable value * GPU count
|`gpu.intel.com/cards`| string | list of card names separated by '`.`'. The names match host `card*`-folders under `/sys/class/drm/`. Deprecated, use `gpu-numbers`.
|`gpu.intel.com/gpu-numbers`| string | list of numbers separated by '`.`'. The numbers correspond to device file numbers for the primary nodes of given GPUs in kernel DRI subsystem, listed as `/dev/dri/card<num>` in devfs, and `/sys/class/drm/card<num>` in sysfs.
|`gpu.intel.com/tiles`| number | sum of all detected GPU tiles in the system.
If the value of the `gpu-numbers` label would not fit into the 63-character length limit, you will also get labels `gpu-numbers2`,
`gpu-numbers3`... until all the GPU numbers have been labeled.
The tile count `gpu.intel.com/tiles` describes the total number of tiles in the system. The system is expected to be homogeneous, and thus the number of tiles per GPU can be calculated by dividing the tile count by the GPU count.
## PCI-groups (optional)
GPUs which share the same pci paths under `/sys/devices/pci*` can be grouped into a label. GPU nums are separated by '`.`' and

View File

@ -36,6 +36,7 @@ const (
gpuNumListLabelName = "gpu-numbers"
millicoreLabelName = "millicores"
pciGroupLabelName = "pci-groups"
tilesLabelName = "tiles"
millicoresPerGPU = 1000
memoryOverrideEnv = "GPU_MEMORY_OVERRIDE"
memoryReservedEnv = "GPU_MEMORY_RESERVED"
@ -350,6 +351,7 @@ func (l *labeler) createLabels() error {
}
gpuNumList := []string{}
tileCount := 0
for _, gpuName := range gpuNameList {
gpuNum := ""
@ -360,6 +362,8 @@ func (l *labeler) createLabels() error {
}
numTiles := l.getTileCount(gpuName)
tileCount += int(numTiles)
memoryAmount := l.getMemoryAmount(gpuName, numTiles)
gpuNumList = append(gpuNumList, gpuName[4:])
@ -370,6 +374,9 @@ func (l *labeler) createLabels() error {
}
gpuCount := len(gpuNumList)
l.labels.addNumericLabel(labelNamespace+tilesLabelName, int64(tileCount))
if gpuCount > 0 {
// add gpu list label (example: "card0.card1.card2") - deprecated
l.labels[labelNamespace+gpuListLabelName] = split(strings.Join(gpuNameList, "."), labelMaxLength)[0]

View File

@ -64,6 +64,7 @@ func getTestCases() []testcase {
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "1",
},
},
{
@ -82,7 +83,9 @@ func getTestCases() []testcase {
"gen: 9"),
},
expectedRetval: nil,
expectedLabels: labelMap{},
expectedLabels: labelMap{
"gpu.intel.com/tiles": "0",
},
},
{
sysfsdirs: []string{
@ -113,6 +116,7 @@ func getTestCases() []testcase {
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "2",
},
},
{
@ -144,6 +148,7 @@ func getTestCases() []testcase {
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "1",
},
},
{
@ -172,6 +177,7 @@ func getTestCases() []testcase {
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "1",
},
},
{
@ -196,6 +202,7 @@ func getTestCases() []testcase {
"gpu.intel.com/platform_new.tiles": "1",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "1",
},
},
{
@ -225,6 +232,7 @@ func getTestCases() []testcase {
"gpu.intel.com/platform_gen": "12",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "1",
},
},
{
@ -252,6 +260,7 @@ func getTestCases() []testcase {
"gpu.intel.com/platform_gen": "12",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "1",
},
},
{
@ -276,6 +285,7 @@ func getTestCases() []testcase {
"gpu.intel.com/memory.max": "32000000000",
"gpu.intel.com/cards": "card0.card1",
"gpu.intel.com/gpu-numbers": "0.1",
"gpu.intel.com/tiles": "2",
},
},
{
@ -301,6 +311,7 @@ func getTestCases() []testcase {
"gpu.intel.com/cards": "card0.card1",
"gpu.intel.com/gpu-numbers": "0.1",
"gpu.intel.com/pci-groups": "0.1",
"gpu.intel.com/tiles": "2",
},
pciGroupLevel: 2,
},
@ -327,6 +338,7 @@ func getTestCases() []testcase {
"gpu.intel.com/cards": "card0.card1",
"gpu.intel.com/gpu-numbers": "0.1",
"gpu.intel.com/pci-groups": "0_1",
"gpu.intel.com/tiles": "2",
},
pciGroupLevel: 4,
},
@ -403,6 +415,82 @@ func getTestCases() []testcase {
"gpu.intel.com/cards": "card0.card1.card10.card11.card12.card13.card14.card15.card16.ca",
"gpu.intel.com/gpu-numbers": "0.1.10.11.12.13.14.15.16.17.18.19.2.20.21.22.23.24.25.26.3.4.5.",
"gpu.intel.com/gpu-numbers2": "6.7.8.9",
"gpu.intel.com/tiles": "27",
},
},
{
sysfsdirs: []string{
"card0/device/drm/card0",
"card0/gt/gt0",
"card0/gt/gt1",
"card0/gt/gt3",
},
sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"),
"card0/lmem_total_bytes": []byte("8000"),
},
name: "successful labeling via card0/lmem_total_bytes and three tiles",
memoryOverride: 16000000000,
capabilityFile: map[string][]byte{
"0/i915_capabilities": []byte(
"platform: new\n" +
"gen: 9"),
},
expectedRetval: nil,
expectedLabels: labelMap{
"gpu.intel.com/graphics_version": "9",
"gpu.intel.com/media_version": "9",
"gpu.intel.com/millicores": "1000",
"gpu.intel.com/memory.max": "24000",
"gpu.intel.com/platform_new.count": "1",
"gpu.intel.com/platform_new.present": "true",
"gpu.intel.com/platform_new.tiles": "3",
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "3",
},
},
{
sysfsdirs: []string{
"card0/device/drm/card0",
"card0/gt/gt0",
"card0/gt/gt1",
"card1/device/drm/card1",
"card1/gt/gt0",
},
sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"),
"card0/lmem_total_bytes": []byte("8000"),
"card1/device/vendor": []byte("0x8086"),
"card1/lmem_total_bytes": []byte("8000"),
},
name: "successful labeling with two cards and total three tiles",
memoryOverride: 16000000000,
capabilityFile: map[string][]byte{
"0/i915_capabilities": []byte(
"platform: new\n" +
"gen: 9"),
"1/i915_capabilities": []byte(
"platform: newnew\n" +
"gen: 9"),
},
expectedRetval: nil,
expectedLabels: labelMap{
"gpu.intel.com/graphics_version": "9",
"gpu.intel.com/media_version": "9",
"gpu.intel.com/millicores": "2000",
"gpu.intel.com/memory.max": "24000",
"gpu.intel.com/platform_new.count": "1",
"gpu.intel.com/platform_new.present": "true",
"gpu.intel.com/platform_new.tiles": "2",
"gpu.intel.com/platform_newnew.count": "1",
"gpu.intel.com/platform_newnew.present": "true",
"gpu.intel.com/platform_newnew.tiles": "1",
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/gpu-numbers": "0.1",
"gpu.intel.com/cards": "card0.card1",
"gpu.intel.com/tiles": "3",
},
},
}
@ -410,8 +498,17 @@ func getTestCases() []testcase {
func (tc *testcase) createFiles(t *testing.T, sysfs, root string) {
var err error
for filename, body := range tc.capabilityFile {
if err = os.WriteFile(path.Join(root, filename), body, 0600); err != nil {
filePath := path.Join(root, filename)
dirOnly := path.Dir(filePath)
err = os.MkdirAll(dirOnly, 0750)
if err != nil {
t.Fatalf("Failed to create base directories: %+v", err)
}
if err = os.WriteFile(filePath, body, 0600); err != nil {
t.Fatalf("Failed to create fake capability file: %+v", err)
}
}