Merge pull request #860 from tkatila/node-tilecount

Add a total tile count to node's labels
This commit is contained in:
Ed Bartosh 2022-01-26 13:09:03 +02:00 committed by GitHub
commit 0f82a95563
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 109 additions and 2 deletions

View File

@ -33,10 +33,13 @@ name | type | description|
|`gpu.intel.com/memory.max`| number | sum of detected [GPU memory amounts](#GPU-memory) in bytes OR environment variable value * GPU count
|`gpu.intel.com/cards`| string | list of card names separated by '`.`'. The names match host `card*`-folders under `/sys/class/drm/`. Deprecated, use `gpu-numbers`.
|`gpu.intel.com/gpu-numbers`| string | list of numbers separated by '`.`'. The numbers correspond to device file numbers for the primary nodes of given GPUs in kernel DRI subsystem, listed as `/dev/dri/card<num>` in devfs, and `/sys/class/drm/card<num>` in sysfs.
|`gpu.intel.com/tiles`| number | sum of all detected GPU tiles in the system.
If the value of the `gpu-numbers` label does not fit into the 63-character label value length limit, you will also get labels `gpu-numbers2`,
`gpu-numbers3`... until all the GPU numbers have been labeled.
The tile count `gpu.intel.com/tiles` describes the total number of tiles on the system. The system is expected to be homogeneous, and thus the number of tiles per GPU can be calculated by dividing the tile count by the GPU count.
## PCI-groups (optional)
GPUs which share the same pci paths under `/sys/devices/pci*` can be grouped into a label. GPU nums are separated by '`.`' and

View File

@ -36,6 +36,7 @@ const (
gpuNumListLabelName = "gpu-numbers"
millicoreLabelName = "millicores"
pciGroupLabelName = "pci-groups"
tilesLabelName = "tiles"
millicoresPerGPU = 1000
memoryOverrideEnv = "GPU_MEMORY_OVERRIDE"
memoryReservedEnv = "GPU_MEMORY_RESERVED"
@ -350,6 +351,7 @@ func (l *labeler) createLabels() error {
}
gpuNumList := []string{}
tileCount := 0
for _, gpuName := range gpuNameList {
gpuNum := ""
@ -360,6 +362,8 @@ func (l *labeler) createLabels() error {
}
numTiles := l.getTileCount(gpuName)
tileCount += int(numTiles)
memoryAmount := l.getMemoryAmount(gpuName, numTiles)
gpuNumList = append(gpuNumList, gpuName[4:])
@ -370,6 +374,9 @@ func (l *labeler) createLabels() error {
}
gpuCount := len(gpuNumList)
l.labels.addNumericLabel(labelNamespace+tilesLabelName, int64(tileCount))
if gpuCount > 0 {
// add gpu list label (example: "card0.card1.card2") - deprecated
l.labels[labelNamespace+gpuListLabelName] = split(strings.Join(gpuNameList, "."), labelMaxLength)[0]

View File

@ -64,6 +64,7 @@ func getTestCases() []testcase {
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "1",
},
},
{
@ -82,7 +83,9 @@ func getTestCases() []testcase {
"gen: 9"),
},
expectedRetval: nil,
expectedLabels: labelMap{},
expectedLabels: labelMap{
"gpu.intel.com/tiles": "0",
},
},
{
sysfsdirs: []string{
@ -113,6 +116,7 @@ func getTestCases() []testcase {
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "2",
},
},
{
@ -144,6 +148,7 @@ func getTestCases() []testcase {
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "1",
},
},
{
@ -172,6 +177,7 @@ func getTestCases() []testcase {
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "1",
},
},
{
@ -196,6 +202,7 @@ func getTestCases() []testcase {
"gpu.intel.com/platform_new.tiles": "1",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "1",
},
},
{
@ -225,6 +232,7 @@ func getTestCases() []testcase {
"gpu.intel.com/platform_gen": "12",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "1",
},
},
{
@ -252,6 +260,7 @@ func getTestCases() []testcase {
"gpu.intel.com/platform_gen": "12",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "1",
},
},
{
@ -276,6 +285,7 @@ func getTestCases() []testcase {
"gpu.intel.com/memory.max": "32000000000",
"gpu.intel.com/cards": "card0.card1",
"gpu.intel.com/gpu-numbers": "0.1",
"gpu.intel.com/tiles": "2",
},
},
{
@ -301,6 +311,7 @@ func getTestCases() []testcase {
"gpu.intel.com/cards": "card0.card1",
"gpu.intel.com/gpu-numbers": "0.1",
"gpu.intel.com/pci-groups": "0.1",
"gpu.intel.com/tiles": "2",
},
pciGroupLevel: 2,
},
@ -327,6 +338,7 @@ func getTestCases() []testcase {
"gpu.intel.com/cards": "card0.card1",
"gpu.intel.com/gpu-numbers": "0.1",
"gpu.intel.com/pci-groups": "0_1",
"gpu.intel.com/tiles": "2",
},
pciGroupLevel: 4,
},
@ -403,6 +415,82 @@ func getTestCases() []testcase {
"gpu.intel.com/cards": "card0.card1.card10.card11.card12.card13.card14.card15.card16.ca",
"gpu.intel.com/gpu-numbers": "0.1.10.11.12.13.14.15.16.17.18.19.2.20.21.22.23.24.25.26.3.4.5.",
"gpu.intel.com/gpu-numbers2": "6.7.8.9",
"gpu.intel.com/tiles": "27",
},
},
{
sysfsdirs: []string{
"card0/device/drm/card0",
"card0/gt/gt0",
"card0/gt/gt1",
"card0/gt/gt3",
},
sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"),
"card0/lmem_total_bytes": []byte("8000"),
},
name: "successful labeling via card0/lmem_total_bytes and three tiles",
memoryOverride: 16000000000,
capabilityFile: map[string][]byte{
"0/i915_capabilities": []byte(
"platform: new\n" +
"gen: 9"),
},
expectedRetval: nil,
expectedLabels: labelMap{
"gpu.intel.com/graphics_version": "9",
"gpu.intel.com/media_version": "9",
"gpu.intel.com/millicores": "1000",
"gpu.intel.com/memory.max": "24000",
"gpu.intel.com/platform_new.count": "1",
"gpu.intel.com/platform_new.present": "true",
"gpu.intel.com/platform_new.tiles": "3",
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
"gpu.intel.com/tiles": "3",
},
},
{
sysfsdirs: []string{
"card0/device/drm/card0",
"card0/gt/gt0",
"card0/gt/gt1",
"card1/device/drm/card1",
"card1/gt/gt0",
},
sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"),
"card0/lmem_total_bytes": []byte("8000"),
"card1/device/vendor": []byte("0x8086"),
"card1/lmem_total_bytes": []byte("8000"),
},
name: "successful labeling with two cards and total three tiles",
memoryOverride: 16000000000,
capabilityFile: map[string][]byte{
"0/i915_capabilities": []byte(
"platform: new\n" +
"gen: 9"),
"1/i915_capabilities": []byte(
"platform: newnew\n" +
"gen: 9"),
},
expectedRetval: nil,
expectedLabels: labelMap{
"gpu.intel.com/graphics_version": "9",
"gpu.intel.com/media_version": "9",
"gpu.intel.com/millicores": "2000",
"gpu.intel.com/memory.max": "24000",
"gpu.intel.com/platform_new.count": "1",
"gpu.intel.com/platform_new.present": "true",
"gpu.intel.com/platform_new.tiles": "2",
"gpu.intel.com/platform_newnew.count": "1",
"gpu.intel.com/platform_newnew.present": "true",
"gpu.intel.com/platform_newnew.tiles": "1",
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/gpu-numbers": "0.1",
"gpu.intel.com/cards": "card0.card1",
"gpu.intel.com/tiles": "3",
},
},
}
@ -410,8 +498,17 @@ func getTestCases() []testcase {
func (tc *testcase) createFiles(t *testing.T, sysfs, root string) {
var err error
for filename, body := range tc.capabilityFile {
if err = os.WriteFile(path.Join(root, filename), body, 0600); err != nil {
filePath := path.Join(root, filename)
dirOnly := path.Dir(filePath)
err = os.MkdirAll(dirOnly, 0750)
if err != nil {
t.Fatalf("Failed to create base directories: %+v", err)
}
if err = os.WriteFile(filePath, body, 0600); err != nil {
t.Fatalf("Failed to create fake capability file: %+v", err)
}
}