mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00
Add a total tile count to node's labels
Unlike the platform-specific tile count, this label does not depend on debugfs. Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
This commit is contained in:
parent
4ffbfeba29
commit
6f57c55ef8
@ -33,10 +33,13 @@ name | type | description|
|
||||
|`gpu.intel.com/memory.max`| number | sum of detected [GPU memory amounts](#GPU-memory) in bytes OR environment variable value * GPU count
|
||||
|`gpu.intel.com/cards`| string | list of card names separated by '`.`'. The names match host `card*`-folders under `/sys/class/drm/`. Deprecated, use `gpu-numbers`.
|
||||
|`gpu.intel.com/gpu-numbers`| string | list of numbers separated by '`.`'. The numbers correspond to device file numbers for the primary nodes of given GPUs in kernel DRI subsystem, listed as `/dev/dri/card<num>` in devfs, and `/sys/class/drm/card<num>` in sysfs.
|
||||
|`gpu.intel.com/tiles`| number | sum of all detected GPU tiles in the system.
|
||||
|
||||
If the value of the `gpu-numbers` label does not fit within the 63-character label length limit, you will also get labels `gpu-numbers2`,
|
||||
`gpu-numbers3`... until all the gpu numbers have been labeled.
|
||||
|
||||
The tile count `gpu.intel.com/tiles` describes the total number of tiles in the system. The system is expected to be homogeneous, so the number of tiles per GPU can be calculated by dividing the tile count by the GPU count.
|
||||
|
||||
## PCI-groups (optional)
|
||||
|
||||
GPUs which share the same pci paths under `/sys/devices/pci*` can be grouped into a label. GPU nums are separated by '`.`' and
|
||||
|
@ -36,6 +36,7 @@ const (
|
||||
gpuNumListLabelName = "gpu-numbers"
|
||||
millicoreLabelName = "millicores"
|
||||
pciGroupLabelName = "pci-groups"
|
||||
tilesLabelName = "tiles"
|
||||
millicoresPerGPU = 1000
|
||||
memoryOverrideEnv = "GPU_MEMORY_OVERRIDE"
|
||||
memoryReservedEnv = "GPU_MEMORY_RESERVED"
|
||||
@ -350,6 +351,7 @@ func (l *labeler) createLabels() error {
|
||||
}
|
||||
|
||||
gpuNumList := []string{}
|
||||
tileCount := 0
|
||||
|
||||
for _, gpuName := range gpuNameList {
|
||||
gpuNum := ""
|
||||
@ -360,6 +362,8 @@ func (l *labeler) createLabels() error {
|
||||
}
|
||||
|
||||
numTiles := l.getTileCount(gpuName)
|
||||
tileCount += int(numTiles)
|
||||
|
||||
memoryAmount := l.getMemoryAmount(gpuName, numTiles)
|
||||
gpuNumList = append(gpuNumList, gpuName[4:])
|
||||
|
||||
@ -370,6 +374,9 @@ func (l *labeler) createLabels() error {
|
||||
}
|
||||
|
||||
gpuCount := len(gpuNumList)
|
||||
|
||||
l.labels.addNumericLabel(labelNamespace+tilesLabelName, int64(tileCount))
|
||||
|
||||
if gpuCount > 0 {
|
||||
// add gpu list label (example: "card0.card1.card2") - deprecated
|
||||
l.labels[labelNamespace+gpuListLabelName] = split(strings.Join(gpuNameList, "."), labelMaxLength)[0]
|
||||
|
@ -64,6 +64,7 @@ func getTestCases() []testcase {
|
||||
"gpu.intel.com/platform_gen": "9",
|
||||
"gpu.intel.com/cards": "card0",
|
||||
"gpu.intel.com/gpu-numbers": "0",
|
||||
"gpu.intel.com/tiles": "1",
|
||||
},
|
||||
},
|
||||
{
|
||||
@ -82,7 +83,9 @@ func getTestCases() []testcase {
|
||||
"gen: 9"),
|
||||
},
|
||||
expectedRetval: nil,
|
||||
expectedLabels: labelMap{},
|
||||
expectedLabels: labelMap{
|
||||
"gpu.intel.com/tiles": "0",
|
||||
},
|
||||
},
|
||||
{
|
||||
sysfsdirs: []string{
|
||||
@ -113,6 +116,7 @@ func getTestCases() []testcase {
|
||||
"gpu.intel.com/platform_gen": "9",
|
||||
"gpu.intel.com/cards": "card0",
|
||||
"gpu.intel.com/gpu-numbers": "0",
|
||||
"gpu.intel.com/tiles": "2",
|
||||
},
|
||||
},
|
||||
{
|
||||
@ -144,6 +148,7 @@ func getTestCases() []testcase {
|
||||
"gpu.intel.com/platform_gen": "9",
|
||||
"gpu.intel.com/cards": "card0",
|
||||
"gpu.intel.com/gpu-numbers": "0",
|
||||
"gpu.intel.com/tiles": "1",
|
||||
},
|
||||
},
|
||||
{
|
||||
@ -172,6 +177,7 @@ func getTestCases() []testcase {
|
||||
"gpu.intel.com/platform_gen": "9",
|
||||
"gpu.intel.com/cards": "card0",
|
||||
"gpu.intel.com/gpu-numbers": "0",
|
||||
"gpu.intel.com/tiles": "1",
|
||||
},
|
||||
},
|
||||
{
|
||||
@ -196,6 +202,7 @@ func getTestCases() []testcase {
|
||||
"gpu.intel.com/platform_new.tiles": "1",
|
||||
"gpu.intel.com/cards": "card0",
|
||||
"gpu.intel.com/gpu-numbers": "0",
|
||||
"gpu.intel.com/tiles": "1",
|
||||
},
|
||||
},
|
||||
{
|
||||
@ -225,6 +232,7 @@ func getTestCases() []testcase {
|
||||
"gpu.intel.com/platform_gen": "12",
|
||||
"gpu.intel.com/cards": "card0",
|
||||
"gpu.intel.com/gpu-numbers": "0",
|
||||
"gpu.intel.com/tiles": "1",
|
||||
},
|
||||
},
|
||||
{
|
||||
@ -252,6 +260,7 @@ func getTestCases() []testcase {
|
||||
"gpu.intel.com/platform_gen": "12",
|
||||
"gpu.intel.com/cards": "card0",
|
||||
"gpu.intel.com/gpu-numbers": "0",
|
||||
"gpu.intel.com/tiles": "1",
|
||||
},
|
||||
},
|
||||
{
|
||||
@ -276,6 +285,7 @@ func getTestCases() []testcase {
|
||||
"gpu.intel.com/memory.max": "32000000000",
|
||||
"gpu.intel.com/cards": "card0.card1",
|
||||
"gpu.intel.com/gpu-numbers": "0.1",
|
||||
"gpu.intel.com/tiles": "2",
|
||||
},
|
||||
},
|
||||
{
|
||||
@ -301,6 +311,7 @@ func getTestCases() []testcase {
|
||||
"gpu.intel.com/cards": "card0.card1",
|
||||
"gpu.intel.com/gpu-numbers": "0.1",
|
||||
"gpu.intel.com/pci-groups": "0.1",
|
||||
"gpu.intel.com/tiles": "2",
|
||||
},
|
||||
pciGroupLevel: 2,
|
||||
},
|
||||
@ -327,6 +338,7 @@ func getTestCases() []testcase {
|
||||
"gpu.intel.com/cards": "card0.card1",
|
||||
"gpu.intel.com/gpu-numbers": "0.1",
|
||||
"gpu.intel.com/pci-groups": "0_1",
|
||||
"gpu.intel.com/tiles": "2",
|
||||
},
|
||||
pciGroupLevel: 4,
|
||||
},
|
||||
@ -403,6 +415,82 @@ func getTestCases() []testcase {
|
||||
"gpu.intel.com/cards": "card0.card1.card10.card11.card12.card13.card14.card15.card16.ca",
|
||||
"gpu.intel.com/gpu-numbers": "0.1.10.11.12.13.14.15.16.17.18.19.2.20.21.22.23.24.25.26.3.4.5.",
|
||||
"gpu.intel.com/gpu-numbers2": "6.7.8.9",
|
||||
"gpu.intel.com/tiles": "27",
|
||||
},
|
||||
},
|
||||
{
|
||||
sysfsdirs: []string{
|
||||
"card0/device/drm/card0",
|
||||
"card0/gt/gt0",
|
||||
"card0/gt/gt1",
|
||||
"card0/gt/gt3",
|
||||
},
|
||||
sysfsfiles: map[string][]byte{
|
||||
"card0/device/vendor": []byte("0x8086"),
|
||||
"card0/lmem_total_bytes": []byte("8000"),
|
||||
},
|
||||
name: "successful labeling via card0/lmem_total_bytes and three tiles",
|
||||
memoryOverride: 16000000000,
|
||||
capabilityFile: map[string][]byte{
|
||||
"0/i915_capabilities": []byte(
|
||||
"platform: new\n" +
|
||||
"gen: 9"),
|
||||
},
|
||||
expectedRetval: nil,
|
||||
expectedLabels: labelMap{
|
||||
"gpu.intel.com/graphics_version": "9",
|
||||
"gpu.intel.com/media_version": "9",
|
||||
"gpu.intel.com/millicores": "1000",
|
||||
"gpu.intel.com/memory.max": "24000",
|
||||
"gpu.intel.com/platform_new.count": "1",
|
||||
"gpu.intel.com/platform_new.present": "true",
|
||||
"gpu.intel.com/platform_new.tiles": "3",
|
||||
"gpu.intel.com/platform_gen": "9",
|
||||
"gpu.intel.com/cards": "card0",
|
||||
"gpu.intel.com/gpu-numbers": "0",
|
||||
"gpu.intel.com/tiles": "3",
|
||||
},
|
||||
},
|
||||
{
|
||||
sysfsdirs: []string{
|
||||
"card0/device/drm/card0",
|
||||
"card0/gt/gt0",
|
||||
"card0/gt/gt1",
|
||||
"card1/device/drm/card1",
|
||||
"card1/gt/gt0",
|
||||
},
|
||||
sysfsfiles: map[string][]byte{
|
||||
"card0/device/vendor": []byte("0x8086"),
|
||||
"card0/lmem_total_bytes": []byte("8000"),
|
||||
"card1/device/vendor": []byte("0x8086"),
|
||||
"card1/lmem_total_bytes": []byte("8000"),
|
||||
},
|
||||
name: "successful labeling with two cards and total three tiles",
|
||||
memoryOverride: 16000000000,
|
||||
capabilityFile: map[string][]byte{
|
||||
"0/i915_capabilities": []byte(
|
||||
"platform: new\n" +
|
||||
"gen: 9"),
|
||||
"1/i915_capabilities": []byte(
|
||||
"platform: newnew\n" +
|
||||
"gen: 9"),
|
||||
},
|
||||
expectedRetval: nil,
|
||||
expectedLabels: labelMap{
|
||||
"gpu.intel.com/graphics_version": "9",
|
||||
"gpu.intel.com/media_version": "9",
|
||||
"gpu.intel.com/millicores": "2000",
|
||||
"gpu.intel.com/memory.max": "24000",
|
||||
"gpu.intel.com/platform_new.count": "1",
|
||||
"gpu.intel.com/platform_new.present": "true",
|
||||
"gpu.intel.com/platform_new.tiles": "2",
|
||||
"gpu.intel.com/platform_newnew.count": "1",
|
||||
"gpu.intel.com/platform_newnew.present": "true",
|
||||
"gpu.intel.com/platform_newnew.tiles": "1",
|
||||
"gpu.intel.com/platform_gen": "9",
|
||||
"gpu.intel.com/gpu-numbers": "0.1",
|
||||
"gpu.intel.com/cards": "card0.card1",
|
||||
"gpu.intel.com/tiles": "3",
|
||||
},
|
||||
},
|
||||
}
|
||||
@ -410,8 +498,17 @@ func getTestCases() []testcase {
|
||||
|
||||
func (tc *testcase) createFiles(t *testing.T, sysfs, root string) {
|
||||
var err error
|
||||
|
||||
for filename, body := range tc.capabilityFile {
|
||||
if err = os.WriteFile(path.Join(root, filename), body, 0600); err != nil {
|
||||
filePath := path.Join(root, filename)
|
||||
dirOnly := path.Dir(filePath)
|
||||
|
||||
err = os.MkdirAll(dirOnly, 0750)
|
||||
if err != nil {
|
||||
t.Fatalf("Failed to create base directories: %+v", err)
|
||||
}
|
||||
|
||||
if err = os.WriteFile(filePath, body, 0600); err != nil {
|
||||
t.Fatalf("Failed to create fake capability file: %+v", err)
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user