From 75203930413e46b0a1c7d3d895f6c4315c42abd4 Mon Sep 17 00:00:00 2001 From: Ukri Niemimuukko Date: Fri, 14 Jan 2022 18:15:54 +0200 Subject: [PATCH] gpu_nfdhook: gpu-numbers and pci-groups This adds a new label "gpu-numbers" for short numbered lists of gpus, omitting "card" from the names. Also adds splitting of long label values. Similarly this adds a new label "pci-groups" for PCI groups. Grouping can be controlled by env var GPU_PCI_GROUPING_LEVEL. The env var dictates, how many pci-folder names need to match, in order for GPUs to be considered to belong in a group. Signed-off-by: Ukri Niemimuukko --- cmd/gpu_nfdhook/README.md | 23 ++++- cmd/gpu_nfdhook/labeler.go | 153 ++++++++++++++++++++++++---- cmd/gpu_nfdhook/labeler_test.go | 174 +++++++++++++++++++++++++++++++- 3 files changed, 327 insertions(+), 23 deletions(-) diff --git a/cmd/gpu_nfdhook/README.md b/cmd/gpu_nfdhook/README.md index 9c20de2e..7defe665 100644 --- a/cmd/gpu_nfdhook/README.md +++ b/cmd/gpu_nfdhook/README.md @@ -31,7 +31,28 @@ name | type | description| -----|------|------| |`gpu.intel.com/millicores`| number | node GPU count * 1000. Can be used as a finer grained shared execution fraction. |`gpu.intel.com/memory.max`| number | sum of detected [GPU memory amounts](#GPU-memory) in bytes OR environment variable value * GPU count -|`gpu.intel.com/cards`| string | list of card names separated by '`.`'. The names match host `card*`-folders under `/sys/class/drm/`. +|`gpu.intel.com/cards`| string | list of card names separated by '`.`'. The names match host `card*`-folders under `/sys/class/drm/`. Deprecated, use `gpu-numbers`. +|`gpu.intel.com/gpu-numbers`| string | list of numbers separated by '`.`'. The numbers correspond to device file numbers for the primary nodes of given GPUs in kernel DRI subsystem, listed as `/dev/dri/card<num>` in devfs, and `/sys/class/drm/card<num>` in sysfs. 
+ +If the value of the `gpu-numbers` label would not fit into the 63-character length limit, you will also get labels `gpu-numbers2`, +`gpu-numbers3`... until all the GPU numbers have been labeled. + +## PCI-groups (optional) + +GPUs which share the same PCI paths under `/sys/devices/pci*` can be grouped into a label. GPU numbers are separated by '`.`' and +groups are separated by '`_`'. The label is created only if the environment variable `GPU_PCI_GROUPING_LEVEL` has a value greater +than zero. GPUs are considered to belong to the same group if their sysfs paths share as many identical leading folder names as the value +of the environment variable. Counting starts from the folder whose name starts with `pci`. + +For example, the SG1 card has 4 GPUs, which end up sharing pci-folder names under `/sys/devices`. With a `GPU_PCI_GROUPING_LEVEL` +of 3, a node with two such SG1 cards could produce a `pci-groups` label with a value of `0.1.2.3_4.5.6.7`. + +name | type | description| +-----|------|------| +|`gpu.intel.com/pci-groups`| string | list of pci-groups separated by '`_`'. GPU numbers in the groups are separated by '`.`'. The numbers correspond to device file numbers for the primary nodes of given GPUs in kernel DRI subsystem, listed as `/dev/dri/card<num>` in devfs, and `/sys/class/drm/card<num>` in sysfs. + +If the value of the `pci-groups` label would not fit into the 63-character length limit, you will also get labels `pci-groups2`, +`pci-groups3`... until all the PCI groups have been labeled. 
## Capability labels (optional) diff --git a/cmd/gpu_nfdhook/labeler.go b/cmd/gpu_nfdhook/labeler.go index 0e7326e8..43237a90 100644 --- a/cmd/gpu_nfdhook/labeler.go +++ b/cmd/gpu_nfdhook/labeler.go @@ -21,6 +21,7 @@ import ( "path" "path/filepath" "regexp" + "sort" "strconv" "strings" @@ -30,15 +31,19 @@ import ( ) const ( - labelNamespace = "gpu.intel.com/" - gpuListLabelName = "cards" - millicoreLabelName = "millicores" - millicoresPerGPU = 1000 - memoryOverrideEnv = "GPU_MEMORY_OVERRIDE" - memoryReservedEnv = "GPU_MEMORY_RESERVED" - gpuDeviceRE = `^card[0-9]+$` - controlDeviceRE = `^controlD[0-9]+$` - vendorString = "0x8086" + labelNamespace = "gpu.intel.com/" + gpuListLabelName = "cards" + gpuNumListLabelName = "gpu-numbers" + millicoreLabelName = "millicores" + pciGroupLabelName = "pci-groups" + millicoresPerGPU = 1000 + memoryOverrideEnv = "GPU_MEMORY_OVERRIDE" + memoryReservedEnv = "GPU_MEMORY_RESERVED" + pciGroupingEnv = "GPU_PCI_GROUPING_LEVEL" + gpuDeviceRE = `^card[0-9]+$` + controlDeviceRE = `^controlD[0-9]+$` + vendorString = "0x8086" + labelMaxLength = 63 ) type labelMap map[string]string @@ -62,6 +67,38 @@ func newLabeler(sysfsDRMDir, debugfsDRIDir string) *labeler { } } +// getPCIPathParts returns a subPath from the given full path starting from folder with prefix "pci". +// returns "" in case not enough folders are found after the one starting with "pci". 
+func getPCIPathParts(numFolders uint64, fullPath string) string { + parts := strings.Split(fullPath, "/") + + if len(parts) == 1 { + return "" + } + + foundPci := false + subPath := "" + separator := "" + + for _, part := range parts { + if !foundPci && strings.HasPrefix(part, "pci") { + foundPci = true + } + + if foundPci && numFolders > 0 { + subPath = subPath + separator + part + separator = "/" + numFolders-- + } + + if numFolders == 0 { + return subPath + } + } + + return "" +} + func (l *labeler) scan() ([]string, error) { files, err := os.ReadDir(l.sysfsDRMDir) gpuNameList := []string{} @@ -164,9 +201,9 @@ func (lm labelMap) addNumericLabel(labelName string, valueToAdd int64) { } // createCapabilityLabels creates labels from the gpu capability file under debugfs. -func (l *labeler) createCapabilityLabels(cardNum string, numTiles uint64) { +func (l *labeler) createCapabilityLabels(gpuNum string, numTiles uint64) { // try to read the capabilities from the i915_capabilities file - file, err := os.Open(filepath.Join(l.debugfsDRIDir, cardNum, "i915_capabilities")) + file, err := os.Open(filepath.Join(l.debugfsDRIDir, gpuNum, "i915_capabilities")) if err != nil { klog.V(3).Infof("Couldn't open file:%s", err.Error()) // debugfs is not stable, there is no need to spam with error level prints return @@ -245,6 +282,66 @@ scanning: } } +// this returns pci groups label value, groups separated by "_", gpus separated by ".". +// Example for two groups with 4 gpus: "0.1.2.3_4.5.6.7". 
+func (l *labeler) createPCIGroupLabel(gpuNumList []string) string { + pciGroups := map[string][]string{} + + pciGroupLevel := getEnvVarNumber(pciGroupingEnv) + if pciGroupLevel == 0 { + return "" + } + + for _, gpuNum := range gpuNumList { + symLinkTarget, err := filepath.EvalSymlinks(path.Join(l.sysfsDRMDir, "card"+gpuNum)) + + if err == nil { + if pathPart := getPCIPathParts(pciGroupLevel, symLinkTarget); pathPart != "" { + pciGroups[pathPart] = append(pciGroups[pathPart], gpuNum) + } + } + } + + labelValue := "" + separator := "" + + // process in stable order by sorting + keys := []string{} + for key := range pciGroups { + keys = append(keys, key) + } + + sort.Strings(keys) + + for _, key := range keys { + labelValue = labelValue + separator + strings.Join(pciGroups[key], ".") + separator = "_" + } + + return labelValue +} + +// split returns the given string cut into chunks of at most maxLength bytes each. +// maxLength refers to the max length of the strings in the returned slice. +// If the whole input string fits within maxLength, or maxLength is 0, it is returned unsplit. +// split("foo_bar", 4) returns []string{"foo_", "bar"}. +func split(str string, maxLength uint) []string { + remainingString := str + results := []string{} + + for len(remainingString) >= 0 { + if maxLength == 0 || uint(len(remainingString)) <= maxLength { + results = append(results, remainingString) + return results + } + + results = append(results, remainingString[:maxLength]) + remainingString = remainingString[maxLength:] + } + + return results +} + // createLabels is the main function of plugin labeler, it creates label-value pairs for the gpus. func (l *labeler) createLabels() error { gpuNameList, err := l.scan() @@ -252,19 +349,19 @@ func (l *labeler) createLabels() error { return err } + gpuNumList := []string{} + for _, gpuName := range gpuNameList { gpuNum := "" - // extract card number as a string. scan() has already checked name syntax + // extract gpu number as a string. 
scan() has already checked name syntax _, err = fmt.Sscanf(gpuName, "card%s", &gpuNum) if err != nil { return errors.Wrap(err, "gpu name parsing error") } - // read the tile count numTiles := l.getTileCount(gpuName) - - // read memory amount memoryAmount := l.getMemoryAmount(gpuName, numTiles) + gpuNumList = append(gpuNumList, gpuNum) // try to add capability labels l.createCapabilityLabels(gpuNum, numTiles) @@ -272,13 +369,33 @@ func (l *labeler) createLabels() error { l.labels.addNumericLabel(labelNamespace+"memory.max", int64(memoryAmount)) } - gpuCount := len(gpuNameList) + gpuCount := len(gpuNumList) if gpuCount > 0 { - // add gpu list label (example: "card0.card1.card2") - l.labels[labelNamespace+gpuListLabelName] = strings.Join(gpuNameList, ".") + // add gpu list label (example: "card0.card1.card2") - deprecated + l.labels[labelNamespace+gpuListLabelName] = split(strings.Join(gpuNameList, "."), labelMaxLength)[0] + + // add gpu num list label(s) (example: "0.1.2", which is short form of "card0.card1.card2") + allGPUs := strings.Join(gpuNumList, ".") + gpuNumLists := split(allGPUs, labelMaxLength) + + l.labels[labelNamespace+gpuNumListLabelName] = gpuNumLists[0] + for i := 1; i < len(gpuNumLists); i++ { + l.labels[labelNamespace+gpuNumListLabelName+strconv.FormatInt(int64(i+1), 10)] = gpuNumLists[i] + } // all GPUs get default number of millicores (1000) l.labels.addNumericLabel(labelNamespace+millicoreLabelName, int64(millicoresPerGPU*gpuCount)) + + // add pci-group label(s), (two group example: "1.2.3.4_5.6.7.8") + allPCIGroups := l.createPCIGroupLabel(gpuNumList) + if allPCIGroups != "" { + pciGroups := split(allPCIGroups, labelMaxLength) + + l.labels[labelNamespace+pciGroupLabelName] = pciGroups[0] + for i := 1; i < len(pciGroups); i++ { + l.labels[labelNamespace+pciGroupLabelName+strconv.FormatInt(int64(i+1), 10)] = pciGroups[i] + } + } } return nil diff --git a/cmd/gpu_nfdhook/labeler_test.go b/cmd/gpu_nfdhook/labeler_test.go index 
be028a6d..6f49066d 100644 --- a/cmd/gpu_nfdhook/labeler_test.go +++ b/cmd/gpu_nfdhook/labeler_test.go @@ -31,6 +31,7 @@ type testcase struct { sysfsdirs []string memoryOverride uint64 memoryReserved uint64 + pciGroupLevel uint64 } func getTestCases() []testcase { @@ -62,6 +63,7 @@ func getTestCases() []testcase { "gpu.intel.com/media_version": "9", "gpu.intel.com/platform_gen": "9", "gpu.intel.com/cards": "card0", + "gpu.intel.com/gpu-numbers": "0", }, }, { @@ -110,6 +112,7 @@ func getTestCases() []testcase { "gpu.intel.com/media_version": "9", "gpu.intel.com/platform_gen": "9", "gpu.intel.com/cards": "card0", + "gpu.intel.com/gpu-numbers": "0", }, }, { @@ -140,6 +143,7 @@ func getTestCases() []testcase { "gpu.intel.com/media_version": "9", "gpu.intel.com/platform_gen": "9", "gpu.intel.com/cards": "card0", + "gpu.intel.com/gpu-numbers": "0", }, }, { @@ -167,6 +171,7 @@ func getTestCases() []testcase { "gpu.intel.com/media_version": "9", "gpu.intel.com/platform_gen": "9", "gpu.intel.com/cards": "card0", + "gpu.intel.com/gpu-numbers": "0", }, }, { @@ -190,6 +195,7 @@ func getTestCases() []testcase { "gpu.intel.com/platform_new.present": "true", "gpu.intel.com/platform_new.tiles": "1", "gpu.intel.com/cards": "card0", + "gpu.intel.com/gpu-numbers": "0", }, }, { @@ -218,6 +224,7 @@ func getTestCases() []testcase { "gpu.intel.com/media_version": "12.5", "gpu.intel.com/platform_gen": "12", "gpu.intel.com/cards": "card0", + "gpu.intel.com/gpu-numbers": "0", }, }, { @@ -244,6 +251,7 @@ func getTestCases() []testcase { "gpu.intel.com/media_version": "12.5", "gpu.intel.com/platform_gen": "12", "gpu.intel.com/cards": "card0", + "gpu.intel.com/gpu-numbers": "0", }, }, { @@ -264,9 +272,137 @@ func getTestCases() []testcase { }, expectedRetval: nil, expectedLabels: labelMap{ - "gpu.intel.com/millicores": "2000", - "gpu.intel.com/memory.max": "32000000000", - "gpu.intel.com/cards": "card0.card1", + "gpu.intel.com/millicores": "2000", + "gpu.intel.com/memory.max": "32000000000", + 
"gpu.intel.com/cards": "card0.card1", + "gpu.intel.com/gpu-numbers": "0.1", + }, + }, + { + sysfsdirs: []string{ + "card0/device/drm/card0", + "card1/device/drm/card1", + }, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + "card1/device/vendor": []byte("0x8086"), + }, + name: "when all the gpus are in the same pci-group", + memoryOverride: 16000000000, + capabilityFile: map[string][]byte{ + "foobar": []byte( + "platform: new\n" + + "gen: 9"), + }, + expectedRetval: nil, + expectedLabels: labelMap{ + "gpu.intel.com/millicores": "2000", + "gpu.intel.com/memory.max": "32000000000", + "gpu.intel.com/cards": "card0.card1", + "gpu.intel.com/gpu-numbers": "0.1", + "gpu.intel.com/pci-groups": "0.1", + }, + pciGroupLevel: 2, + }, + { + sysfsdirs: []string{ + "card0/device/drm/card0", + "card1/device/drm/card1", + }, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + "card1/device/vendor": []byte("0x8086"), + }, + name: "when all the gpus belong to different pci-groups", + memoryOverride: 16000000000, + capabilityFile: map[string][]byte{ + "foobar": []byte( + "platform: new\n" + + "gen: 9"), + }, + expectedRetval: nil, + expectedLabels: labelMap{ + "gpu.intel.com/millicores": "2000", + "gpu.intel.com/memory.max": "32000000000", + "gpu.intel.com/cards": "card0.card1", + "gpu.intel.com/gpu-numbers": "0.1", + "gpu.intel.com/pci-groups": "0_1", + }, + pciGroupLevel: 4, + }, + { + sysfsdirs: []string{ + "card0/device/drm/card0", + "card1/device/drm/card1", + "card2/device/drm/card2", + "card3/device/drm/card3", + "card4/device/drm/card4", + "card5/device/drm/card5", + "card6/device/drm/card6", + "card7/device/drm/card7", + "card8/device/drm/card8", + "card9/device/drm/card9", + "card10/device/drm/card10", + "card11/device/drm/card11", + "card12/device/drm/card12", + "card13/device/drm/card13", + "card14/device/drm/card14", + "card15/device/drm/card15", + "card16/device/drm/card16", + "card17/device/drm/card17", + 
"card18/device/drm/card18", + "card19/device/drm/card19", + "card20/device/drm/card20", + "card21/device/drm/card21", + "card22/device/drm/card22", + "card23/device/drm/card23", + "card24/device/drm/card24", + "card25/device/drm/card25", + "card26/device/drm/card26", + }, + sysfsfiles: map[string][]byte{ + "card0/device/vendor": []byte("0x8086"), + "card1/device/vendor": []byte("0x8086"), + "card2/device/vendor": []byte("0x8086"), + "card3/device/vendor": []byte("0x8086"), + "card4/device/vendor": []byte("0x8086"), + "card5/device/vendor": []byte("0x8086"), + "card6/device/vendor": []byte("0x8086"), + "card7/device/vendor": []byte("0x8086"), + "card8/device/vendor": []byte("0x8086"), + "card9/device/vendor": []byte("0x8086"), + "card10/device/vendor": []byte("0x8086"), + "card11/device/vendor": []byte("0x8086"), + "card12/device/vendor": []byte("0x8086"), + "card13/device/vendor": []byte("0x8086"), + "card14/device/vendor": []byte("0x8086"), + "card15/device/vendor": []byte("0x8086"), + "card16/device/vendor": []byte("0x8086"), + "card17/device/vendor": []byte("0x8086"), + "card18/device/vendor": []byte("0x8086"), + "card19/device/vendor": []byte("0x8086"), + "card20/device/vendor": []byte("0x8086"), + "card21/device/vendor": []byte("0x8086"), + "card22/device/vendor": []byte("0x8086"), + "card23/device/vendor": []byte("0x8086"), + "card24/device/vendor": []byte("0x8086"), + "card25/device/vendor": []byte("0x8086"), + "card26/device/vendor": []byte("0x8086"), + }, + name: "when there are way too many gpus, cards label gets truncated", + memoryOverride: 16000000000, + capabilityFile: map[string][]byte{ + "foobar": []byte( + "platform: new\n" + + "gen: 9"), + }, + expectedRetval: nil, + expectedLabels: labelMap{ + "gpu.intel.com/millicores": "27000", + "gpu.intel.com/memory.max": "432000000000", + "gpu.intel.com/cards": "card0.card1.card10.card11.card12.card13.card14.card15.card16.ca", + "gpu.intel.com/gpu-numbers": 
"0.1.10.11.12.13.14.15.16.17.18.19.2.20.21.22.23.24.25.26.3.4.5.", + "gpu.intel.com/gpu-numbers2": "6.7.8.9", }, }, } @@ -293,6 +429,35 @@ func (tc *testcase) createFiles(t *testing.T, sysfs, root string) { } } +func TestSplit(t *testing.T) { + tests := []struct { + name string + str string + expectedResult []string + maxLength uint + }{ + { + name: "single small enough input string passes through unsplit", + str: "1.2.3.4", + maxLength: 10, + expectedResult: []string{"1.2.3.4"}, + }, + { + name: "foo_bar with maxLength 4 gets split to foo_ and bar", + str: "foo_bar", + maxLength: 4, + expectedResult: []string{"foo_", "bar"}, + }, + } + + for _, test := range tests { + result := split(test.str, test.maxLength) + if !reflect.DeepEqual(test.expectedResult, result) { + t.Errorf("\n%q ended up with unexpected result %v vs expected %v", test.name, result, test.expectedResult) + } + } +} + func TestLabeling(t *testing.T) { root, err := os.MkdirTemp("", "test_new_device_plugin") if err != nil { @@ -315,12 +480,13 @@ func TestLabeling(t *testing.T) { if err != nil { t.Fatalf("couldn't create dir: %s", err.Error()) } - sysfs := path.Join(subroot, sysfsDirectory) + sysfs := path.Join(subroot, "pci0000:00/0000:00:1b.4", sysfsDirectory) tc.createFiles(t, sysfs, subroot) os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10)) os.Setenv(memoryReservedEnv, strconv.FormatUint(tc.memoryReserved, 10)) + os.Setenv(pciGroupingEnv, strconv.FormatUint(tc.pciGroupLevel, 10)) labeler := newLabeler(sysfs, subroot) err = labeler.createLabels()