Merge pull request #854 from uniemimu/pcigrouping

gpu_nfdhook: gpu-numbers and pci-groups
This commit is contained in:
Ed Bartosh 2022-01-25 11:28:30 +02:00 committed by GitHub
commit 4ffbfeba29
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 327 additions and 23 deletions

View File

@ -31,7 +31,28 @@ name | type | description|
-----|------|------|
|`gpu.intel.com/millicores`| number | node GPU count * 1000. Can be used as a finer grained shared execution fraction.
|`gpu.intel.com/memory.max`| number | sum of detected [GPU memory amounts](#GPU-memory) in bytes OR environment variable value * GPU count
|`gpu.intel.com/cards`| string | list of card names separated by '`.`'. The names match host `card*`-folders under `/sys/class/drm/`.
|`gpu.intel.com/cards`| string | list of card names separated by '`.`'. The names match host `card*`-folders under `/sys/class/drm/`. Deprecated, use `gpu-numbers`.
|`gpu.intel.com/gpu-numbers`| string | list of numbers separated by '`.`'. The numbers correspond to device file numbers for the primary nodes of given GPUs in kernel DRI subsystem, listed as `/dev/dri/card<num>` in devfs, and `/sys/class/drm/card<num>` in sysfs.
If the value of the `gpu-numbers` label would not fit into the 63 character length limit, you will also get labels `gpu-numbers2`,
`gpu-numbers3`... until all the gpu numbers have been labeled.
## PCI-groups (optional)
GPUs which share the same PCI path prefix under `/sys/devices/pci*` can be grouped into a label. GPU numbers within a group are separated by '`.`' and
groups are separated by '`_`'. The label is created only if the environment variable `GPU_PCI_GROUPING_LEVEL` has a value greater
than zero. GPUs are considered to belong to the same group if their sysfs device paths share as many identical consecutive folder names as the value
of the environment variable. Counting starts from the folder whose name starts with `pci`.
For example, the SG1 card has 4 GPUs, which end up sharing pci-folder names under `/sys/devices`. With a `GPU_PCI_GROUPING_LEVEL`
of 3, a node with two such SG1 cards could produce a `pci-groups` label with a value of `0.1.2.3_4.5.6.7`.
name | type | description|
-----|------|------|
|`gpu.intel.com/pci-groups`| string | list of pci-groups separated by '`_`'. GPU numbers in the groups are separated by '`.`'. The numbers correspond to device file numbers for the primary nodes of given GPUs in kernel DRI subsystem, listed as `/dev/dri/card<num>` in devfs, and `/sys/class/drm/card<num>` in sysfs.
If the value of the `pci-groups` label would not fit into the 63 character length limit, you will also get labels `pci-groups2`,
`pci-groups3`... until all the pci groups have been labeled.
## Capability labels (optional)

View File

@ -21,6 +21,7 @@ import (
"path"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
@ -30,15 +31,19 @@ import (
)
const (
labelNamespace = "gpu.intel.com/"
gpuListLabelName = "cards"
millicoreLabelName = "millicores"
millicoresPerGPU = 1000
memoryOverrideEnv = "GPU_MEMORY_OVERRIDE"
memoryReservedEnv = "GPU_MEMORY_RESERVED"
gpuDeviceRE = `^card[0-9]+$`
controlDeviceRE = `^controlD[0-9]+$`
vendorString = "0x8086"
labelNamespace = "gpu.intel.com/"
gpuListLabelName = "cards"
// gpuNumListLabelName is the label listing the GPU numbers (example: "0.1.2"),
// the short form of the deprecated gpuListLabelName label.
gpuNumListLabelName = "gpu-numbers"
millicoreLabelName = "millicores"
// pciGroupLabelName is the label holding the pci groups (example: "0.1.2.3_4.5.6.7").
pciGroupLabelName = "pci-groups"
millicoresPerGPU = 1000
memoryOverrideEnv = "GPU_MEMORY_OVERRIDE"
memoryReservedEnv = "GPU_MEMORY_RESERVED"
// pciGroupingEnv names the environment variable that gives the pci grouping level.
// A value of zero means that no pci-groups label is created.
pciGroupingEnv = "GPU_PCI_GROUPING_LEVEL"
gpuDeviceRE = `^card[0-9]+$`
controlDeviceRE = `^controlD[0-9]+$`
vendorString = "0x8086"
// labelMaxLength is the maximum length of a single label value. Longer values are
// cut into chunks of this size before being stored into labels.
labelMaxLength = 63
)
type labelMap map[string]string
@ -62,6 +67,38 @@ func newLabeler(sysfsDRMDir, debugfsDRIDir string) *labeler {
}
}
// getPCIPathParts extracts numFolders consecutive folder names from fullPath,
// beginning at the first folder whose name has the prefix "pci", and returns
// them joined with "/".
//
// It returns "" when fullPath contains no "/", when numFolders is zero, when no
// folder starts with "pci", or when fewer than numFolders folders exist from
// the "pci" folder onwards.
func getPCIPathParts(numFolders uint64, fullPath string) string {
	if numFolders == 0 || !strings.Contains(fullPath, "/") {
		return ""
	}

	folders := strings.Split(fullPath, "/")

	// locate the first folder with the "pci" prefix
	start := -1
	for idx, folder := range folders {
		if strings.HasPrefix(folder, "pci") {
			start = idx
			break
		}
	}

	if start < 0 {
		return ""
	}

	// not enough folders left after (and including) the pci folder
	if numFolders > uint64(len(folders)-start) {
		return ""
	}

	return strings.Join(folders[start:start+int(numFolders)], "/")
}
func (l *labeler) scan() ([]string, error) {
files, err := os.ReadDir(l.sysfsDRMDir)
gpuNameList := []string{}
@ -164,9 +201,9 @@ func (lm labelMap) addNumericLabel(labelName string, valueToAdd int64) {
}
// createCapabilityLabels creates labels from the gpu capability file under debugfs.
func (l *labeler) createCapabilityLabels(cardNum string, numTiles uint64) {
func (l *labeler) createCapabilityLabels(gpuNum string, numTiles uint64) {
// try to read the capabilities from the i915_capabilities file
file, err := os.Open(filepath.Join(l.debugfsDRIDir, cardNum, "i915_capabilities"))
file, err := os.Open(filepath.Join(l.debugfsDRIDir, gpuNum, "i915_capabilities"))
if err != nil {
klog.V(3).Infof("Couldn't open file:%s", err.Error()) // debugfs is not stable, there is no need to spam with error level prints
return
@ -245,6 +282,66 @@ scanning:
}
}
// createPCIGroupLabel builds the value of the pci-groups label from the given gpu numbers.
// Groups are separated by "_" and the gpu numbers inside a group by ".".
// Example for two groups with 4 gpus: "0.1.2.3_4.5.6.7".
//
// The grouping level is read from the environment variable named by pciGroupingEnv;
// when it is zero, "" is returned. GPUs whose "card<num>" entry under the sysfs drm
// directory can't be resolved, or whose resolved path yields no pci path part, are
// left out of the label.
func (l *labeler) createPCIGroupLabel(gpuNumList []string) string {
	level := getEnvVarNumber(pciGroupingEnv)
	if level == 0 {
		return ""
	}

	// pci path part -> gpu numbers sharing it
	groups := map[string][]string{}

	for _, gpuNum := range gpuNumList {
		target, err := filepath.EvalSymlinks(path.Join(l.sysfsDRMDir, "card"+gpuNum))
		if err != nil {
			continue
		}

		pciPath := getPCIPathParts(level, target)
		if pciPath == "" {
			continue
		}

		groups[pciPath] = append(groups[pciPath], gpuNum)
	}

	// map iteration order is random, sort the group keys for a stable label value
	pciPaths := make([]string, 0, len(groups))
	for pciPath := range groups {
		pciPaths = append(pciPaths, pciPath)
	}

	sort.Strings(pciPaths)

	groupValues := make([]string, 0, len(pciPaths))
	for _, pciPath := range pciPaths {
		groupValues = append(groupValues, strings.Join(groups[pciPath], "."))
	}

	return strings.Join(groupValues, "_")
}
// split returns the given string cut to chunks of size up to maxLength size.
// maxLength refers to the max length (in bytes) of the strings in the returned slice.
// If the whole input string fits under maxLength, it is not split.
// split("foo_bar", 4) returns []string{"foo_", "bar"}.
//
// The returned slice always has at least one element, so split("", n) returns
// []string{""}. A maxLength of zero can't produce any progress, so in that case
// the input is returned unsplit as the only element instead of looping forever.
//
// NOTE(review): chunks are cut purely by length, so a chunk may begin or end with
// a separator character. Kubernetes label values are expected to begin and end
// with an alphanumeric character - confirm that the consumers accept such values.
func split(str string, maxLength uint) []string {
	if maxLength == 0 {
		return []string{str}
	}

	var results []string

	remainingString := str
	for uint(len(remainingString)) > maxLength {
		results = append(results, remainingString[:maxLength])
		remainingString = remainingString[maxLength:]
	}

	// the last (or only) chunk, which fits into maxLength
	return append(results, remainingString)
}
// createLabels is the main function of plugin labeler, it creates label-value pairs for the gpus.
func (l *labeler) createLabels() error {
gpuNameList, err := l.scan()
@ -252,19 +349,19 @@ func (l *labeler) createLabels() error {
return err
}
gpuNumList := []string{}
for _, gpuName := range gpuNameList {
gpuNum := ""
// extract card number as a string. scan() has already checked name syntax
// extract gpu number as a string. scan() has already checked name syntax
_, err = fmt.Sscanf(gpuName, "card%s", &gpuNum)
if err != nil {
return errors.Wrap(err, "gpu name parsing error")
}
// read the tile count
numTiles := l.getTileCount(gpuName)
// read memory amount
memoryAmount := l.getMemoryAmount(gpuName, numTiles)
gpuNumList = append(gpuNumList, gpuName[4:])
// try to add capability labels
l.createCapabilityLabels(gpuNum, numTiles)
@ -272,13 +369,33 @@ func (l *labeler) createLabels() error {
l.labels.addNumericLabel(labelNamespace+"memory.max", int64(memoryAmount))
}
gpuCount := len(gpuNameList)
gpuCount := len(gpuNumList)
if gpuCount > 0 {
// add gpu list label (example: "card0.card1.card2")
l.labels[labelNamespace+gpuListLabelName] = strings.Join(gpuNameList, ".")
// add gpu list label (example: "card0.card1.card2") - deprecated
l.labels[labelNamespace+gpuListLabelName] = split(strings.Join(gpuNameList, "."), labelMaxLength)[0]
// add gpu num list label(s) (example: "0.1.2", which is short form of "card0.card1.card2")
allGPUs := strings.Join(gpuNumList, ".")
gpuNumLists := split(allGPUs, labelMaxLength)
l.labels[labelNamespace+gpuNumListLabelName] = gpuNumLists[0]
for i := 1; i < len(gpuNumLists); i++ {
l.labels[labelNamespace+gpuNumListLabelName+strconv.FormatInt(int64(i+1), 10)] = gpuNumLists[i]
}
// all GPUs get default number of millicores (1000)
l.labels.addNumericLabel(labelNamespace+millicoreLabelName, int64(millicoresPerGPU*gpuCount))
// add pci-group label(s), (two group example: "1.2.3.4_5.6.7.8")
allPCIGroups := l.createPCIGroupLabel(gpuNumList)
if allPCIGroups != "" {
	pciGroups := split(allPCIGroups, labelMaxLength)
	l.labels[labelNamespace+pciGroupLabelName] = pciGroups[0]
	// The continuation labels ("pci-groups2", "pci-groups3", ...) must be driven by the
	// number of pci-group chunks. Using the gpu-number chunk count here would index
	// pciGroups out of range, or drop labels, whenever the two chunk counts differ.
	for i := 1; i < len(pciGroups); i++ {
		l.labels[labelNamespace+pciGroupLabelName+strconv.FormatInt(int64(i+1), 10)] = pciGroups[i]
	}
}
}
return nil

View File

@ -31,6 +31,7 @@ type testcase struct {
sysfsdirs []string
memoryOverride uint64
memoryReserved uint64
pciGroupLevel uint64
}
func getTestCases() []testcase {
@ -62,6 +63,7 @@ func getTestCases() []testcase {
"gpu.intel.com/media_version": "9",
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
},
},
{
@ -110,6 +112,7 @@ func getTestCases() []testcase {
"gpu.intel.com/media_version": "9",
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
},
},
{
@ -140,6 +143,7 @@ func getTestCases() []testcase {
"gpu.intel.com/media_version": "9",
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
},
},
{
@ -167,6 +171,7 @@ func getTestCases() []testcase {
"gpu.intel.com/media_version": "9",
"gpu.intel.com/platform_gen": "9",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
},
},
{
@ -190,6 +195,7 @@ func getTestCases() []testcase {
"gpu.intel.com/platform_new.present": "true",
"gpu.intel.com/platform_new.tiles": "1",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
},
},
{
@ -218,6 +224,7 @@ func getTestCases() []testcase {
"gpu.intel.com/media_version": "12.5",
"gpu.intel.com/platform_gen": "12",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
},
},
{
@ -244,6 +251,7 @@ func getTestCases() []testcase {
"gpu.intel.com/media_version": "12.5",
"gpu.intel.com/platform_gen": "12",
"gpu.intel.com/cards": "card0",
"gpu.intel.com/gpu-numbers": "0",
},
},
{
@ -264,9 +272,137 @@ func getTestCases() []testcase {
},
expectedRetval: nil,
expectedLabels: labelMap{
"gpu.intel.com/millicores": "2000",
"gpu.intel.com/memory.max": "32000000000",
"gpu.intel.com/cards": "card0.card1",
"gpu.intel.com/millicores": "2000",
"gpu.intel.com/memory.max": "32000000000",
"gpu.intel.com/cards": "card0.card1",
"gpu.intel.com/gpu-numbers": "0.1",
},
},
{
sysfsdirs: []string{
"card0/device/drm/card0",
"card1/device/drm/card1",
},
sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"),
"card1/device/vendor": []byte("0x8086"),
},
name: "when all the gpus are in the same pci-group",
memoryOverride: 16000000000,
capabilityFile: map[string][]byte{
"foobar": []byte(
"platform: new\n" +
"gen: 9"),
},
expectedRetval: nil,
expectedLabels: labelMap{
"gpu.intel.com/millicores": "2000",
"gpu.intel.com/memory.max": "32000000000",
"gpu.intel.com/cards": "card0.card1",
"gpu.intel.com/gpu-numbers": "0.1",
"gpu.intel.com/pci-groups": "0.1",
},
pciGroupLevel: 2,
},
{
sysfsdirs: []string{
"card0/device/drm/card0",
"card1/device/drm/card1",
},
sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"),
"card1/device/vendor": []byte("0x8086"),
},
name: "when all the gpus belong to different pci-groups",
memoryOverride: 16000000000,
capabilityFile: map[string][]byte{
"foobar": []byte(
"platform: new\n" +
"gen: 9"),
},
expectedRetval: nil,
expectedLabels: labelMap{
"gpu.intel.com/millicores": "2000",
"gpu.intel.com/memory.max": "32000000000",
"gpu.intel.com/cards": "card0.card1",
"gpu.intel.com/gpu-numbers": "0.1",
"gpu.intel.com/pci-groups": "0_1",
},
pciGroupLevel: 4,
},
{
sysfsdirs: []string{
"card0/device/drm/card0",
"card1/device/drm/card1",
"card2/device/drm/card2",
"card3/device/drm/card3",
"card4/device/drm/card4",
"card5/device/drm/card5",
"card6/device/drm/card6",
"card7/device/drm/card7",
"card8/device/drm/card8",
"card9/device/drm/card9",
"card10/device/drm/card10",
"card11/device/drm/card11",
"card12/device/drm/card12",
"card13/device/drm/card13",
"card14/device/drm/card14",
"card15/device/drm/card15",
"card16/device/drm/card16",
"card17/device/drm/card17",
"card18/device/drm/card18",
"card19/device/drm/card19",
"card20/device/drm/card20",
"card21/device/drm/card21",
"card22/device/drm/card22",
"card23/device/drm/card23",
"card24/device/drm/card24",
"card25/device/drm/card25",
"card26/device/drm/card26",
},
sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"),
"card1/device/vendor": []byte("0x8086"),
"card2/device/vendor": []byte("0x8086"),
"card3/device/vendor": []byte("0x8086"),
"card4/device/vendor": []byte("0x8086"),
"card5/device/vendor": []byte("0x8086"),
"card6/device/vendor": []byte("0x8086"),
"card7/device/vendor": []byte("0x8086"),
"card8/device/vendor": []byte("0x8086"),
"card9/device/vendor": []byte("0x8086"),
"card10/device/vendor": []byte("0x8086"),
"card11/device/vendor": []byte("0x8086"),
"card12/device/vendor": []byte("0x8086"),
"card13/device/vendor": []byte("0x8086"),
"card14/device/vendor": []byte("0x8086"),
"card15/device/vendor": []byte("0x8086"),
"card16/device/vendor": []byte("0x8086"),
"card17/device/vendor": []byte("0x8086"),
"card18/device/vendor": []byte("0x8086"),
"card19/device/vendor": []byte("0x8086"),
"card20/device/vendor": []byte("0x8086"),
"card21/device/vendor": []byte("0x8086"),
"card22/device/vendor": []byte("0x8086"),
"card23/device/vendor": []byte("0x8086"),
"card24/device/vendor": []byte("0x8086"),
"card25/device/vendor": []byte("0x8086"),
"card26/device/vendor": []byte("0x8086"),
},
name: "when there are way too many gpus, cards label gets truncated",
memoryOverride: 16000000000,
capabilityFile: map[string][]byte{
"foobar": []byte(
"platform: new\n" +
"gen: 9"),
},
expectedRetval: nil,
expectedLabels: labelMap{
"gpu.intel.com/millicores": "27000",
"gpu.intel.com/memory.max": "432000000000",
"gpu.intel.com/cards": "card0.card1.card10.card11.card12.card13.card14.card15.card16.ca",
"gpu.intel.com/gpu-numbers": "0.1.10.11.12.13.14.15.16.17.18.19.2.20.21.22.23.24.25.26.3.4.5.",
"gpu.intel.com/gpu-numbers2": "6.7.8.9",
},
},
}
@ -293,6 +429,35 @@ func (tc *testcase) createFiles(t *testing.T, sysfs, root string) {
}
}
// TestSplit checks that split leaves short inputs untouched and cuts longer
// inputs into chunks of at most maxLength.
func TestSplit(t *testing.T) {
	type splitCase struct {
		name           string
		str            string
		expectedResult []string
		maxLength      uint
	}

	cases := []splitCase{
		{
			name:           "single small enough input string passes through unsplit",
			str:            "1.2.3.4",
			maxLength:      10,
			expectedResult: []string{"1.2.3.4"},
		},
		{
			name:           "foo_bar with maxLength 4 gets split to foo_ and bar",
			str:            "foo_bar",
			maxLength:      4,
			expectedResult: []string{"foo_", "bar"},
		},
	}

	for i := range cases {
		tc := &cases[i]

		got := split(tc.str, tc.maxLength)
		if reflect.DeepEqual(tc.expectedResult, got) {
			continue
		}

		t.Errorf("\n%q ended up with unexpected result %v vs expected %v", tc.name, got, tc.expectedResult)
	}
}
func TestLabeling(t *testing.T) {
root, err := os.MkdirTemp("", "test_new_device_plugin")
if err != nil {
@ -315,12 +480,13 @@ func TestLabeling(t *testing.T) {
if err != nil {
t.Fatalf("couldn't create dir: %s", err.Error())
}
sysfs := path.Join(subroot, sysfsDirectory)
sysfs := path.Join(subroot, "pci0000:00/0000:00:1b.4", sysfsDirectory)
tc.createFiles(t, sysfs, subroot)
os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10))
os.Setenv(memoryReservedEnv, strconv.FormatUint(tc.memoryReserved, 10))
os.Setenv(pciGroupingEnv, strconv.FormatUint(tc.pciGroupLevel, 10))
labeler := newLabeler(sysfs, subroot)
err = labeler.createLabels()