gpu nfdhook: new memory amount reading logic

This changes the memory amount to be read from the lmem_total_bytes
file instead of the per-tile addr_range files.

Signed-off-by: Ukri Niemimuukko <ukri.niemimuukko@intel.com>
This commit is contained in:
Ukri Niemimuukko 2021-09-16 17:49:33 +03:00
parent 32e8c5902a
commit 64290020d7
2 changed files with 46 additions and 47 deletions

View File

@ -118,39 +118,37 @@ func fallback() uint64 {
return getEnvVarNumber(memoryOverrideEnv) return getEnvVarNumber(memoryOverrideEnv)
} }
// getTileMemoryAmount reads the total GPU memory amount from the GPU tiles and returns it and the tile count. func (l *labeler) getMemoryAmount(gpuName string, numTiles uint64) uint64 {
func (l *labeler) getTileMemoryAmount(gpuName string) (mem, numTiles uint64) {
reserved := getEnvVarNumber(memoryReservedEnv) reserved := getEnvVarNumber(memoryReservedEnv)
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*/addr_range")
files, err := filepath.Glob(filePath) filePath := filepath.Join(l.sysfsDRMDir, gpuName, "lmem_total_bytes")
dat, err := os.ReadFile(filePath)
if err != nil { if err != nil {
klog.V(4).Info("Can't read sysfs folder", err) klog.Warning("Can't read file: ", err)
return fallback(), 1 return fallback()
} }
for _, fileName := range files { totalPerTile, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 0, 64)
dat, err := os.ReadFile(fileName)
if err != nil { if err != nil {
klog.Warning("Skipping. Can't read file: ", err) klog.Warning("Can't convert lmem_total_bytes: ", err)
continue return fallback()
} }
n, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 0, 64) return totalPerTile*numTiles - reserved
if err != nil {
klog.Warning("Skipping. Can't convert addr_range: ", err)
continue
} }
numTiles++ // getTileCount reads the tile count.
mem += n func (l *labeler) getTileCount(gpuName string) (numTiles uint64) {
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*")
files, _ := filepath.Glob(filePath)
if len(files) == 0 {
return 1
} }
if mem == 0 { return uint64(len(files))
return fallback(), 1
}
return mem - reserved, numTiles
} }
// addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value. // addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value.
@ -218,8 +216,11 @@ func (l *labeler) createLabels() error {
return errors.Wrap(err, "gpu name parsing error") return errors.Wrap(err, "gpu name parsing error")
} }
// read the memory amount to find a proper max allocation value // read the tile count
memoryAmount, numTiles := l.getTileMemoryAmount(gpuName) numTiles := l.getTileCount(gpuName)
// read memory amount
memoryAmount := l.getMemoryAmount(gpuName, numTiles)
// try to add capability labels // try to add capability labels
l.createCapabilityLabels(gpuNum, numTiles) l.createCapabilityLabels(gpuNum, numTiles)

View File

@ -43,9 +43,9 @@ func getTestCases() []testcase {
}, },
sysfsfiles: map[string][]byte{ sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"), "card0/device/vendor": []byte("0x8086"),
"card0/gt/gt0/addr_range": []byte("8086"), "card0/lmem_total_bytes": []byte("8086"),
}, },
name: "successful labeling via gt0/addr_range", name: "successful labeling via lmem_total_bytes",
memoryOverride: 16000000000, memoryOverride: 16000000000,
capabilityFile: map[string][]byte{ capabilityFile: map[string][]byte{
"0/i915_capabilities": []byte( "0/i915_capabilities": []byte(
@ -89,10 +89,9 @@ func getTestCases() []testcase {
}, },
sysfsfiles: map[string][]byte{ sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"), "card0/device/vendor": []byte("0x8086"),
"card0/gt/gt0/addr_range": []byte("8086"), "card0/lmem_total_bytes": []byte("8000"),
"card0/gt/gt1/addr_range": []byte("2"),
}, },
name: "successful labeling via gt0/addr_range and gt1/addr_range", name: "successful labeling via card0/lmem_total_bytes and two tiles",
memoryOverride: 16000000000, memoryOverride: 16000000000,
capabilityFile: map[string][]byte{ capabilityFile: map[string][]byte{
"0/i915_capabilities": []byte( "0/i915_capabilities": []byte(
@ -102,7 +101,7 @@ func getTestCases() []testcase {
expectedRetval: nil, expectedRetval: nil,
expectedLabels: labelMap{ expectedLabels: labelMap{
"gpu.intel.com/millicores": "1000", "gpu.intel.com/millicores": "1000",
"gpu.intel.com/memory.max": "8088", "gpu.intel.com/memory.max": "16000",
"gpu.intel.com/platform_new.count": "1", "gpu.intel.com/platform_new.count": "1",
"gpu.intel.com/platform_new.present": "true", "gpu.intel.com/platform_new.present": "true",
"gpu.intel.com/platform_new.tiles": "2", "gpu.intel.com/platform_new.tiles": "2",
@ -117,9 +116,9 @@ func getTestCases() []testcase {
}, },
sysfsfiles: map[string][]byte{ sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"), "card0/device/vendor": []byte("0x8086"),
"card0/gt/gt0/addr_range": []byte("8086"), "card0/lmem_total_bytes": []byte("8086"),
}, },
name: "successful labeling via gt0/addr_range and reserved memory", name: "successful labeling via lmem_total_bytes and reserved memory",
memoryOverride: 16000000000, memoryOverride: 16000000000,
memoryReserved: 86, memoryReserved: 86,
capabilityFile: map[string][]byte{ capabilityFile: map[string][]byte{
@ -242,20 +241,25 @@ func TestLabeling(t *testing.T) {
testcases := getTestCases() testcases := getTestCases()
for _, tc := range testcases { for _, tc := range testcases {
subroot, err := os.MkdirTemp(root, "tc")
if err != nil {
t.Fatalf("can't create temporary subroot directory: %+v", err)
}
tc := tc tc := tc
t.Run(tc.name, func(t *testing.T) { t.Run(tc.name, func(t *testing.T) {
err := os.MkdirAll(path.Join(root, "0"), 0750) err := os.MkdirAll(path.Join(subroot, "0"), 0750)
if err != nil { if err != nil {
t.Fatalf("couldn't create dir: %s", err.Error()) t.Fatalf("couldn't create dir: %s", err.Error())
} }
sysfs := path.Join(root, sysfsDirectory) sysfs := path.Join(subroot, sysfsDirectory)
tc.createFiles(t, sysfs, root) tc.createFiles(t, sysfs, subroot)
os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10)) os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10))
os.Setenv(memoryReservedEnv, strconv.FormatUint(tc.memoryReserved, 10)) os.Setenv(memoryReservedEnv, strconv.FormatUint(tc.memoryReserved, 10))
labeler := newLabeler(sysfs, root) labeler := newLabeler(sysfs, subroot)
err = labeler.createLabels() err = labeler.createLabels()
if err != nil && tc.expectedRetval == nil || if err != nil && tc.expectedRetval == nil ||
err == nil && tc.expectedRetval != nil { err == nil && tc.expectedRetval != nil {
@ -264,12 +268,6 @@ func TestLabeling(t *testing.T) {
if tc.expectedRetval == nil && !reflect.DeepEqual(labeler.labels, tc.expectedLabels) { if tc.expectedRetval == nil && !reflect.DeepEqual(labeler.labels, tc.expectedLabels) {
t.Errorf("test %v label mismatch with expectation:\n%v\n%v\n", tc.name, labeler.labels, tc.expectedLabels) t.Errorf("test %v label mismatch with expectation:\n%v\n%v\n", tc.name, labeler.labels, tc.expectedLabels)
} }
for filename := range tc.capabilityFile {
os.Remove(path.Join(root, filename))
}
for filename := range tc.sysfsfiles {
os.Remove(path.Join(sysfs, filename))
}
}) })
} }
} }