gpu nfdhook: new memory amount reading logic

This changes the memory amount reading to use the lmem_total_bytes file
instead of the per-tile addr_range files. The value read is multiplied by
the tile count, which is now determined separately.

Signed-off-by: Ukri Niemimuukko <ukri.niemimuukko@intel.com>
This commit is contained in:
Ukri Niemimuukko 2021-09-16 17:49:33 +03:00
parent 32e8c5902a
commit 64290020d7
2 changed files with 46 additions and 47 deletions

View File

@ -118,39 +118,37 @@ func fallback() uint64 {
return getEnvVarNumber(memoryOverrideEnv)
}
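// getMemoryAmount reads the per-tile memory amount from the GPU's lmem_total_bytes file and returns it
// multiplied by numTiles, minus the reserved amount. If the file can't be read or parsed, the fallback value is returned.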
func (l *labeler) getTileMemoryAmount(gpuName string) (mem, numTiles uint64) {
func (l *labeler) getMemoryAmount(gpuName string, numTiles uint64) uint64 {
reserved := getEnvVarNumber(memoryReservedEnv)
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*/addr_range")
files, err := filepath.Glob(filePath)
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "lmem_total_bytes")
dat, err := os.ReadFile(filePath)
if err != nil {
klog.V(4).Info("Can't read sysfs folder", err)
return fallback(), 1
klog.Warning("Can't read file: ", err)
return fallback()
}
for _, fileName := range files {
dat, err := os.ReadFile(fileName)
totalPerTile, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 0, 64)
if err != nil {
klog.Warning("Skipping. Can't read file: ", err)
continue
klog.Warning("Can't convert lmem_total_bytes: ", err)
return fallback()
}
n, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 0, 64)
if err != nil {
klog.Warning("Skipping. Can't convert addr_range: ", err)
continue
return totalPerTile*numTiles - reserved
}
numTiles++
mem += n
// getTileCount returns the number of gt/gt* entries found for the GPU under the sysfs DRM directory, or 1 if none are found.
func (l *labeler) getTileCount(gpuName string) (numTiles uint64) {
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*")
files, _ := filepath.Glob(filePath)
if len(files) == 0 {
return 1
}
if mem == 0 {
return fallback(), 1
}
return mem - reserved, numTiles
return uint64(len(files))
}
// addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value.
@ -218,8 +216,11 @@ func (l *labeler) createLabels() error {
return errors.Wrap(err, "gpu name parsing error")
}
// read the memory amount to find a proper max allocation value
memoryAmount, numTiles := l.getTileMemoryAmount(gpuName)
// read the tile count
numTiles := l.getTileCount(gpuName)
// read memory amount
memoryAmount := l.getMemoryAmount(gpuName, numTiles)
// try to add capability labels
l.createCapabilityLabels(gpuNum, numTiles)

View File

@ -43,9 +43,9 @@ func getTestCases() []testcase {
},
sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"),
"card0/gt/gt0/addr_range": []byte("8086"),
"card0/lmem_total_bytes": []byte("8086"),
},
name: "successful labeling via gt0/addr_range",
name: "successful labeling via lmem_total_bytes",
memoryOverride: 16000000000,
capabilityFile: map[string][]byte{
"0/i915_capabilities": []byte(
@ -89,10 +89,9 @@ func getTestCases() []testcase {
},
sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"),
"card0/gt/gt0/addr_range": []byte("8086"),
"card0/gt/gt1/addr_range": []byte("2"),
"card0/lmem_total_bytes": []byte("8000"),
},
name: "successful labeling via gt0/addr_range and gt1/addr_range",
name: "successful labeling via card0/lmem_total_bytes and two tiles",
memoryOverride: 16000000000,
capabilityFile: map[string][]byte{
"0/i915_capabilities": []byte(
@ -102,7 +101,7 @@ func getTestCases() []testcase {
expectedRetval: nil,
expectedLabels: labelMap{
"gpu.intel.com/millicores": "1000",
"gpu.intel.com/memory.max": "8088",
"gpu.intel.com/memory.max": "16000",
"gpu.intel.com/platform_new.count": "1",
"gpu.intel.com/platform_new.present": "true",
"gpu.intel.com/platform_new.tiles": "2",
@ -117,9 +116,9 @@ func getTestCases() []testcase {
},
sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"),
"card0/gt/gt0/addr_range": []byte("8086"),
"card0/lmem_total_bytes": []byte("8086"),
},
name: "successful labeling via gt0/addr_range and reserved memory",
name: "successful labeling via lmem_total_bytes and reserved memory",
memoryOverride: 16000000000,
memoryReserved: 86,
capabilityFile: map[string][]byte{
@ -242,20 +241,25 @@ func TestLabeling(t *testing.T) {
testcases := getTestCases()
for _, tc := range testcases {
subroot, err := os.MkdirTemp(root, "tc")
if err != nil {
t.Fatalf("can't create temporary subroot directory: %+v", err)
}
tc := tc
t.Run(tc.name, func(t *testing.T) {
err := os.MkdirAll(path.Join(root, "0"), 0750)
err := os.MkdirAll(path.Join(subroot, "0"), 0750)
if err != nil {
t.Fatalf("couldn't create dir: %s", err.Error())
}
sysfs := path.Join(root, sysfsDirectory)
sysfs := path.Join(subroot, sysfsDirectory)
tc.createFiles(t, sysfs, root)
tc.createFiles(t, sysfs, subroot)
os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10))
os.Setenv(memoryReservedEnv, strconv.FormatUint(tc.memoryReserved, 10))
labeler := newLabeler(sysfs, root)
labeler := newLabeler(sysfs, subroot)
err = labeler.createLabels()
if err != nil && tc.expectedRetval == nil ||
err == nil && tc.expectedRetval != nil {
@ -264,12 +268,6 @@ func TestLabeling(t *testing.T) {
if tc.expectedRetval == nil && !reflect.DeepEqual(labeler.labels, tc.expectedLabels) {
t.Errorf("test %v label mismatch with expectation:\n%v\n%v\n", tc.name, labeler.labels, tc.expectedLabels)
}
for filename := range tc.capabilityFile {
os.Remove(path.Join(root, filename))
}
for filename := range tc.sysfsfiles {
os.Remove(path.Join(sysfs, filename))
}
})
}
}