mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00
gpu nfdhook: new memory amount reading logic
This changes the memory reading to be done through lmem_total_bytes file instead of the addr_range file. Signed-off-by: Ukri Niemimuukko <ukri.niemimuukko@intel.com>
This commit is contained in:
parent
32e8c5902a
commit
64290020d7
@ -118,39 +118,37 @@ func fallback() uint64 {
|
||||
return getEnvVarNumber(memoryOverrideEnv)
|
||||
}
|
||||
|
||||
// getTileMemoryAmount reads the total GPU memory amount from the GPU tiles and returns it and the tile count.
|
||||
func (l *labeler) getTileMemoryAmount(gpuName string) (mem, numTiles uint64) {
|
||||
func (l *labeler) getMemoryAmount(gpuName string, numTiles uint64) uint64 {
|
||||
reserved := getEnvVarNumber(memoryReservedEnv)
|
||||
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*/addr_range")
|
||||
|
||||
files, err := filepath.Glob(filePath)
|
||||
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "lmem_total_bytes")
|
||||
|
||||
dat, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
klog.V(4).Info("Can't read sysfs folder", err)
|
||||
return fallback(), 1
|
||||
klog.Warning("Can't read file: ", err)
|
||||
return fallback()
|
||||
}
|
||||
|
||||
for _, fileName := range files {
|
||||
dat, err := os.ReadFile(fileName)
|
||||
totalPerTile, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 0, 64)
|
||||
if err != nil {
|
||||
klog.Warning("Skipping. Can't read file: ", err)
|
||||
continue
|
||||
klog.Warning("Can't convert lmem_total_bytes: ", err)
|
||||
return fallback()
|
||||
}
|
||||
|
||||
n, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 0, 64)
|
||||
if err != nil {
|
||||
klog.Warning("Skipping. Can't convert addr_range: ", err)
|
||||
continue
|
||||
return totalPerTile*numTiles - reserved
|
||||
}
|
||||
|
||||
numTiles++
|
||||
mem += n
|
||||
// getTileCount reads the tile count.
|
||||
func (l *labeler) getTileCount(gpuName string) (numTiles uint64) {
|
||||
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*")
|
||||
|
||||
files, _ := filepath.Glob(filePath)
|
||||
|
||||
if len(files) == 0 {
|
||||
return 1
|
||||
}
|
||||
|
||||
if mem == 0 {
|
||||
return fallback(), 1
|
||||
}
|
||||
|
||||
return mem - reserved, numTiles
|
||||
return uint64(len(files))
|
||||
}
|
||||
|
||||
// addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value.
|
||||
@ -218,8 +216,11 @@ func (l *labeler) createLabels() error {
|
||||
return errors.Wrap(err, "gpu name parsing error")
|
||||
}
|
||||
|
||||
// read the memory amount to find a proper max allocation value
|
||||
memoryAmount, numTiles := l.getTileMemoryAmount(gpuName)
|
||||
// read the tile count
|
||||
numTiles := l.getTileCount(gpuName)
|
||||
|
||||
// read memory amount
|
||||
memoryAmount := l.getMemoryAmount(gpuName, numTiles)
|
||||
|
||||
// try to add capability labels
|
||||
l.createCapabilityLabels(gpuNum, numTiles)
|
||||
|
@ -43,9 +43,9 @@ func getTestCases() []testcase {
|
||||
},
|
||||
sysfsfiles: map[string][]byte{
|
||||
"card0/device/vendor": []byte("0x8086"),
|
||||
"card0/gt/gt0/addr_range": []byte("8086"),
|
||||
"card0/lmem_total_bytes": []byte("8086"),
|
||||
},
|
||||
name: "successful labeling via gt0/addr_range",
|
||||
name: "successful labeling via lmem_total_bytes",
|
||||
memoryOverride: 16000000000,
|
||||
capabilityFile: map[string][]byte{
|
||||
"0/i915_capabilities": []byte(
|
||||
@ -89,10 +89,9 @@ func getTestCases() []testcase {
|
||||
},
|
||||
sysfsfiles: map[string][]byte{
|
||||
"card0/device/vendor": []byte("0x8086"),
|
||||
"card0/gt/gt0/addr_range": []byte("8086"),
|
||||
"card0/gt/gt1/addr_range": []byte("2"),
|
||||
"card0/lmem_total_bytes": []byte("8000"),
|
||||
},
|
||||
name: "successful labeling via gt0/addr_range and gt1/addr_range",
|
||||
name: "successful labeling via card0/lmem_total_bytes and two tiles",
|
||||
memoryOverride: 16000000000,
|
||||
capabilityFile: map[string][]byte{
|
||||
"0/i915_capabilities": []byte(
|
||||
@ -102,7 +101,7 @@ func getTestCases() []testcase {
|
||||
expectedRetval: nil,
|
||||
expectedLabels: labelMap{
|
||||
"gpu.intel.com/millicores": "1000",
|
||||
"gpu.intel.com/memory.max": "8088",
|
||||
"gpu.intel.com/memory.max": "16000",
|
||||
"gpu.intel.com/platform_new.count": "1",
|
||||
"gpu.intel.com/platform_new.present": "true",
|
||||
"gpu.intel.com/platform_new.tiles": "2",
|
||||
@ -117,9 +116,9 @@ func getTestCases() []testcase {
|
||||
},
|
||||
sysfsfiles: map[string][]byte{
|
||||
"card0/device/vendor": []byte("0x8086"),
|
||||
"card0/gt/gt0/addr_range": []byte("8086"),
|
||||
"card0/lmem_total_bytes": []byte("8086"),
|
||||
},
|
||||
name: "successful labeling via gt0/addr_range and reserved memory",
|
||||
name: "successful labeling via lmem_total_bytes and reserved memory",
|
||||
memoryOverride: 16000000000,
|
||||
memoryReserved: 86,
|
||||
capabilityFile: map[string][]byte{
|
||||
@ -242,20 +241,25 @@ func TestLabeling(t *testing.T) {
|
||||
testcases := getTestCases()
|
||||
|
||||
for _, tc := range testcases {
|
||||
subroot, err := os.MkdirTemp(root, "tc")
|
||||
if err != nil {
|
||||
t.Fatalf("can't create temporary subroot directory: %+v", err)
|
||||
}
|
||||
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
err := os.MkdirAll(path.Join(root, "0"), 0750)
|
||||
err := os.MkdirAll(path.Join(subroot, "0"), 0750)
|
||||
if err != nil {
|
||||
t.Fatalf("couldn't create dir: %s", err.Error())
|
||||
}
|
||||
sysfs := path.Join(root, sysfsDirectory)
|
||||
sysfs := path.Join(subroot, sysfsDirectory)
|
||||
|
||||
tc.createFiles(t, sysfs, root)
|
||||
tc.createFiles(t, sysfs, subroot)
|
||||
|
||||
os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10))
|
||||
os.Setenv(memoryReservedEnv, strconv.FormatUint(tc.memoryReserved, 10))
|
||||
|
||||
labeler := newLabeler(sysfs, root)
|
||||
labeler := newLabeler(sysfs, subroot)
|
||||
err = labeler.createLabels()
|
||||
if err != nil && tc.expectedRetval == nil ||
|
||||
err == nil && tc.expectedRetval != nil {
|
||||
@ -264,12 +268,6 @@ func TestLabeling(t *testing.T) {
|
||||
if tc.expectedRetval == nil && !reflect.DeepEqual(labeler.labels, tc.expectedLabels) {
|
||||
t.Errorf("test %v label mismatch with expectation:\n%v\n%v\n", tc.name, labeler.labels, tc.expectedLabels)
|
||||
}
|
||||
for filename := range tc.capabilityFile {
|
||||
os.Remove(path.Join(root, filename))
|
||||
}
|
||||
for filename := range tc.sysfsfiles {
|
||||
os.Remove(path.Join(sysfs, filename))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user