mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00
gpu nfdhook: new memory amount reading logic
This changes the memory reading to be done through the lmem_total_bytes file instead of the addr_range file.

Signed-off-by: Ukri Niemimuukko <ukri.niemimuukko@intel.com>
This commit is contained in:
parent
32e8c5902a
commit
64290020d7
@ -118,39 +118,37 @@ func fallback() uint64 {
|
|||||||
return getEnvVarNumber(memoryOverrideEnv)
|
return getEnvVarNumber(memoryOverrideEnv)
|
||||||
}
|
}
|
||||||
|
|
||||||
// getTileMemoryAmount reads the total GPU memory amount from the GPU tiles and returns it and the tile count.
|
func (l *labeler) getMemoryAmount(gpuName string, numTiles uint64) uint64 {
|
||||||
func (l *labeler) getTileMemoryAmount(gpuName string) (mem, numTiles uint64) {
|
|
||||||
reserved := getEnvVarNumber(memoryReservedEnv)
|
reserved := getEnvVarNumber(memoryReservedEnv)
|
||||||
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*/addr_range")
|
|
||||||
|
|
||||||
files, err := filepath.Glob(filePath)
|
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "lmem_total_bytes")
|
||||||
|
|
||||||
|
dat, err := os.ReadFile(filePath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
klog.V(4).Info("Can't read sysfs folder", err)
|
klog.Warning("Can't read file: ", err)
|
||||||
return fallback(), 1
|
return fallback()
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, fileName := range files {
|
totalPerTile, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 0, 64)
|
||||||
dat, err := os.ReadFile(fileName)
|
if err != nil {
|
||||||
if err != nil {
|
klog.Warning("Can't convert lmem_total_bytes: ", err)
|
||||||
klog.Warning("Skipping. Can't read file: ", err)
|
return fallback()
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
n, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 0, 64)
|
|
||||||
if err != nil {
|
|
||||||
klog.Warning("Skipping. Can't convert addr_range: ", err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
numTiles++
|
|
||||||
mem += n
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if mem == 0 {
|
return totalPerTile*numTiles - reserved
|
||||||
return fallback(), 1
|
}
|
||||||
|
|
||||||
|
// getTileCount reads the tile count.
|
||||||
|
func (l *labeler) getTileCount(gpuName string) (numTiles uint64) {
|
||||||
|
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*")
|
||||||
|
|
||||||
|
files, _ := filepath.Glob(filePath)
|
||||||
|
|
||||||
|
if len(files) == 0 {
|
||||||
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
return mem - reserved, numTiles
|
return uint64(len(files))
|
||||||
}
|
}
|
||||||
|
|
||||||
// addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value.
|
// addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value.
|
||||||
@ -218,8 +216,11 @@ func (l *labeler) createLabels() error {
|
|||||||
return errors.Wrap(err, "gpu name parsing error")
|
return errors.Wrap(err, "gpu name parsing error")
|
||||||
}
|
}
|
||||||
|
|
||||||
// read the memory amount to find a proper max allocation value
|
// read the tile count
|
||||||
memoryAmount, numTiles := l.getTileMemoryAmount(gpuName)
|
numTiles := l.getTileCount(gpuName)
|
||||||
|
|
||||||
|
// read memory amount
|
||||||
|
memoryAmount := l.getMemoryAmount(gpuName, numTiles)
|
||||||
|
|
||||||
// try to add capability labels
|
// try to add capability labels
|
||||||
l.createCapabilityLabels(gpuNum, numTiles)
|
l.createCapabilityLabels(gpuNum, numTiles)
|
||||||
|
@ -42,10 +42,10 @@ func getTestCases() []testcase {
|
|||||||
"card0/gt/gt0",
|
"card0/gt/gt0",
|
||||||
},
|
},
|
||||||
sysfsfiles: map[string][]byte{
|
sysfsfiles: map[string][]byte{
|
||||||
"card0/device/vendor": []byte("0x8086"),
|
"card0/device/vendor": []byte("0x8086"),
|
||||||
"card0/gt/gt0/addr_range": []byte("8086"),
|
"card0/lmem_total_bytes": []byte("8086"),
|
||||||
},
|
},
|
||||||
name: "successful labeling via gt0/addr_range",
|
name: "successful labeling via lmem_total_bytes",
|
||||||
memoryOverride: 16000000000,
|
memoryOverride: 16000000000,
|
||||||
capabilityFile: map[string][]byte{
|
capabilityFile: map[string][]byte{
|
||||||
"0/i915_capabilities": []byte(
|
"0/i915_capabilities": []byte(
|
||||||
@ -88,11 +88,10 @@ func getTestCases() []testcase {
|
|||||||
"card0/gt/gt1",
|
"card0/gt/gt1",
|
||||||
},
|
},
|
||||||
sysfsfiles: map[string][]byte{
|
sysfsfiles: map[string][]byte{
|
||||||
"card0/device/vendor": []byte("0x8086"),
|
"card0/device/vendor": []byte("0x8086"),
|
||||||
"card0/gt/gt0/addr_range": []byte("8086"),
|
"card0/lmem_total_bytes": []byte("8000"),
|
||||||
"card0/gt/gt1/addr_range": []byte("2"),
|
|
||||||
},
|
},
|
||||||
name: "successful labeling via gt0/addr_range and gt1/addr_range",
|
name: "successful labeling via card0/lmem_total_bytes and two tiles",
|
||||||
memoryOverride: 16000000000,
|
memoryOverride: 16000000000,
|
||||||
capabilityFile: map[string][]byte{
|
capabilityFile: map[string][]byte{
|
||||||
"0/i915_capabilities": []byte(
|
"0/i915_capabilities": []byte(
|
||||||
@ -102,7 +101,7 @@ func getTestCases() []testcase {
|
|||||||
expectedRetval: nil,
|
expectedRetval: nil,
|
||||||
expectedLabels: labelMap{
|
expectedLabels: labelMap{
|
||||||
"gpu.intel.com/millicores": "1000",
|
"gpu.intel.com/millicores": "1000",
|
||||||
"gpu.intel.com/memory.max": "8088",
|
"gpu.intel.com/memory.max": "16000",
|
||||||
"gpu.intel.com/platform_new.count": "1",
|
"gpu.intel.com/platform_new.count": "1",
|
||||||
"gpu.intel.com/platform_new.present": "true",
|
"gpu.intel.com/platform_new.present": "true",
|
||||||
"gpu.intel.com/platform_new.tiles": "2",
|
"gpu.intel.com/platform_new.tiles": "2",
|
||||||
@ -116,10 +115,10 @@ func getTestCases() []testcase {
|
|||||||
"card0/gt/gt0",
|
"card0/gt/gt0",
|
||||||
},
|
},
|
||||||
sysfsfiles: map[string][]byte{
|
sysfsfiles: map[string][]byte{
|
||||||
"card0/device/vendor": []byte("0x8086"),
|
"card0/device/vendor": []byte("0x8086"),
|
||||||
"card0/gt/gt0/addr_range": []byte("8086"),
|
"card0/lmem_total_bytes": []byte("8086"),
|
||||||
},
|
},
|
||||||
name: "successful labeling via gt0/addr_range and reserved memory",
|
name: "successful labeling via lmem_total_bytes and reserved memory",
|
||||||
memoryOverride: 16000000000,
|
memoryOverride: 16000000000,
|
||||||
memoryReserved: 86,
|
memoryReserved: 86,
|
||||||
capabilityFile: map[string][]byte{
|
capabilityFile: map[string][]byte{
|
||||||
@ -242,20 +241,25 @@ func TestLabeling(t *testing.T) {
|
|||||||
testcases := getTestCases()
|
testcases := getTestCases()
|
||||||
|
|
||||||
for _, tc := range testcases {
|
for _, tc := range testcases {
|
||||||
|
subroot, err := os.MkdirTemp(root, "tc")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("can't create temporary subroot directory: %+v", err)
|
||||||
|
}
|
||||||
|
|
||||||
tc := tc
|
tc := tc
|
||||||
t.Run(tc.name, func(t *testing.T) {
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
err := os.MkdirAll(path.Join(root, "0"), 0750)
|
err := os.MkdirAll(path.Join(subroot, "0"), 0750)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("couldn't create dir: %s", err.Error())
|
t.Fatalf("couldn't create dir: %s", err.Error())
|
||||||
}
|
}
|
||||||
sysfs := path.Join(root, sysfsDirectory)
|
sysfs := path.Join(subroot, sysfsDirectory)
|
||||||
|
|
||||||
tc.createFiles(t, sysfs, root)
|
tc.createFiles(t, sysfs, subroot)
|
||||||
|
|
||||||
os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10))
|
os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10))
|
||||||
os.Setenv(memoryReservedEnv, strconv.FormatUint(tc.memoryReserved, 10))
|
os.Setenv(memoryReservedEnv, strconv.FormatUint(tc.memoryReserved, 10))
|
||||||
|
|
||||||
labeler := newLabeler(sysfs, root)
|
labeler := newLabeler(sysfs, subroot)
|
||||||
err = labeler.createLabels()
|
err = labeler.createLabels()
|
||||||
if err != nil && tc.expectedRetval == nil ||
|
if err != nil && tc.expectedRetval == nil ||
|
||||||
err == nil && tc.expectedRetval != nil {
|
err == nil && tc.expectedRetval != nil {
|
||||||
@ -264,12 +268,6 @@ func TestLabeling(t *testing.T) {
|
|||||||
if tc.expectedRetval == nil && !reflect.DeepEqual(labeler.labels, tc.expectedLabels) {
|
if tc.expectedRetval == nil && !reflect.DeepEqual(labeler.labels, tc.expectedLabels) {
|
||||||
t.Errorf("test %v label mismatch with expectation:\n%v\n%v\n", tc.name, labeler.labels, tc.expectedLabels)
|
t.Errorf("test %v label mismatch with expectation:\n%v\n%v\n", tc.name, labeler.labels, tc.expectedLabels)
|
||||||
}
|
}
|
||||||
for filename := range tc.capabilityFile {
|
|
||||||
os.Remove(path.Join(root, filename))
|
|
||||||
}
|
|
||||||
for filename := range tc.sysfsfiles {
|
|
||||||
os.Remove(path.Join(sysfs, filename))
|
|
||||||
}
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user