From 64290020d74c9fef10a9d0ed9168424e89d16a2d Mon Sep 17 00:00:00 2001 From: Ukri Niemimuukko Date: Thu, 16 Sep 2021 17:49:33 +0300 Subject: [PATCH] gpu nfdhook: new memory amount reading logic This changes the memory reading to be done through lmem_total_bytes file instead of the addr_range file. Signed-off-by: Ukri Niemimuukko --- cmd/gpu_nfdhook/labeler.go | 53 +++++++++++++++++---------------- cmd/gpu_nfdhook/labeler_test.go | 40 ++++++++++++------------- 2 files changed, 46 insertions(+), 47 deletions(-) diff --git a/cmd/gpu_nfdhook/labeler.go b/cmd/gpu_nfdhook/labeler.go index f93b3fdd..eefef11a 100644 --- a/cmd/gpu_nfdhook/labeler.go +++ b/cmd/gpu_nfdhook/labeler.go @@ -118,39 +118,37 @@ func fallback() uint64 { return getEnvVarNumber(memoryOverrideEnv) } -// getTileMemoryAmount reads the total GPU memory amount from the GPU tiles and returns it and the tile count. -func (l *labeler) getTileMemoryAmount(gpuName string) (mem, numTiles uint64) { +func (l *labeler) getMemoryAmount(gpuName string, numTiles uint64) uint64 { reserved := getEnvVarNumber(memoryReservedEnv) - filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*/addr_range") - files, err := filepath.Glob(filePath) + filePath := filepath.Join(l.sysfsDRMDir, gpuName, "lmem_total_bytes") + + dat, err := os.ReadFile(filePath) if err != nil { - klog.V(4).Info("Can't read sysfs folder", err) - return fallback(), 1 + klog.Warning("Can't read file: ", err) + return fallback() } - for _, fileName := range files { - dat, err := os.ReadFile(fileName) - if err != nil { - klog.Warning("Skipping. Can't read file: ", err) - continue - } - - n, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 0, 64) - if err != nil { - klog.Warning("Skipping. Can't convert addr_range: ", err) - continue - } - - numTiles++ - mem += n + totalPerTile, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 0, 64) + if err != nil { + klog.Warning("Can't convert lmem_total_bytes: ", err) + return fallback() } - if mem == 0 { - return fallback(), 1 + return totalPerTile*numTiles - reserved +} + +// getTileCount reads the tile count. +func (l *labeler) getTileCount(gpuName string) (numTiles uint64) { + filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*") + + files, _ := filepath.Glob(filePath) + + if len(files) == 0 { + return 1 } - return mem - reserved, numTiles + return uint64(len(files)) } // addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value. @@ -218,8 +216,11 @@ func (l *labeler) createLabels() error { return errors.Wrap(err, "gpu name parsing error") } - // read the memory amount to find a proper max allocation value - memoryAmount, numTiles := l.getTileMemoryAmount(gpuName) + // read the tile count + numTiles := l.getTileCount(gpuName) + + // read memory amount + memoryAmount := l.getMemoryAmount(gpuName, numTiles) // try to add capability labels l.createCapabilityLabels(gpuNum, numTiles) diff --git a/cmd/gpu_nfdhook/labeler_test.go b/cmd/gpu_nfdhook/labeler_test.go index bf0fe605..e732a8df 100644 --- a/cmd/gpu_nfdhook/labeler_test.go +++ b/cmd/gpu_nfdhook/labeler_test.go @@ -42,10 +42,10 @@ func getTestCases() []testcase { "card0/gt/gt0", }, sysfsfiles: map[string][]byte{ - "card0/device/vendor": []byte("0x8086"), - "card0/gt/gt0/addr_range": []byte("8086"), + "card0/device/vendor": []byte("0x8086"), + "card0/lmem_total_bytes": []byte("8086"), }, - name: "successful labeling via gt0/addr_range", + name: "successful labeling via lmem_total_bytes", memoryOverride: 16000000000, capabilityFile: map[string][]byte{ "0/i915_capabilities": []byte( @@ -88,11 +88,10 @@ func getTestCases() []testcase { "card0/gt/gt1", }, sysfsfiles: map[string][]byte{ - "card0/device/vendor": []byte("0x8086"), - "card0/gt/gt0/addr_range": []byte("8086"), - "card0/gt/gt1/addr_range": []byte("2"), + "card0/device/vendor": []byte("0x8086"), + "card0/lmem_total_bytes": []byte("8000"), }, - name: "successful labeling via gt0/addr_range and gt1/addr_range", + name: "successful labeling via card0/lmem_total_bytes and two tiles", memoryOverride: 16000000000, capabilityFile: map[string][]byte{ "0/i915_capabilities": []byte( @@ -102,7 +101,7 @@ func getTestCases() []testcase { expectedRetval: nil, expectedLabels: labelMap{ "gpu.intel.com/millicores": "1000", - "gpu.intel.com/memory.max": "8088", + "gpu.intel.com/memory.max": "16000", "gpu.intel.com/platform_new.count": "1", "gpu.intel.com/platform_new.present": "true", "gpu.intel.com/platform_new.tiles": "2", @@ -116,10 +115,10 @@ func getTestCases() []testcase { "card0/gt/gt0", }, sysfsfiles: map[string][]byte{ - "card0/device/vendor": []byte("0x8086"), - "card0/gt/gt0/addr_range": []byte("8086"), + "card0/device/vendor": []byte("0x8086"), + "card0/lmem_total_bytes": []byte("8086"), }, - name: "successful labeling via gt0/addr_range and reserved memory", + name: "successful labeling via lmem_total_bytes and reserved memory", memoryOverride: 16000000000, memoryReserved: 86, capabilityFile: map[string][]byte{ @@ -242,20 +241,25 @@ func TestLabeling(t *testing.T) { testcases := getTestCases() for _, tc := range testcases { + subroot, err := os.MkdirTemp(root, "tc") + if err != nil { + t.Fatalf("can't create temporary subroot directory: %+v", err) + } + tc := tc t.Run(tc.name, func(t *testing.T) { - err := os.MkdirAll(path.Join(root, "0"), 0750) + err := os.MkdirAll(path.Join(subroot, "0"), 0750) if err != nil { t.Fatalf("couldn't create dir: %s", err.Error()) } - sysfs := path.Join(root, sysfsDirectory) + sysfs := path.Join(subroot, sysfsDirectory) - tc.createFiles(t, sysfs, root) + tc.createFiles(t, sysfs, subroot) os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10)) os.Setenv(memoryReservedEnv, strconv.FormatUint(tc.memoryReserved, 10)) - labeler := newLabeler(sysfs, root) + labeler := newLabeler(sysfs, subroot) err = labeler.createLabels() if err != nil && tc.expectedRetval == nil || err == nil && tc.expectedRetval != nil { @@ -264,12 +268,6 @@ func TestLabeling(t *testing.T) { if tc.expectedRetval == nil && !reflect.DeepEqual(labeler.labels, tc.expectedLabels) { t.Errorf("test %v label mismatch with expectation:\n%v\n%v\n", tc.name, labeler.labels, tc.expectedLabels) } - for filename := range tc.capabilityFile { - os.Remove(path.Join(root, filename)) - } - for filename := range tc.sysfsfiles { - os.Remove(path.Join(sysfs, filename)) - } }) } }