mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00
Merge pull request #482 from uniemimu/master
gpu_nfdhook memory amount reading from sysfs
This commit is contained in:
commit
ecf98c195d
@ -11,10 +11,13 @@ types. Selected numeric labels can be turned into kubernetes extended resources
|
||||
by the NFD, allowing for finer grained resource management for GPU-using PODs.
|
||||
|
||||
In the NFD deployment, the hook requires /host-sys -folder to have the host /sys
|
||||
-folder content mounted, and /host-dev to have the host /dev/ -folder content
|
||||
-folder content mounted, and /host-dev to have the host /dev -folder content
|
||||
mounted. Write access is not necessary.
|
||||
|
||||
There is one supported environment variable named GPU_MEMORY_OVERRIDE, which is
|
||||
supposed to hold a numeric value. For systems with GPUs which do not support
|
||||
reading the GPU memory amount, the environment variable memory value is turned
|
||||
into a GPU memory amount label instead of a read value.
|
||||
GPU memory amount is read from sysfs gt/gt* files and turned into a label.
|
||||
There are two supported environment variables named GPU_MEMORY_OVERRIDE and
|
||||
GPU_MEMORY_RESERVED. Both are supposed to hold numeric values. For systems with
|
||||
older kernel drivers or GPUs which do not support reading the GPU memory
|
||||
amount, the GPU_MEMORY_OVERRIDE environment variable value is turned into a GPU
|
||||
memory amount label instead of a read value. The GPU_MEMORY_RESERVED value is
|
||||
subtracted from the GPU memory amount read from sysfs.
|
||||
|
@ -35,6 +35,7 @@ const (
|
||||
millicoreLabelName = "millicores"
|
||||
millicoresPerGPU = 1000
|
||||
memoryOverrideEnv = "GPU_MEMORY_OVERRIDE"
|
||||
memoryReservedEnv = "GPU_MEMORY_RESERVED"
|
||||
gpuDeviceRE = `^card[0-9]+$`
|
||||
controlDeviceRE = `^controlD[0-9]+$`
|
||||
vendorString = "0x8086"
|
||||
@ -43,8 +44,8 @@ const (
|
||||
type labelMap map[string]string
|
||||
|
||||
type labeler struct {
|
||||
sysfsDir string
|
||||
devfsDir string
|
||||
sysfsDRMDir string
|
||||
devfsDRIDir string
|
||||
debugfsDRIDir string
|
||||
|
||||
gpuDeviceReg *regexp.Regexp
|
||||
@ -52,10 +53,10 @@ type labeler struct {
|
||||
labels labelMap
|
||||
}
|
||||
|
||||
func newLabeler(sysfsDir, devfsDir, debugfsDRIDir string) *labeler {
|
||||
func newLabeler(sysfsDRMDir, devfsDRIDir, debugfsDRIDir string) *labeler {
|
||||
return &labeler{
|
||||
sysfsDir: sysfsDir,
|
||||
devfsDir: devfsDir,
|
||||
sysfsDRMDir: sysfsDRMDir,
|
||||
devfsDRIDir: devfsDRIDir,
|
||||
debugfsDRIDir: debugfsDRIDir,
|
||||
gpuDeviceReg: regexp.MustCompile(gpuDeviceRE),
|
||||
controlDeviceReg: regexp.MustCompile(controlDeviceRE),
|
||||
@ -64,7 +65,7 @@ func newLabeler(sysfsDir, devfsDir, debugfsDRIDir string) *labeler {
|
||||
}
|
||||
|
||||
func (l *labeler) scan() ([]string, error) {
|
||||
files, err := ioutil.ReadDir(l.sysfsDir)
|
||||
files, err := ioutil.ReadDir(l.sysfsDRMDir)
|
||||
gpuNameList := []string{}
|
||||
|
||||
if err != nil {
|
||||
@ -77,7 +78,7 @@ func (l *labeler) scan() ([]string, error) {
|
||||
continue
|
||||
}
|
||||
|
||||
dat, err := ioutil.ReadFile(path.Join(l.sysfsDir, f.Name(), "device/vendor"))
|
||||
dat, err := ioutil.ReadFile(path.Join(l.sysfsDRMDir, f.Name(), "device/vendor"))
|
||||
if err != nil {
|
||||
klog.Warning("Skipping. Can't read vendor file: ", err)
|
||||
continue
|
||||
@ -88,7 +89,7 @@ func (l *labeler) scan() ([]string, error) {
|
||||
continue
|
||||
}
|
||||
|
||||
drmFiles, err := ioutil.ReadDir(path.Join(l.sysfsDir, f.Name(), "device/drm"))
|
||||
drmFiles, err := ioutil.ReadDir(path.Join(l.sysfsDRMDir, f.Name(), "device/drm"))
|
||||
if err != nil {
|
||||
return gpuNameList, errors.Wrap(err, "Can't read device folder")
|
||||
}
|
||||
@ -98,7 +99,7 @@ func (l *labeler) scan() ([]string, error) {
|
||||
//Skipping possible drm control node
|
||||
continue
|
||||
}
|
||||
devPath := path.Join(l.devfsDir, drmFile.Name())
|
||||
devPath := path.Join(l.devfsDRIDir, drmFile.Name())
|
||||
if _, err := os.Stat(devPath); err != nil {
|
||||
continue
|
||||
}
|
||||
@ -111,11 +112,8 @@ func (l *labeler) scan() ([]string, error) {
|
||||
return gpuNameList, nil
|
||||
}
|
||||
|
||||
// getMemoryValues reads the GPU memory amount from the system.
|
||||
func (l *labeler) getMemoryAmount( /*cardNum*/ string) uint64 {
|
||||
// reading GPU local memory amount is not yet available in the driver,
|
||||
// so just return the environment variable value
|
||||
envValue := os.Getenv(memoryOverrideEnv)
|
||||
func getEnvVarNumber(envVarName string) uint64 {
|
||||
envValue := os.Getenv(envVarName)
|
||||
if envValue != "" {
|
||||
val, err := strconv.ParseUint(envValue, 10, 64)
|
||||
if err == nil {
|
||||
@ -125,6 +123,45 @@ func (l *labeler) getMemoryAmount( /*cardNum*/ string) uint64 {
|
||||
return 0
|
||||
}
|
||||
|
||||
func fallback() uint64 {
|
||||
return getEnvVarNumber(memoryOverrideEnv)
|
||||
}
|
||||
|
||||
// getMemoryAmount reads the GPU memory amount from the system.
|
||||
func (l *labeler) getMemoryAmount(gpuName string) uint64 {
|
||||
reserved := getEnvVarNumber(memoryReservedEnv)
|
||||
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*/addr_range")
|
||||
|
||||
files, err := filepath.Glob(filePath)
|
||||
if err != nil {
|
||||
klog.V(4).Info("Can't read sysfs folder", err)
|
||||
return fallback()
|
||||
}
|
||||
|
||||
mem := uint64(0)
|
||||
for _, fileName := range files {
|
||||
dat, err := ioutil.ReadFile(fileName)
|
||||
if err != nil {
|
||||
klog.Warning("Skipping. Can't read file: ", err)
|
||||
continue
|
||||
}
|
||||
|
||||
n, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 10, 64)
|
||||
if err != nil {
|
||||
klog.Warning("Skipping. Can't convert addr_range: ", err)
|
||||
continue
|
||||
}
|
||||
|
||||
mem += n
|
||||
}
|
||||
|
||||
if mem == 0 {
|
||||
return fallback()
|
||||
}
|
||||
|
||||
return mem - reserved
|
||||
}
|
||||
|
||||
// addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value.
|
||||
func (lm labelMap) addNumericLabel(labelName string, valueToAdd int64) {
|
||||
value := int64(0)
|
||||
@ -193,7 +230,7 @@ func (l *labeler) createLabels() error {
|
||||
l.createCapabilityLabels(gpuNum)
|
||||
|
||||
// read the memory amount to find a proper max allocation value
|
||||
l.labels.addNumericLabel(labelNamespace+"memory.max", int64(l.getMemoryAmount(gpuNum)))
|
||||
l.labels.addNumericLabel(labelNamespace+"memory.max", int64(l.getMemoryAmount(gpuName)))
|
||||
}
|
||||
gpuCount := len(gpuNameList)
|
||||
// add gpu list label (example: "card0.card1.card2")
|
||||
|
@ -29,6 +29,7 @@ type testcase struct {
|
||||
devfsdirs []string
|
||||
name string
|
||||
memoryOverride uint64
|
||||
memoryReserved uint64
|
||||
capabilityFile map[string][]byte
|
||||
expectedRetval error
|
||||
expectedLabels labelMap
|
||||
@ -37,6 +38,90 @@ type testcase struct {
|
||||
//nolint:funlen
|
||||
func getTestCases() []testcase {
|
||||
return []testcase{
|
||||
{
|
||||
sysfsdirs: []string{
|
||||
"card0/device/drm/card0",
|
||||
"card0/gt/gt0",
|
||||
},
|
||||
sysfsfiles: map[string][]byte{
|
||||
"card0/device/vendor": []byte("0x8086"),
|
||||
"card0/gt/gt0/addr_range": []byte("8086"),
|
||||
},
|
||||
devfsdirs: []string{"card0"},
|
||||
name: "successful labeling via gt0/addr_range",
|
||||
memoryOverride: 16000000000,
|
||||
capabilityFile: map[string][]byte{
|
||||
"0/i915_capabilities": []byte(
|
||||
"platform: new\n" +
|
||||
"gen: 9"),
|
||||
},
|
||||
expectedRetval: nil,
|
||||
expectedLabels: labelMap{
|
||||
"gpu.intel.com/millicores": "1000",
|
||||
"gpu.intel.com/memory.max": "8086",
|
||||
"gpu.intel.com/platform_new.count": "1",
|
||||
"gpu.intel.com/platform_new.present": "true",
|
||||
"gpu.intel.com/platform_gen": "9",
|
||||
"gpu.intel.com/cards": "card0",
|
||||
},
|
||||
},
|
||||
{
|
||||
sysfsdirs: []string{
|
||||
"card0/device/drm/card0",
|
||||
"card0/gt/gt0",
|
||||
"card0/gt/gt1",
|
||||
},
|
||||
sysfsfiles: map[string][]byte{
|
||||
"card0/device/vendor": []byte("0x8086"),
|
||||
"card0/gt/gt0/addr_range": []byte("8086"),
|
||||
"card0/gt/gt1/addr_range": []byte("2"),
|
||||
},
|
||||
devfsdirs: []string{"card0"},
|
||||
name: "successful labeling via gt0/addr_range and gt1/addr_range",
|
||||
memoryOverride: 16000000000,
|
||||
capabilityFile: map[string][]byte{
|
||||
"0/i915_capabilities": []byte(
|
||||
"platform: new\n" +
|
||||
"gen: 9"),
|
||||
},
|
||||
expectedRetval: nil,
|
||||
expectedLabels: labelMap{
|
||||
"gpu.intel.com/millicores": "1000",
|
||||
"gpu.intel.com/memory.max": "8088",
|
||||
"gpu.intel.com/platform_new.count": "1",
|
||||
"gpu.intel.com/platform_new.present": "true",
|
||||
"gpu.intel.com/platform_gen": "9",
|
||||
"gpu.intel.com/cards": "card0",
|
||||
},
|
||||
},
|
||||
{
|
||||
sysfsdirs: []string{
|
||||
"card0/device/drm/card0",
|
||||
"card0/gt/gt0",
|
||||
},
|
||||
sysfsfiles: map[string][]byte{
|
||||
"card0/device/vendor": []byte("0x8086"),
|
||||
"card0/gt/gt0/addr_range": []byte("8086"),
|
||||
},
|
||||
devfsdirs: []string{"card0"},
|
||||
name: "successful labeling via gt0/addr_range and reserved memory",
|
||||
memoryOverride: 16000000000,
|
||||
memoryReserved: 86,
|
||||
capabilityFile: map[string][]byte{
|
||||
"0/i915_capabilities": []byte(
|
||||
"platform: new\n" +
|
||||
"gen: 9"),
|
||||
},
|
||||
expectedRetval: nil,
|
||||
expectedLabels: labelMap{
|
||||
"gpu.intel.com/millicores": "1000",
|
||||
"gpu.intel.com/memory.max": "8000",
|
||||
"gpu.intel.com/platform_new.count": "1",
|
||||
"gpu.intel.com/platform_new.present": "true",
|
||||
"gpu.intel.com/platform_gen": "9",
|
||||
"gpu.intel.com/cards": "card0",
|
||||
},
|
||||
},
|
||||
{
|
||||
sysfsdirs: []string{
|
||||
"card0/device/drm/card0",
|
||||
@ -45,7 +130,7 @@ func getTestCases() []testcase {
|
||||
"card0/device/vendor": []byte("0x8086"),
|
||||
},
|
||||
devfsdirs: []string{"card0"},
|
||||
name: "successful labeling",
|
||||
name: "successful labeling via memory override",
|
||||
memoryOverride: 16000000000,
|
||||
capabilityFile: map[string][]byte{
|
||||
"0/i915_capabilities": []byte(
|
||||
@ -159,6 +244,7 @@ func TestLabeling(t *testing.T) {
|
||||
tc.createFiles(t, sysfs, devfs, root)
|
||||
|
||||
os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10))
|
||||
os.Setenv(memoryReservedEnv, strconv.FormatUint(tc.memoryReserved, 10))
|
||||
|
||||
labeler := newLabeler(sysfs, devfs, root)
|
||||
err = labeler.createLabels()
|
||||
@ -172,6 +258,9 @@ func TestLabeling(t *testing.T) {
|
||||
for filename := range tc.capabilityFile {
|
||||
os.Remove(path.Join(root, filename))
|
||||
}
|
||||
for filename := range tc.sysfsfiles {
|
||||
os.Remove(path.Join(sysfs, filename))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -23,13 +23,13 @@ import (
|
||||
const (
|
||||
sysfsDirectory = "/host-sys"
|
||||
devfsDirectory = "/host-dev"
|
||||
sysfsDrmDirectory = sysfsDirectory + "/class/drm"
|
||||
devfsDriDirectory = devfsDirectory + "/dri"
|
||||
sysfsDRMDirectory = sysfsDirectory + "/class/drm"
|
||||
devfsDRIDirectory = devfsDirectory + "/dri"
|
||||
debugfsDRIDirectory = sysfsDirectory + "/kernel/debug/dri"
|
||||
)
|
||||
|
||||
func main() {
|
||||
l := newLabeler(sysfsDrmDirectory, devfsDriDirectory, debugfsDRIDirectory)
|
||||
l := newLabeler(sysfsDRMDirectory, devfsDRIDirectory, debugfsDRIDirectory)
|
||||
err := l.createLabels()
|
||||
if err != nil {
|
||||
klog.Errorf("%+v", err)
|
||||
|
Loading…
Reference in New Issue
Block a user