mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00
Merge pull request #482 from uniemimu/master
gpu_nfdhook memory amount reading from sysfs
This commit is contained in:
commit
ecf98c195d
@ -11,10 +11,13 @@ types. Selected numeric labels can be turned into kubernetes extended resources
|
|||||||
by the NFD, allowing for finer grained resource management for GPU-using PODs.
|
by the NFD, allowing for finer grained resource management for GPU-using PODs.
|
||||||
|
|
||||||
In the NFD deployment, the hook requires /host-sys -folder to have the host /sys
|
In the NFD deployment, the hook requires /host-sys -folder to have the host /sys
|
||||||
-folder content mounted, and /host-dev to have the host /dev/ -folder content
|
-folder content mounted, and /host-dev to have the host /dev -folder content
|
||||||
mounted. Write access is not necessary.
|
mounted. Write access is not necessary.
|
||||||
|
|
||||||
There is one supported environment variable named GPU_MEMORY_OVERRIDE, which is
|
GPU memory amount is read from sysfs gt/gt* files and turned into a label.
|
||||||
supposed to hold a numeric value. For systems with GPUs which do not support
|
There are two supported environment variables named GPU_MEMORY_OVERRIDE and
|
||||||
reading the GPU memory amount, the environment variable memory value is turned
|
GPU_MEMORY_RESERVED. Both are supposed to hold numeric values. For systems with
|
||||||
into a GPU memory amount label instead of a read value.
|
older kernel drivers or GPUs which do not support reading the GPU memory
|
||||||
|
amount, the GPU_MEMORY_OVERRIDE environment variable value is turned into a GPU
|
||||||
|
memory amount label instead of a read value. GPU_MEMORY_RESERVED value will be
|
||||||
|
subtracted from the GPU memory amount read from sysfs.
|
||||||
|
@ -35,6 +35,7 @@ const (
|
|||||||
millicoreLabelName = "millicores"
|
millicoreLabelName = "millicores"
|
||||||
millicoresPerGPU = 1000
|
millicoresPerGPU = 1000
|
||||||
memoryOverrideEnv = "GPU_MEMORY_OVERRIDE"
|
memoryOverrideEnv = "GPU_MEMORY_OVERRIDE"
|
||||||
|
memoryReservedEnv = "GPU_MEMORY_RESERVED"
|
||||||
gpuDeviceRE = `^card[0-9]+$`
|
gpuDeviceRE = `^card[0-9]+$`
|
||||||
controlDeviceRE = `^controlD[0-9]+$`
|
controlDeviceRE = `^controlD[0-9]+$`
|
||||||
vendorString = "0x8086"
|
vendorString = "0x8086"
|
||||||
@ -43,8 +44,8 @@ const (
|
|||||||
type labelMap map[string]string
|
type labelMap map[string]string
|
||||||
|
|
||||||
type labeler struct {
|
type labeler struct {
|
||||||
sysfsDir string
|
sysfsDRMDir string
|
||||||
devfsDir string
|
devfsDRIDir string
|
||||||
debugfsDRIDir string
|
debugfsDRIDir string
|
||||||
|
|
||||||
gpuDeviceReg *regexp.Regexp
|
gpuDeviceReg *regexp.Regexp
|
||||||
@ -52,10 +53,10 @@ type labeler struct {
|
|||||||
labels labelMap
|
labels labelMap
|
||||||
}
|
}
|
||||||
|
|
||||||
func newLabeler(sysfsDir, devfsDir, debugfsDRIDir string) *labeler {
|
func newLabeler(sysfsDRMDir, devfsDRIDir, debugfsDRIDir string) *labeler {
|
||||||
return &labeler{
|
return &labeler{
|
||||||
sysfsDir: sysfsDir,
|
sysfsDRMDir: sysfsDRMDir,
|
||||||
devfsDir: devfsDir,
|
devfsDRIDir: devfsDRIDir,
|
||||||
debugfsDRIDir: debugfsDRIDir,
|
debugfsDRIDir: debugfsDRIDir,
|
||||||
gpuDeviceReg: regexp.MustCompile(gpuDeviceRE),
|
gpuDeviceReg: regexp.MustCompile(gpuDeviceRE),
|
||||||
controlDeviceReg: regexp.MustCompile(controlDeviceRE),
|
controlDeviceReg: regexp.MustCompile(controlDeviceRE),
|
||||||
@ -64,7 +65,7 @@ func newLabeler(sysfsDir, devfsDir, debugfsDRIDir string) *labeler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (l *labeler) scan() ([]string, error) {
|
func (l *labeler) scan() ([]string, error) {
|
||||||
files, err := ioutil.ReadDir(l.sysfsDir)
|
files, err := ioutil.ReadDir(l.sysfsDRMDir)
|
||||||
gpuNameList := []string{}
|
gpuNameList := []string{}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -77,7 +78,7 @@ func (l *labeler) scan() ([]string, error) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
dat, err := ioutil.ReadFile(path.Join(l.sysfsDir, f.Name(), "device/vendor"))
|
dat, err := ioutil.ReadFile(path.Join(l.sysfsDRMDir, f.Name(), "device/vendor"))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
klog.Warning("Skipping. Can't read vendor file: ", err)
|
klog.Warning("Skipping. Can't read vendor file: ", err)
|
||||||
continue
|
continue
|
||||||
@ -88,7 +89,7 @@ func (l *labeler) scan() ([]string, error) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
drmFiles, err := ioutil.ReadDir(path.Join(l.sysfsDir, f.Name(), "device/drm"))
|
drmFiles, err := ioutil.ReadDir(path.Join(l.sysfsDRMDir, f.Name(), "device/drm"))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return gpuNameList, errors.Wrap(err, "Can't read device folder")
|
return gpuNameList, errors.Wrap(err, "Can't read device folder")
|
||||||
}
|
}
|
||||||
@ -98,7 +99,7 @@ func (l *labeler) scan() ([]string, error) {
|
|||||||
//Skipping possible drm control node
|
//Skipping possible drm control node
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
devPath := path.Join(l.devfsDir, drmFile.Name())
|
devPath := path.Join(l.devfsDRIDir, drmFile.Name())
|
||||||
if _, err := os.Stat(devPath); err != nil {
|
if _, err := os.Stat(devPath); err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@ -111,11 +112,8 @@ func (l *labeler) scan() ([]string, error) {
|
|||||||
return gpuNameList, nil
|
return gpuNameList, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// getMemoryValues reads the GPU memory amount from the system.
|
func getEnvVarNumber(envVarName string) uint64 {
|
||||||
func (l *labeler) getMemoryAmount( /*cardNum*/ string) uint64 {
|
envValue := os.Getenv(envVarName)
|
||||||
// reading GPU local memory amount is not yet available in the driver,
|
|
||||||
// so just return the environment variable value
|
|
||||||
envValue := os.Getenv(memoryOverrideEnv)
|
|
||||||
if envValue != "" {
|
if envValue != "" {
|
||||||
val, err := strconv.ParseUint(envValue, 10, 64)
|
val, err := strconv.ParseUint(envValue, 10, 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@ -125,6 +123,45 @@ func (l *labeler) getMemoryAmount( /*cardNum*/ string) uint64 {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func fallback() uint64 {
|
||||||
|
return getEnvVarNumber(memoryOverrideEnv)
|
||||||
|
}
|
||||||
|
|
||||||
|
// getMemoryAmount reads the GPU memory amount from the system.
|
||||||
|
func (l *labeler) getMemoryAmount(gpuName string) uint64 {
|
||||||
|
reserved := getEnvVarNumber(memoryReservedEnv)
|
||||||
|
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*/addr_range")
|
||||||
|
|
||||||
|
files, err := filepath.Glob(filePath)
|
||||||
|
if err != nil {
|
||||||
|
klog.V(4).Info("Can't read sysfs folder", err)
|
||||||
|
return fallback()
|
||||||
|
}
|
||||||
|
|
||||||
|
mem := uint64(0)
|
||||||
|
for _, fileName := range files {
|
||||||
|
dat, err := ioutil.ReadFile(fileName)
|
||||||
|
if err != nil {
|
||||||
|
klog.Warning("Skipping. Can't read file: ", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
n, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
klog.Warning("Skipping. Can't convert addr_range: ", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
mem += n
|
||||||
|
}
|
||||||
|
|
||||||
|
if mem == 0 {
|
||||||
|
return fallback()
|
||||||
|
}
|
||||||
|
|
||||||
|
return mem - reserved
|
||||||
|
}
|
||||||
|
|
||||||
// addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value.
|
// addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value.
|
||||||
func (lm labelMap) addNumericLabel(labelName string, valueToAdd int64) {
|
func (lm labelMap) addNumericLabel(labelName string, valueToAdd int64) {
|
||||||
value := int64(0)
|
value := int64(0)
|
||||||
@ -193,7 +230,7 @@ func (l *labeler) createLabels() error {
|
|||||||
l.createCapabilityLabels(gpuNum)
|
l.createCapabilityLabels(gpuNum)
|
||||||
|
|
||||||
// read the memory amount to find a proper max allocation value
|
// read the memory amount to find a proper max allocation value
|
||||||
l.labels.addNumericLabel(labelNamespace+"memory.max", int64(l.getMemoryAmount(gpuNum)))
|
l.labels.addNumericLabel(labelNamespace+"memory.max", int64(l.getMemoryAmount(gpuName)))
|
||||||
}
|
}
|
||||||
gpuCount := len(gpuNameList)
|
gpuCount := len(gpuNameList)
|
||||||
// add gpu list label (example: "card0.card1.card2")
|
// add gpu list label (example: "card0.card1.card2")
|
||||||
|
@ -29,6 +29,7 @@ type testcase struct {
|
|||||||
devfsdirs []string
|
devfsdirs []string
|
||||||
name string
|
name string
|
||||||
memoryOverride uint64
|
memoryOverride uint64
|
||||||
|
memoryReserved uint64
|
||||||
capabilityFile map[string][]byte
|
capabilityFile map[string][]byte
|
||||||
expectedRetval error
|
expectedRetval error
|
||||||
expectedLabels labelMap
|
expectedLabels labelMap
|
||||||
@ -37,6 +38,90 @@ type testcase struct {
|
|||||||
//nolint:funlen
|
//nolint:funlen
|
||||||
func getTestCases() []testcase {
|
func getTestCases() []testcase {
|
||||||
return []testcase{
|
return []testcase{
|
||||||
|
{
|
||||||
|
sysfsdirs: []string{
|
||||||
|
"card0/device/drm/card0",
|
||||||
|
"card0/gt/gt0",
|
||||||
|
},
|
||||||
|
sysfsfiles: map[string][]byte{
|
||||||
|
"card0/device/vendor": []byte("0x8086"),
|
||||||
|
"card0/gt/gt0/addr_range": []byte("8086"),
|
||||||
|
},
|
||||||
|
devfsdirs: []string{"card0"},
|
||||||
|
name: "successful labeling via gt0/addr_range",
|
||||||
|
memoryOverride: 16000000000,
|
||||||
|
capabilityFile: map[string][]byte{
|
||||||
|
"0/i915_capabilities": []byte(
|
||||||
|
"platform: new\n" +
|
||||||
|
"gen: 9"),
|
||||||
|
},
|
||||||
|
expectedRetval: nil,
|
||||||
|
expectedLabels: labelMap{
|
||||||
|
"gpu.intel.com/millicores": "1000",
|
||||||
|
"gpu.intel.com/memory.max": "8086",
|
||||||
|
"gpu.intel.com/platform_new.count": "1",
|
||||||
|
"gpu.intel.com/platform_new.present": "true",
|
||||||
|
"gpu.intel.com/platform_gen": "9",
|
||||||
|
"gpu.intel.com/cards": "card0",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
sysfsdirs: []string{
|
||||||
|
"card0/device/drm/card0",
|
||||||
|
"card0/gt/gt0",
|
||||||
|
"card0/gt/gt1",
|
||||||
|
},
|
||||||
|
sysfsfiles: map[string][]byte{
|
||||||
|
"card0/device/vendor": []byte("0x8086"),
|
||||||
|
"card0/gt/gt0/addr_range": []byte("8086"),
|
||||||
|
"card0/gt/gt1/addr_range": []byte("2"),
|
||||||
|
},
|
||||||
|
devfsdirs: []string{"card0"},
|
||||||
|
name: "successful labeling via gt0/addr_range and gt1/addr_range",
|
||||||
|
memoryOverride: 16000000000,
|
||||||
|
capabilityFile: map[string][]byte{
|
||||||
|
"0/i915_capabilities": []byte(
|
||||||
|
"platform: new\n" +
|
||||||
|
"gen: 9"),
|
||||||
|
},
|
||||||
|
expectedRetval: nil,
|
||||||
|
expectedLabels: labelMap{
|
||||||
|
"gpu.intel.com/millicores": "1000",
|
||||||
|
"gpu.intel.com/memory.max": "8088",
|
||||||
|
"gpu.intel.com/platform_new.count": "1",
|
||||||
|
"gpu.intel.com/platform_new.present": "true",
|
||||||
|
"gpu.intel.com/platform_gen": "9",
|
||||||
|
"gpu.intel.com/cards": "card0",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
sysfsdirs: []string{
|
||||||
|
"card0/device/drm/card0",
|
||||||
|
"card0/gt/gt0",
|
||||||
|
},
|
||||||
|
sysfsfiles: map[string][]byte{
|
||||||
|
"card0/device/vendor": []byte("0x8086"),
|
||||||
|
"card0/gt/gt0/addr_range": []byte("8086"),
|
||||||
|
},
|
||||||
|
devfsdirs: []string{"card0"},
|
||||||
|
name: "successful labeling via gt0/addr_range and reserved memory",
|
||||||
|
memoryOverride: 16000000000,
|
||||||
|
memoryReserved: 86,
|
||||||
|
capabilityFile: map[string][]byte{
|
||||||
|
"0/i915_capabilities": []byte(
|
||||||
|
"platform: new\n" +
|
||||||
|
"gen: 9"),
|
||||||
|
},
|
||||||
|
expectedRetval: nil,
|
||||||
|
expectedLabels: labelMap{
|
||||||
|
"gpu.intel.com/millicores": "1000",
|
||||||
|
"gpu.intel.com/memory.max": "8000",
|
||||||
|
"gpu.intel.com/platform_new.count": "1",
|
||||||
|
"gpu.intel.com/platform_new.present": "true",
|
||||||
|
"gpu.intel.com/platform_gen": "9",
|
||||||
|
"gpu.intel.com/cards": "card0",
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
sysfsdirs: []string{
|
sysfsdirs: []string{
|
||||||
"card0/device/drm/card0",
|
"card0/device/drm/card0",
|
||||||
@ -45,7 +130,7 @@ func getTestCases() []testcase {
|
|||||||
"card0/device/vendor": []byte("0x8086"),
|
"card0/device/vendor": []byte("0x8086"),
|
||||||
},
|
},
|
||||||
devfsdirs: []string{"card0"},
|
devfsdirs: []string{"card0"},
|
||||||
name: "successful labeling",
|
name: "successful labeling via memory override",
|
||||||
memoryOverride: 16000000000,
|
memoryOverride: 16000000000,
|
||||||
capabilityFile: map[string][]byte{
|
capabilityFile: map[string][]byte{
|
||||||
"0/i915_capabilities": []byte(
|
"0/i915_capabilities": []byte(
|
||||||
@ -159,6 +244,7 @@ func TestLabeling(t *testing.T) {
|
|||||||
tc.createFiles(t, sysfs, devfs, root)
|
tc.createFiles(t, sysfs, devfs, root)
|
||||||
|
|
||||||
os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10))
|
os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10))
|
||||||
|
os.Setenv(memoryReservedEnv, strconv.FormatUint(tc.memoryReserved, 10))
|
||||||
|
|
||||||
labeler := newLabeler(sysfs, devfs, root)
|
labeler := newLabeler(sysfs, devfs, root)
|
||||||
err = labeler.createLabels()
|
err = labeler.createLabels()
|
||||||
@ -172,6 +258,9 @@ func TestLabeling(t *testing.T) {
|
|||||||
for filename := range tc.capabilityFile {
|
for filename := range tc.capabilityFile {
|
||||||
os.Remove(path.Join(root, filename))
|
os.Remove(path.Join(root, filename))
|
||||||
}
|
}
|
||||||
|
for filename := range tc.sysfsfiles {
|
||||||
|
os.Remove(path.Join(sysfs, filename))
|
||||||
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -23,13 +23,13 @@ import (
|
|||||||
const (
|
const (
|
||||||
sysfsDirectory = "/host-sys"
|
sysfsDirectory = "/host-sys"
|
||||||
devfsDirectory = "/host-dev"
|
devfsDirectory = "/host-dev"
|
||||||
sysfsDrmDirectory = sysfsDirectory + "/class/drm"
|
sysfsDRMDirectory = sysfsDirectory + "/class/drm"
|
||||||
devfsDriDirectory = devfsDirectory + "/dri"
|
devfsDRIDirectory = devfsDirectory + "/dri"
|
||||||
debugfsDRIDirectory = sysfsDirectory + "/kernel/debug/dri"
|
debugfsDRIDirectory = sysfsDirectory + "/kernel/debug/dri"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
l := newLabeler(sysfsDrmDirectory, devfsDriDirectory, debugfsDRIDirectory)
|
l := newLabeler(sysfsDRMDirectory, devfsDRIDirectory, debugfsDRIDirectory)
|
||||||
err := l.createLabels()
|
err := l.createLabels()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
klog.Errorf("%+v", err)
|
klog.Errorf("%+v", err)
|
||||||
|
Loading…
Reference in New Issue
Block a user