Mirror of https://github.com/intel/intel-device-plugins-for-kubernetes.git
gpu: add levelzero sidecar support for plugin and the deployment files
In addition to using the Level-Zero health data, this adds support for scanning devices in WSL. The scan retrieves Intel device indices from the Level-Zero API.

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
Parent: 2df9443fda
Commit: 518a8606ff
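For orientation, the sketch below (not part of the commit) shows how the plugin is expected to use the new Level-Zero sidecar client for WSL scanning: it creates the client with NewLevelzero, keeps the connection alive with Run(true), and reads the Intel device indices that the plugin turns into "dxg" resources. The short sleep is a crude stand-in for the plugin's scan-ticker retry loop; all other names come from this commit.

package main

import (
	"fmt"
	"time"

	"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice"
	gpulevelzero "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero"
)

func main() {
	// Create the gRPC client for the levelzero sidecar and keep the
	// connection alive in the background.
	svc := levelzeroservice.NewLevelzero(gpulevelzero.DefaultUnixSocketPath)
	go svc.Run(true)

	// Stand-in for the plugin's retry loop: give the sidecar connection
	// a moment to become ready before asking for indices.
	time.Sleep(2 * time.Second)

	// In WSL mode each returned index becomes a shared "dxg" device and is
	// passed to containers via the ZE_AFFINITY_MASK environment variable.
	indices, err := svc.GetIntelIndices()
	if err != nil {
		fmt.Println("Level-Zero sidecar not ready:", err)
		return
	}

	fmt.Println("Intel Level-Zero device indices:", indices)
}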
@ -23,6 +23,7 @@ import (
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@ -31,8 +32,10 @@ import (
|
||||
"k8s.io/klog/v2"
|
||||
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
|
||||
|
||||
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice"
|
||||
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm"
|
||||
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler"
|
||||
gpulevelzero "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero"
|
||||
dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
|
||||
cdispec "tags.cncf.io/container-device-interface/specs-go"
|
||||
)
|
||||
@ -40,6 +43,8 @@ import (
|
||||
const (
|
||||
sysfsDrmDirectory = "/sys/class/drm"
|
||||
devfsDriDirectory = "/dev/dri"
|
||||
wslDxgPath = "/dev/dxg"
|
||||
wslLibPath = "/usr/lib/wsl"
|
||||
nfdFeatureDir = "/etc/kubernetes/node-feature-discovery/features.d"
|
||||
resourceFilename = "intel-gpu-resources.txt"
|
||||
gpuDeviceRE = `^card[0-9]+$`
|
||||
@ -51,6 +56,7 @@ const (
|
||||
namespace = "gpu.intel.com"
|
||||
deviceTypeI915 = "i915"
|
||||
deviceTypeXe = "xe"
|
||||
deviceTypeDxg = "dxg"
|
||||
deviceTypeDefault = deviceTypeI915
|
||||
|
||||
// telemetry resource settings.
|
||||
@ -67,8 +73,11 @@ const (
|
||||
type cliOptions struct {
|
||||
preferredAllocationPolicy string
|
||||
sharedDevNum int
|
||||
temperatureLimit int
|
||||
enableMonitoring bool
|
||||
resourceManagement bool
|
||||
wslScan bool
|
||||
healthManagement bool
|
||||
}
|
||||
|
||||
type rmWithMultipleDriversErr struct {
|
||||
@ -274,11 +283,13 @@ type devicePlugin struct {
|
||||
scanDone chan bool
|
||||
scanResources chan bool
|
||||
|
||||
resMan rm.ResourceManager
|
||||
resMan rm.ResourceManager
|
||||
levelzeroService levelzeroservice.LevelzeroService
|
||||
|
||||
sysfsDir string
|
||||
devfsDir string
|
||||
bypathDir string
|
||||
sysfsDir string
|
||||
devfsDir string
|
||||
bypathDir string
|
||||
healthStatuses map[string]string
|
||||
|
||||
// Note: If restarting the plugin with a new policy, the allocations for existing pods remain with old policy.
|
||||
policy preferredAllocationPolicyFunc
|
||||
@ -300,6 +311,7 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi
|
||||
scanDone: make(chan bool, 1), // buffered as we may send to it before Scan starts receiving from it
|
||||
bypathFound: true,
|
||||
scanResources: make(chan bool, 1),
|
||||
healthStatuses: make(map[string]string),
|
||||
}
|
||||
|
||||
if options.resourceManagement {
|
||||
@ -325,15 +337,85 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi
|
||||
dp.policy = nonePolicy
|
||||
}
|
||||
|
||||
if _, err := os.ReadDir(dp.bypathDir); err != nil {
|
||||
klog.Warningf("failed to read by-path dir: %+v", err)
|
||||
if !options.wslScan {
|
||||
if _, err := os.ReadDir(dp.bypathDir); err != nil {
|
||||
klog.Warningf("failed to read by-path dir: %+v", err)
|
||||
|
||||
dp.bypathFound = false
|
||||
dp.bypathFound = false
|
||||
}
|
||||
}
|
||||
|
||||
return dp
|
||||
}
|
||||
|
||||
func logHealthStatusChange(card, newStatus string, statuses map[string]string) {
|
||||
prevState, found := statuses[card]
|
||||
if !found {
|
||||
klog.V(2).Infof("%s: new => %s", card, newStatus)
|
||||
|
||||
statuses[card] = newStatus
|
||||
} else if prevState != newStatus {
|
||||
klog.V(2).Infof("%s: %s => %s", card, prevState, newStatus)
|
||||
|
||||
statuses[card] = newStatus
|
||||
}
|
||||
}
|
||||
|
||||
func (dp *devicePlugin) healthStatusForCard(cardPath string) string {
|
||||
if dp.levelzeroService == nil {
|
||||
return pluginapi.Healthy
|
||||
}
|
||||
|
||||
link, err := os.Readlink(filepath.Join(cardPath, "device"))
|
||||
if err != nil {
|
||||
klog.Warning("couldn't read device link for", cardPath)
|
||||
|
||||
return pluginapi.Healthy
|
||||
}
|
||||
|
||||
health := pluginapi.Healthy
|
||||
|
||||
// Check status changes after the function exits
|
||||
defer func() { logHealthStatusChange(cardPath, health, dp.healthStatuses) }()
|
||||
|
||||
bdfAddr := filepath.Base(link)
|
||||
|
||||
dh, err := dp.levelzeroService.GetDeviceHealth(bdfAddr)
|
||||
if err != nil {
|
||||
klog.Warningf("Device health retrieval failed: %v", err)
|
||||
|
||||
return health
|
||||
}
|
||||
|
||||
// Direct Health indicators
|
||||
klog.V(4).Infof("Health indicators: Memory=%t, Bus=%t, SoC=%t", dh.Memory, dh.Bus, dh.SoC)
|
||||
|
||||
if !dh.Memory || !dh.Bus || !dh.SoC {
|
||||
health = pluginapi.Unhealthy
|
||||
|
||||
return health
|
||||
}
|
||||
|
||||
dt, err := dp.levelzeroService.GetDeviceTemperature(bdfAddr)
|
||||
// In case of any errors, return the current health status
|
||||
if err != nil {
|
||||
klog.Warningf("Device temperature retrieval failed: %v", err)
|
||||
|
||||
return health
|
||||
}
|
||||
|
||||
limit := float64(dp.options.temperatureLimit)
|
||||
|
||||
// Temperatures for different areas
|
||||
klog.V(4).Infof("Temperatures: Memory=%.1fC, GPU=%.1fC, Global=%.1fC", dh.MemoryTemperature, dh.GPUTemperature, dh.GlobalTemperature)
|
||||
|
||||
if dt.GPU > limit || dt.Global > limit || dt.Memory > limit {
|
||||
health = pluginapi.Unhealthy
|
||||
}
|
||||
|
||||
return health
|
||||
}
|
||||
|
||||
// Implement the PreferredAllocator interface.
|
||||
func (dp *devicePlugin) GetPreferredAllocation(rqt *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
|
||||
if dp.resMan != nil {
|
||||
@ -369,6 +451,68 @@ func (dp *devicePlugin) GetPreferredAllocation(rqt *pluginapi.PreferredAllocatio
|
||||
}
|
||||
|
||||
func (dp *devicePlugin) Scan(notifier dpapi.Notifier) error {
|
||||
if dp.options.wslScan {
|
||||
return dp.wslGpuScan(notifier)
|
||||
} else {
|
||||
return dp.sysFsGpuScan(notifier)
|
||||
}
|
||||
}
|
||||
|
||||
func (dp *devicePlugin) wslGpuScan(notifier dpapi.Notifier) error {
|
||||
defer dp.scanTicker.Stop()
|
||||
|
||||
klog.V(1).Infof("GPU (%s) resource share count = %d", deviceTypeDxg, dp.options.sharedDevNum)
|
||||
|
||||
devSpecs := []pluginapi.DeviceSpec{
|
||||
{
|
||||
HostPath: wslDxgPath,
|
||||
ContainerPath: wslDxgPath,
|
||||
Permissions: "rw",
|
||||
},
|
||||
}
|
||||
|
||||
mounts := []pluginapi.Mount{
|
||||
{
|
||||
ContainerPath: wslLibPath,
|
||||
HostPath: wslLibPath,
|
||||
ReadOnly: true,
|
||||
},
|
||||
}
|
||||
|
||||
for {
|
||||
indices, err := dp.levelzeroService.GetIntelIndices()
|
||||
if err == nil {
|
||||
klog.V(4).Info("Intel Level-Zero indices: ", indices)
|
||||
|
||||
devTree := dpapi.NewDeviceTree()
|
||||
|
||||
for _, index := range indices {
|
||||
envs := map[string]string{
|
||||
rm.LevelzeroAffinityMaskEnvVar: strconv.Itoa(int(index)),
|
||||
}
|
||||
|
||||
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, envs, nil, nil)
|
||||
|
||||
for i := 0; i < dp.options.sharedDevNum; i++ {
|
||||
devID := fmt.Sprintf("card%d-%d", index, i)
|
||||
devTree.AddDevice(deviceTypeDxg, devID, deviceInfo)
|
||||
}
|
||||
}
|
||||
|
||||
notifier.Notify(devTree)
|
||||
} else {
|
||||
klog.Warning("Failed to get Intel indices from Level-Zero")
|
||||
}
|
||||
|
||||
select {
|
||||
case <-dp.scanDone:
|
||||
return nil
|
||||
case <-dp.scanTicker.C:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (dp *devicePlugin) sysFsGpuScan(notifier dpapi.Notifier) error {
|
||||
defer dp.scanTicker.Stop()
|
||||
|
||||
klog.V(1).Infof("GPU (%s/%s) resource share count = %d", deviceTypeI915, deviceTypeXe, dp.options.sharedDevNum)
|
||||
@ -566,7 +710,9 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
|
||||
|
||||
mounts, cdiDevices := dp.createMountsAndCDIDevices(cardPath, name, devSpecs)
|
||||
|
||||
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, nil, nil, cdiDevices)
|
||||
health := dp.healthStatusForCard(cardPath)
|
||||
|
||||
deviceInfo := dpapi.NewDeviceInfo(health, devSpecs, mounts, nil, nil, cdiDevices)
|
||||
|
||||
for i := 0; i < dp.options.sharedDevNum; i++ {
|
||||
devID := fmt.Sprintf("%s-%d", name, i)
|
||||
@ -627,7 +773,10 @@ func main() {
|
||||
flag.StringVar(&prefix, "prefix", "", "Prefix for devfs & sysfs paths")
|
||||
flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable '*_monitoring' (= all GPUs) resource")
|
||||
flag.BoolVar(&opts.resourceManagement, "resource-manager", false, "fractional GPU resource management")
|
||||
flag.BoolVar(&opts.healthManagement, "health-management", false, "enable GPU health management")
|
||||
flag.BoolVar(&opts.wslScan, "wsl", false, "scan for / use WSL devices")
|
||||
flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device")
|
||||
flag.IntVar(&opts.temperatureLimit, "temp-limit", 100, "temperature limit at which device is marked unhealthy")
|
||||
flag.StringVar(&opts.preferredAllocationPolicy, "allocation-policy", "none", "modes of allocating GPU devices: balanced, packed and none")
|
||||
flag.Parse()
|
||||
|
||||
@ -651,6 +800,34 @@ func main() {
|
||||
|
||||
plugin := newDevicePlugin(prefix+sysfsDrmDirectory, prefix+devfsDriDirectory, opts)
|
||||
|
||||
if plugin.options.wslScan {
|
||||
klog.Info("WSL mode requested")
|
||||
|
||||
if plugin.options.resourceManagement {
|
||||
klog.Error("Resource management is not supported within WSL. Please disable resource management.")
|
||||
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if plugin.options.enableMonitoring {
|
||||
klog.Error("Monitoring is not supported within WSL. Please disable monitoring.")
|
||||
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if plugin.options.healthManagement {
|
||||
klog.Error("Health management is not supported within WSL. Please disable health management.")
|
||||
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
if plugin.options.healthManagement || plugin.options.wslScan {
|
||||
plugin.levelzeroService = levelzeroservice.NewLevelzero(gpulevelzero.DefaultUnixSocketPath)
|
||||
|
||||
go plugin.levelzeroService.Run(true)
|
||||
}
|
||||
|
||||
if plugin.options.resourceManagement {
|
||||
// Start labeler to export labels file for NFD.
|
||||
nfdFeatureFile := path.Join(nfdFeatureDir, resourceFilename)
|
||||
@ -659,7 +836,10 @@ func main() {
|
||||
|
||||
// Labeler catches OS signals and calls os.Exit() after receiving any.
|
||||
go labeler.Run(prefix+sysfsDrmDirectory, nfdFeatureFile,
|
||||
labelerMaxInterval, plugin.scanResources)
|
||||
labelerMaxInterval, plugin.scanResources, plugin.levelzeroService, func() {
|
||||
// Exit the whole app when labeler exits
|
||||
os.Exit(0)
|
||||
})
|
||||
}
|
||||
|
||||
manager := dpapi.NewManager(namespace, plugin)
|
||||
|
@ -27,6 +27,7 @@ import (
|
||||
"k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
|
||||
"k8s.io/utils/strings/slices"
|
||||
|
||||
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice"
|
||||
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm"
|
||||
dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
|
||||
cdispec "tags.cncf.io/container-device-interface/specs-go"
|
||||
@ -41,6 +42,7 @@ type mockNotifier struct {
|
||||
scanDone chan bool
|
||||
i915Count int
|
||||
xeCount int
|
||||
dxgCount int
|
||||
i915monitorCount int
|
||||
xeMonitorCount int
|
||||
}
|
||||
@ -50,6 +52,7 @@ func (n *mockNotifier) Notify(newDeviceTree dpapi.DeviceTree) {
|
||||
n.xeCount = len(newDeviceTree[deviceTypeXe])
|
||||
n.xeMonitorCount = len(newDeviceTree[deviceTypeXe+monitorSuffix])
|
||||
n.i915Count = len(newDeviceTree[deviceTypeI915])
|
||||
n.dxgCount = len(newDeviceTree[deviceTypeDxg])
|
||||
n.i915monitorCount = len(newDeviceTree[deviceTypeDefault+monitorSuffix])
|
||||
|
||||
n.scanDone <- true
|
||||
@ -72,18 +75,63 @@ func (m *mockResourceManager) SetTileCountPerCard(count uint64) {
|
||||
m.tileCount = count
|
||||
}
|
||||
|
||||
type mockL0Service struct {
|
||||
indices []uint32
|
||||
memSize uint64
|
||||
healthy bool
|
||||
fail bool
|
||||
}
|
||||
|
||||
func (m *mockL0Service) Run(keep bool) {
|
||||
}
|
||||
func (m *mockL0Service) Stop() {
|
||||
}
|
||||
func (m *mockL0Service) GetIntelIndices() ([]uint32, error) {
|
||||
if m.fail {
|
||||
return m.indices, errors.Errorf("error, error")
|
||||
}
|
||||
|
||||
return m.indices, nil
|
||||
}
|
||||
func (m *mockL0Service) GetDeviceHealth(bdfAddress string) (levelzeroservice.DeviceHealth, error) {
|
||||
if m.fail {
|
||||
return levelzeroservice.DeviceHealth{}, errors.Errorf("error, error")
|
||||
}
|
||||
|
||||
return levelzeroservice.DeviceHealth{Memory: m.healthy, Bus: m.healthy, SoC: m.healthy}, nil
|
||||
}
|
||||
func (m *mockL0Service) GetDeviceTemperature(bdfAddress string) (levelzeroservice.DeviceTemperature, error) {
|
||||
if m.fail {
|
||||
return levelzeroservice.DeviceTemperature{}, errors.Errorf("error, error")
|
||||
}
|
||||
|
||||
return levelzeroservice.DeviceTemperature{Global: 35.0, GPU: 35.0, Memory: 35.0}, nil
|
||||
}
|
||||
func (m *mockL0Service) GetDeviceMemoryAmount(bdfAddress string) (uint64, error) {
|
||||
if m.fail {
|
||||
return m.memSize, errors.Errorf("error, error")
|
||||
}
|
||||
|
||||
return m.memSize, nil
|
||||
}
|
||||
|
||||
type TestCaseDetails struct {
|
||||
name string
|
||||
// possible mock l0 service
|
||||
l0mock levelzeroservice.LevelzeroService
|
||||
// test-case environment
|
||||
sysfsdirs []string
|
||||
pciAddresses map[string]string
|
||||
sysfsfiles map[string][]byte
|
||||
symlinkfiles map[string]string
|
||||
name string
|
||||
sysfsdirs []string
|
||||
devfsdirs []string
|
||||
// how plugin should interpret it
|
||||
options cliOptions
|
||||
// what the result should be (i915)
|
||||
expectedI915Devs int
|
||||
expectedI915Monitors int
|
||||
// what the result should be (dxg)
|
||||
expectedDxgDevs int
|
||||
// what the result should be (xe)
|
||||
expectedXeDevs int
|
||||
expectedXeMonitors int
|
||||
@ -99,6 +147,33 @@ func createTestFiles(root string, tc TestCaseDetails) (string, string, error) {
|
||||
}
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(sysfs, 0750); err != nil {
|
||||
return "", "", errors.Wrap(err, "Failed to create fake base sysfs directory")
|
||||
}
|
||||
|
||||
if len(tc.pciAddresses) > 0 {
|
||||
if err := os.MkdirAll(filepath.Join(sysfs, ".devices"), 0750); err != nil {
|
||||
return "", "", errors.Wrap(err, "Failed to create fake pci address base")
|
||||
}
|
||||
|
||||
for pci, card := range tc.pciAddresses {
|
||||
fullPci := filepath.Join(sysfs, ".devices", pci)
|
||||
cardPath := filepath.Join(sysfs, card)
|
||||
|
||||
if err := os.MkdirAll(fullPci, 0750); err != nil {
|
||||
return "", "", errors.Wrap(err, "Failed to create fake pci address entry")
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(cardPath, 0750); err != nil {
|
||||
return "", "", errors.Wrap(err, "Failed to create fake card entry")
|
||||
}
|
||||
|
||||
if err := os.Symlink(fullPci, filepath.Join(sysfs, card, "device")); err != nil {
|
||||
return "", "", errors.Wrap(err, "Failed to create fake pci address symlinks")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, sysfsdir := range tc.sysfsdirs {
|
||||
if err := os.MkdirAll(path.Join(sysfs, sysfsdir), 0750); err != nil {
|
||||
return "", "", errors.Wrap(err, "Failed to create fake device directory")
|
||||
@ -444,6 +519,176 @@ func TestScan(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestScanWithHealth(t *testing.T) {
|
||||
tcases := []TestCaseDetails{
|
||||
{
|
||||
name: "one device with no symlink",
|
||||
sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
|
||||
sysfsfiles: map[string][]byte{
|
||||
"card0/device/vendor": []byte("0x8086"),
|
||||
},
|
||||
devfsdirs: []string{
|
||||
"card0",
|
||||
"by-path/pci-0000:00:00.0-card",
|
||||
"by-path/pci-0000:00:00.0-render",
|
||||
},
|
||||
expectedI915Devs: 1,
|
||||
},
|
||||
{
|
||||
name: "one device with proper symlink",
|
||||
pciAddresses: map[string]string{"0000:00:00.0": "card0"},
|
||||
sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
|
||||
sysfsfiles: map[string][]byte{
|
||||
"card0/device/vendor": []byte("0x8086"),
|
||||
},
|
||||
devfsdirs: []string{
|
||||
"card0",
|
||||
"by-path/pci-0000:00:00.0-card",
|
||||
"by-path/pci-0000:00:00.0-render",
|
||||
},
|
||||
expectedI915Devs: 1,
|
||||
l0mock: &mockL0Service{
|
||||
healthy: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "one unhealthy device with proper symlink",
|
||||
pciAddresses: map[string]string{"0000:00:00.0": "card0"},
|
||||
sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
|
||||
sysfsfiles: map[string][]byte{
|
||||
"card0/device/vendor": []byte("0x8086"),
|
||||
},
|
||||
devfsdirs: []string{
|
||||
"card0",
|
||||
"by-path/pci-0000:00:00.0-card",
|
||||
"by-path/pci-0000:00:00.0-render",
|
||||
},
|
||||
expectedI915Devs: 1,
|
||||
l0mock: &mockL0Service{
|
||||
healthy: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "one device with proper symlink returns error",
|
||||
pciAddresses: map[string]string{"0000:00:00.0": "card0"},
|
||||
sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
|
||||
sysfsfiles: map[string][]byte{
|
||||
"card0/device/vendor": []byte("0x8086"),
|
||||
},
|
||||
devfsdirs: []string{
|
||||
"card0",
|
||||
"by-path/pci-0000:00:00.0-card",
|
||||
"by-path/pci-0000:00:00.0-render",
|
||||
},
|
||||
expectedI915Devs: 1,
|
||||
l0mock: &mockL0Service{
|
||||
fail: true,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tcases {
|
||||
if tc.options.sharedDevNum == 0 {
|
||||
tc.options.sharedDevNum = 1
|
||||
}
|
||||
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
root, err := os.MkdirTemp("", "test_new_device_plugin")
|
||||
if err != nil {
|
||||
t.Fatalf("can't create temporary directory: %+v", err)
|
||||
}
|
||||
|
||||
// dirs/files need to be removed for the next test
|
||||
defer os.RemoveAll(root)
|
||||
|
||||
sysfs, devfs, err := createTestFiles(root, tc)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error: %+v", err)
|
||||
}
|
||||
|
||||
plugin := newDevicePlugin(sysfs, devfs, tc.options)
|
||||
|
||||
plugin.levelzeroService = tc.l0mock
|
||||
|
||||
notifier := &mockNotifier{
|
||||
scanDone: plugin.scanDone,
|
||||
}
|
||||
|
||||
err = plugin.Scan(notifier)
|
||||
// Scans in GPU plugin never fail
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error: %+v", err)
|
||||
}
|
||||
if tc.expectedI915Devs != notifier.i915Count {
|
||||
t.Errorf("Expected %d, discovered %d devices (i915)",
|
||||
tc.expectedI915Devs, notifier.i915Count)
|
||||
}
|
||||
if tc.expectedI915Monitors != notifier.i915monitorCount {
|
||||
t.Errorf("Expected %d, discovered %d monitors (i915)",
|
||||
tc.expectedI915Monitors, notifier.i915monitorCount)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestScanWsl(t *testing.T) {
|
||||
tcases := []TestCaseDetails{
|
||||
{
|
||||
name: "one wsl device",
|
||||
expectedDxgDevs: 1,
|
||||
l0mock: &mockL0Service{
|
||||
indices: []uint32{0},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "four wsl device",
|
||||
expectedDxgDevs: 4,
|
||||
l0mock: &mockL0Service{
|
||||
indices: []uint32{0, 1, 2, 3},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tcases {
|
||||
if tc.options.sharedDevNum == 0 {
|
||||
tc.options.sharedDevNum = 1
|
||||
}
|
||||
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
root, err := os.MkdirTemp("", "test_new_device_plugin")
|
||||
if err != nil {
|
||||
t.Fatalf("can't create temporary directory: %+v", err)
|
||||
}
|
||||
|
||||
// dirs/files need to be removed for the next test
|
||||
defer os.RemoveAll(root)
|
||||
|
||||
sysfs, devfs, err := createTestFiles(root, tc)
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error: %+v", err)
|
||||
}
|
||||
|
||||
plugin := newDevicePlugin(sysfs, devfs, tc.options)
|
||||
plugin.options.wslScan = true
|
||||
plugin.levelzeroService = tc.l0mock
|
||||
|
||||
notifier := &mockNotifier{
|
||||
scanDone: plugin.scanDone,
|
||||
}
|
||||
|
||||
err = plugin.Scan(notifier)
|
||||
// Scans in GPU plugin never fail
|
||||
if err != nil {
|
||||
t.Errorf("unexpected error: %+v", err)
|
||||
}
|
||||
if tc.expectedDxgDevs != notifier.dxgCount {
|
||||
t.Errorf("Expected %d, discovered %d devices (dxg)",
|
||||
tc.expectedDxgDevs, notifier.dxgCount)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestScanFails(t *testing.T) {
|
||||
tc := TestCaseDetails{
|
||||
name: "xe and i915 devices with rm will fail",
|
||||
@ -582,11 +827,11 @@ func createBypathTestFiles(t *testing.T, card, root, linkFile string, bypathFile
|
||||
byPath := path.Join(root, "by-path")
|
||||
|
||||
if linkFile != "" {
|
||||
if err := os.MkdirAll(filepath.Dir(devPath), os.ModePerm); err != nil {
|
||||
if err := os.MkdirAll(filepath.Dir(devPath), 0700); err != nil {
|
||||
t.Fatal("Couldn't create test dev dir", err)
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(filepath.Dir(drmPath), os.ModePerm); err != nil {
|
||||
if err := os.MkdirAll(filepath.Dir(drmPath), 0700); err != nil {
|
||||
t.Fatal("Couldn't create test drm dir", err)
|
||||
}
|
||||
|
||||
@ -600,7 +845,7 @@ func createBypathTestFiles(t *testing.T, card, root, linkFile string, bypathFile
|
||||
}
|
||||
|
||||
if len(bypathFiles) > 0 {
|
||||
if err := os.MkdirAll(byPath, os.ModePerm); err != nil {
|
||||
if err := os.MkdirAll(byPath, 0700); err != nil {
|
||||
t.Fatal("Mkdir failed:", byPath)
|
||||
}
|
||||
|
||||
|
cmd/gpu_plugin/levelzeroservice/levelzero_service.go (new file, 208 lines)
@ -0,0 +1,208 @@
|
||||
// Copyright 2024 Intel Corporation. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package levelzeroservice
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
lz "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero"
|
||||
"google.golang.org/grpc"
|
||||
"google.golang.org/grpc/connectivity"
|
||||
"google.golang.org/grpc/credentials/insecure"
|
||||
"k8s.io/klog/v2"
|
||||
)
|
||||
|
||||
type LevelzeroService interface {
|
||||
Run(bool)
|
||||
GetIntelIndices() ([]uint32, error)
|
||||
GetDeviceHealth(bdfAddress string) (DeviceHealth, error)
|
||||
GetDeviceTemperature(bdfAddress string) (DeviceTemperature, error)
|
||||
GetDeviceMemoryAmount(bdfAddress string) (uint64, error)
|
||||
}
|
||||
|
||||
type DeviceHealth struct {
|
||||
Memory bool
|
||||
Bus bool
|
||||
SoC bool
|
||||
GlobalTemperature float64
|
||||
GPUTemperature float64
|
||||
MemoryTemperature float64
|
||||
}
|
||||
|
||||
type DeviceTemperature struct {
|
||||
Global float64
|
||||
GPU float64
|
||||
Memory float64
|
||||
}
|
||||
|
||||
type clientNotReadyErr struct{}
|
||||
|
||||
func (e *clientNotReadyErr) Error() string {
|
||||
return "client is not (yet) ready"
|
||||
}
|
||||
|
||||
func NewLevelzero(socket string) LevelzeroService {
|
||||
return &levelzero{
|
||||
socketPath: socket,
|
||||
ctx: context.Background(),
|
||||
conn: nil,
|
||||
client: nil,
|
||||
}
|
||||
}
|
||||
|
||||
type levelzero struct {
|
||||
client lz.LevelzeroClient
|
||||
ctx context.Context
|
||||
conn *grpc.ClientConn
|
||||
socketPath string
|
||||
}
|
||||
|
||||
func (l *levelzero) Run(keep bool) {
|
||||
url := "unix://" + l.socketPath
|
||||
|
||||
klog.V(3).Info("Starting Level-Zero client. Connecting to: ", url)
|
||||
|
||||
conn, err := grpc.NewClient(url, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||
if err != nil {
|
||||
klog.Error("Failed to connect to socket", err)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
l.conn = conn
|
||||
|
||||
for {
|
||||
state := l.conn.GetState()
|
||||
if state == connectivity.Idle {
|
||||
conn.Connect()
|
||||
}
|
||||
|
||||
if state == connectivity.Ready {
|
||||
klog.V(2).Info("Connection ready")
|
||||
|
||||
l.client = lz.NewLevelzeroClient(conn)
|
||||
|
||||
if !keep {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !conn.WaitForStateChange(ctx, state) {
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (l *levelzero) isClientReady() bool {
|
||||
return l.client != nil
|
||||
}
|
||||
|
||||
func (l *levelzero) GetIntelIndices() ([]uint32, error) {
|
||||
if !l.isClientReady() {
|
||||
return []uint32{}, &clientNotReadyErr{}
|
||||
}
|
||||
|
||||
cli := l.client
|
||||
|
||||
indices, err := cli.GetIntelIndices(l.ctx, &lz.GetIntelIndicesMessage{})
|
||||
if err != nil || indices == nil {
|
||||
return []uint32{}, err
|
||||
}
|
||||
|
||||
if indices.Error != nil && indices.Error.Errorcode != 0 {
|
||||
klog.Warningf("indices request returned internal error: 0x%X (%s)", indices.Error.Errorcode, indices.Error.Description)
|
||||
}
|
||||
|
||||
return indices.Indices, nil
|
||||
}
|
||||
|
||||
func (l *levelzero) GetDeviceHealth(bdfAddress string) (DeviceHealth, error) {
|
||||
if !l.isClientReady() {
|
||||
return DeviceHealth{}, &clientNotReadyErr{}
|
||||
}
|
||||
|
||||
cli := l.client
|
||||
|
||||
did := lz.DeviceId{
|
||||
BdfAddress: bdfAddress,
|
||||
}
|
||||
|
||||
health, err := cli.GetDeviceHealth(l.ctx, &did)
|
||||
if err != nil || health == nil {
|
||||
return DeviceHealth{}, err
|
||||
}
|
||||
|
||||
if health.Error != nil && health.Error.Errorcode != 0 {
|
||||
klog.Warningf("health request returned internal error: 0x%X (%s)", health.Error.Errorcode, health.Error.Description)
|
||||
}
|
||||
|
||||
return DeviceHealth{
|
||||
Memory: health.MemoryOk,
|
||||
Bus: health.BusOk,
|
||||
SoC: health.SocOk,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (l *levelzero) GetDeviceTemperature(bdfAddress string) (DeviceTemperature, error) {
|
||||
if !l.isClientReady() {
|
||||
return DeviceTemperature{}, &clientNotReadyErr{}
|
||||
}
|
||||
|
||||
cli := l.client
|
||||
|
||||
did := lz.DeviceId{
|
||||
BdfAddress: bdfAddress,
|
||||
}
|
||||
|
||||
temps, err := cli.GetDeviceTemperature(l.ctx, &did)
|
||||
if err != nil || temps == nil {
|
||||
return DeviceTemperature{}, err
|
||||
}
|
||||
|
||||
if temps.Error != nil && temps.Error.Errorcode != 0 {
|
||||
klog.Warningf("temperature request returned internal error: 0x%X (%s)", temps.Error.Errorcode, temps.Error.Description)
|
||||
}
|
||||
|
||||
return DeviceTemperature{
|
||||
Global: temps.Global,
|
||||
GPU: temps.Gpu,
|
||||
Memory: temps.Memory,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (l *levelzero) GetDeviceMemoryAmount(bdfAddress string) (uint64, error) {
|
||||
if !l.isClientReady() {
|
||||
return 0, &clientNotReadyErr{}
|
||||
}
|
||||
|
||||
cli := l.client
|
||||
|
||||
did := lz.DeviceId{
|
||||
BdfAddress: bdfAddress,
|
||||
}
|
||||
|
||||
memSize, err := cli.GetDeviceMemoryAmount(l.ctx, &did)
|
||||
if err != nil || memSize == nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
if memSize.Error != nil && memSize.Error.Errorcode != 0 {
|
||||
klog.Warningf("memory request returned internal error: 0x%X (%s)", memSize.Error.Errorcode, memSize.Error.Description)
|
||||
}
|
||||
|
||||
return memSize.MemorySize, nil
|
||||
}
|
cmd/gpu_plugin/levelzeroservice/levelzero_service_test.go (new file, 307 lines)
@ -0,0 +1,307 @@
|
||||
// Copyright 2024 Intel Corporation. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package levelzeroservice
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"log"
|
||||
"net"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
lz "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero"
|
||||
"google.golang.org/grpc"
|
||||
)
|
||||
|
||||
const (
|
||||
NoError = iota
|
||||
InternalError = iota
|
||||
ExternalError = iota
|
||||
)
|
||||
|
||||
func init() {
|
||||
_ = flag.Set("v", "4") //Enable debug output
|
||||
}
|
||||
|
||||
type mockServer struct {
|
||||
lz.UnimplementedLevelzeroServer
|
||||
failRequest int
|
||||
}
|
||||
|
||||
func (m *mockServer) serve(socketPath string) {
|
||||
lis, err := net.Listen("unix", socketPath)
|
||||
if err != nil {
|
||||
log.Fatalf("failed to listen: %v", err)
|
||||
}
|
||||
|
||||
s := grpc.NewServer()
|
||||
|
||||
lz.RegisterLevelzeroServer(s, m)
|
||||
|
||||
go func() {
|
||||
if err := s.Serve(lis); err != nil {
|
||||
log.Fatalf("failed to serve: %v", err)
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (m *mockServer) GetDeviceHealth(c context.Context, deviceid *lz.DeviceId) (*lz.DeviceHealth, error) {
|
||||
if m.failRequest == ExternalError {
|
||||
return nil, os.ErrInvalid
|
||||
}
|
||||
|
||||
health := &lz.DeviceHealth{
|
||||
BusOk: true,
|
||||
MemoryOk: true,
|
||||
Error: nil,
|
||||
}
|
||||
|
||||
if m.failRequest == InternalError {
|
||||
health.MemoryOk = false
|
||||
health.Error = &lz.Error{
|
||||
Description: "error error",
|
||||
Errorcode: 99,
|
||||
}
|
||||
}
|
||||
|
||||
return health, nil
|
||||
}
|
||||
|
||||
func (m *mockServer) GetDeviceTemperature(c context.Context, deviceid *lz.DeviceId) (*lz.DeviceTemperature, error) {
|
||||
if m.failRequest == ExternalError {
|
||||
return nil, os.ErrInvalid
|
||||
}
|
||||
|
||||
temps := &lz.DeviceTemperature{
|
||||
Global: 35.0,
|
||||
Gpu: 35.0,
|
||||
Memory: 35.0,
|
||||
Error: nil,
|
||||
}
|
||||
|
||||
if m.failRequest == InternalError {
|
||||
temps.Global = -999.0
|
||||
temps.Gpu = -999.0
|
||||
temps.Memory = -999.0
|
||||
temps.Error = &lz.Error{
|
||||
Description: "error error",
|
||||
Errorcode: 99,
|
||||
}
|
||||
}
|
||||
|
||||
return temps, nil
|
||||
}
|
||||
|
||||
func (m *mockServer) GetIntelIndices(c context.Context, msg *lz.GetIntelIndicesMessage) (*lz.DeviceIndices, error) {
|
||||
if m.failRequest == ExternalError {
|
||||
return nil, os.ErrInvalid
|
||||
}
|
||||
|
||||
ret := lz.DeviceIndices{
|
||||
Indices: []uint32{0},
|
||||
Error: nil,
|
||||
}
|
||||
|
||||
if m.failRequest == InternalError {
|
||||
ret.Indices = []uint32{}
|
||||
ret.Error = &lz.Error{
|
||||
Description: "error error",
|
||||
Errorcode: 99,
|
||||
}
|
||||
}
|
||||
|
||||
return &ret, nil
|
||||
}
|
||||
|
||||
func (m *mockServer) GetDeviceMemoryAmount(c context.Context, deviceid *lz.DeviceId) (*lz.DeviceMemoryAmount, error) {
|
||||
if m.failRequest == ExternalError {
|
||||
return nil, os.ErrInvalid
|
||||
}
|
||||
|
||||
ret := lz.DeviceMemoryAmount{
|
||||
MemorySize: 1000,
|
||||
Error: nil,
|
||||
}
|
||||
|
||||
if m.failRequest == InternalError {
|
||||
ret.MemorySize = 0
|
||||
ret.Error = &lz.Error{
|
||||
Description: "error error",
|
||||
Errorcode: 99,
|
||||
}
|
||||
}
|
||||
|
||||
return &ret, nil
|
||||
}
|
||||
|
||||
type testcase struct {
|
||||
name string
|
||||
fail int
|
||||
}
|
||||
|
||||
var tcases = []testcase{
|
||||
{
|
||||
name: "normal flow",
|
||||
fail: NoError,
|
||||
},
|
||||
{
|
||||
name: "fail flow - internal",
|
||||
fail: InternalError,
|
||||
},
|
||||
{
|
||||
name: "fail flow - external",
|
||||
fail: ExternalError,
|
||||
},
|
||||
}
|
||||
|
||||
func TestGetDeviceHealth(t *testing.T) {
|
||||
for _, tc := range tcases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
d, err := os.MkdirTemp("", "testinglevelzero*")
|
||||
if err != nil {
|
||||
t.Fatal("failed to create tmp directory")
|
||||
}
|
||||
|
||||
defer os.RemoveAll(d)
|
||||
|
||||
sockPath := filepath.Join(d, "server.sock")
|
||||
|
||||
mock := mockServer{
|
||||
failRequest: tc.fail,
|
||||
}
|
||||
|
||||
mock.serve(sockPath)
|
||||
|
||||
n := NewLevelzero(sockPath)
|
||||
|
||||
n.Run(false)
|
||||
|
||||
dh, err := n.GetDeviceHealth("0000:00:00.1")
|
||||
|
||||
if tc.fail == NoError && err != nil {
|
||||
t.Error("GetDeviceHealth returned an error:", err)
|
||||
}
|
||||
|
||||
if tc.fail == NoError && (!dh.Memory || !dh.Bus) {
|
||||
t.Error("Call to device health returned unhealthy", dh, tc.fail)
|
||||
}
|
||||
|
||||
if tc.fail == ExternalError && err == nil {
|
||||
t.Error("GetDeviceHealth returned nil and expected error")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetIndices(t *testing.T) {
|
||||
for _, tc := range tcases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
d, err := os.MkdirTemp("", "testinglevelzero*")
|
||||
if err != nil {
|
||||
t.Fatal("failed to create tmp directory")
|
||||
}
|
||||
|
||||
defer os.RemoveAll(d)
|
||||
|
||||
sockPath := filepath.Join(d, "server.sock")
|
||||
|
||||
mock := mockServer{
|
||||
failRequest: tc.fail,
|
||||
}
|
||||
|
||||
mock.serve(sockPath)
|
||||
|
||||
n := NewLevelzero(sockPath)
|
||||
|
||||
n.Run(false)
|
||||
|
||||
indices, err := n.GetIntelIndices()
|
||||
|
||||
if tc.fail == NoError && err != nil {
|
||||
t.Error("GetIntelIndices returned error:", err)
|
||||
}
|
||||
|
||||
if tc.fail == ExternalError && err == nil {
|
||||
t.Error("GetIntelIndices returned nil and expected error")
|
||||
}
|
||||
|
||||
if tc.fail == NoError && len(indices) != 1 {
|
||||
t.Error("Wrong number of indices received", indices)
|
||||
}
|
||||
if tc.fail != NoError && len(indices) != 0 {
|
||||
t.Error("Wrong number of indices received", indices)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetMemoryAmount(t *testing.T) {
|
||||
for _, tc := range tcases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
d, err := os.MkdirTemp("", "testinglevelzero*")
|
||||
if err != nil {
|
||||
t.Fatal("failed to create tmp directory")
|
||||
}
|
||||
|
||||
defer os.RemoveAll(d)
|
||||
|
||||
sockPath := filepath.Join(d, "server.sock")
|
||||
|
||||
mock := mockServer{
|
||||
failRequest: tc.fail,
|
||||
}
|
||||
|
||||
mock.serve(sockPath)
|
||||
|
||||
n := NewLevelzero(sockPath)
|
||||
n.Run(false)
|
||||
|
||||
memSize, err := n.GetDeviceMemoryAmount("0000:11:22.3")
|
||||
|
||||
if tc.fail == NoError && err != nil {
|
||||
t.Error("TestGetMemoryAmount returned error:", err)
|
||||
}
|
||||
|
||||
if tc.fail == ExternalError && err == nil {
|
||||
t.Error("TestGetMemoryAmount returned nil and expected error")
|
||||
}
|
||||
|
||||
if tc.fail == NoError && memSize != 1000 {
|
||||
t.Error("Wrong mem size received", memSize)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAccessBeforeReady(t *testing.T) {
|
||||
n := NewLevelzero("/tmp/foobar.sock")
|
||||
|
||||
_, err := n.GetDeviceMemoryAmount("")
|
||||
if err == nil {
|
||||
t.Error("Got non-error for memory amount, expected error")
|
||||
}
|
||||
|
||||
_, err = n.GetDeviceHealth("")
|
||||
if err == nil {
|
||||
t.Error("Got non-error for health, expected error")
|
||||
}
|
||||
|
||||
_, err = n.GetIntelIndices()
|
||||
if err == nil {
|
||||
t.Error("Got non-error for indices, expected error")
|
||||
}
|
||||
}
|
@ -51,8 +51,8 @@ const (
|
||||
gasCardAnnotation = "gas-container-cards"
|
||||
gasTileAnnotation = "gas-container-tiles"
|
||||
|
||||
levelZeroAffinityMaskEnvVar = "ZE_AFFINITY_MASK"
|
||||
levelZeroHierarchyEnvVar = "ZE_FLAT_DEVICE_HIERARCHY"
|
||||
LevelzeroAffinityMaskEnvVar = "ZE_AFFINITY_MASK"
|
||||
levelzeroHierarchyEnvVar = "ZE_FLAT_DEVICE_HIERARCHY"
|
||||
|
||||
hierarchyModeComposite = "COMPOSITE"
|
||||
hierarchyModeFlat = "FLAT"
|
||||
@ -796,7 +796,7 @@ func (rm *resourceManager) createAllocateResponse(deviceIds []string, tileAffini
|
||||
cresp.Envs = make(map[string]string)
|
||||
}
|
||||
|
||||
cresp.Envs[levelZeroAffinityMaskEnvVar] = tileAffinityMask
|
||||
cresp.Envs[LevelzeroAffinityMaskEnvVar] = tileAffinityMask
|
||||
}
|
||||
|
||||
allocateResponse.ContainerResponses = append(allocateResponse.ContainerResponses, &cresp)
|
||||
@ -851,7 +851,7 @@ func containerCards(pod *v1.Pod, gpuUsingContainerIndex int) []string {
|
||||
|
||||
// Guesses level zero hierarchy mode for the container. Defaults to the new "flat" mode
|
||||
// if no mode is set in the container's env variables.
|
||||
func guessLevelZeroHierarchyMode(pod *v1.Pod, containerIndex int) string {
|
||||
func guessLevelzeroHierarchyMode(pod *v1.Pod, containerIndex int) string {
|
||||
klog.V(4).Infof("Checking pod %s envs", pod.Name)
|
||||
|
||||
if containerIndex < len(pod.Spec.Containers) {
|
||||
@ -859,7 +859,7 @@ func guessLevelZeroHierarchyMode(pod *v1.Pod, containerIndex int) string {
|
||||
|
||||
if c.Env != nil {
|
||||
for _, env := range c.Env {
|
||||
if env.Name == levelZeroHierarchyEnvVar {
|
||||
if env.Name == levelzeroHierarchyEnvVar {
|
||||
switch env.Value {
|
||||
// Check that the value is valid.
|
||||
case hierarchyModeComposite:
|
||||
@ -959,7 +959,7 @@ func containerTileAffinityMask(pod *v1.Pod, gpuUsingContainerIndex, tilesPerCard
|
||||
}
|
||||
|
||||
if i == gpuUsingContainerIndex {
|
||||
return convertTileInfoToEnvMask(containerTileInfo, tilesPerCard, guessLevelZeroHierarchyMode(pod, containerIndex))
|
||||
return convertTileInfoToEnvMask(containerTileInfo, tilesPerCard, guessLevelzeroHierarchyMode(pod, containerIndex))
|
||||
}
|
||||
|
||||
i++
|
||||
|
@ -472,7 +472,7 @@ func TestCreateFractionalResourceResponseWithOneCardTwoTiles(t *testing.T) {
|
||||
},
|
||||
Env: []v1.EnvVar{
|
||||
{
|
||||
Name: levelZeroHierarchyEnvVar,
|
||||
Name: levelzeroHierarchyEnvVar,
|
||||
Value: hierarchyModeComposite,
|
||||
},
|
||||
},
|
||||
@ -522,8 +522,8 @@ func TestCreateFractionalResourceResponseWithOneCardTwoTiles(t *testing.T) {
|
||||
// check response
|
||||
expectTruef(len(resp.ContainerResponses) == 1, t, tCase.name, "wrong number of container responses, expected 1")
|
||||
expectTruef(len(resp.ContainerResponses[0].Envs) == 2, t, tCase.name, "wrong number of env variables in container response, expected 2")
|
||||
expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set")
|
||||
expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] == "0.0,0.1", t, tCase.name, "l0 affinity mask is incorrect")
|
||||
expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set")
|
||||
expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] == "0.0,0.1", t, tCase.name, "l0 affinity mask is incorrect")
|
||||
expectTruef(len(resp.ContainerResponses[0].Devices) == 1, t, tCase.name, "wrong number of devices, expected 1")
|
||||
}
|
||||
|
||||
@ -545,7 +545,7 @@ func TestCreateFractionalResourceResponseWithTwoCardsOneTile(t *testing.T) {
|
||||
},
|
||||
Env: []v1.EnvVar{
|
||||
{
|
||||
Name: levelZeroHierarchyEnvVar,
|
||||
Name: levelzeroHierarchyEnvVar,
|
||||
Value: hierarchyModeComposite,
|
||||
},
|
||||
},
|
||||
@ -598,8 +598,8 @@ func TestCreateFractionalResourceResponseWithTwoCardsOneTile(t *testing.T) {
|
||||
} else {
|
||||
// check response
|
||||
expectTruef(len(resp.ContainerResponses) == 1, t, tCase.name, "wrong number of container responses, expected 1")
|
||||
expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set")
|
||||
expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] == "0.3,1.4", t, tCase.name, "l0 affinity mask is incorrect: ")
|
||||
expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set")
|
||||
expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] == "0.3,1.4", t, tCase.name, "l0 affinity mask is incorrect: ")
|
||||
expectTruef(len(resp.ContainerResponses[0].Devices) == 2, t, tCase.name, "wrong number of devices, expected 2")
|
||||
}
|
||||
}
|
||||
@ -623,7 +623,7 @@ func TestCreateFractionalResourceResponseWithThreeCardsTwoTiles(t *testing.T) {
|
||||
},
|
||||
Env: []v1.EnvVar{
|
||||
{
|
||||
Name: levelZeroHierarchyEnvVar,
|
||||
Name: levelzeroHierarchyEnvVar,
|
||||
Value: hierarchyModeComposite,
|
||||
},
|
||||
},
|
||||
@ -676,8 +676,8 @@ func TestCreateFractionalResourceResponseWithThreeCardsTwoTiles(t *testing.T) {
|
||||
} else {
|
||||
// check response
|
||||
expectTruef(len(resp.ContainerResponses) == 1, t, tCase.name, "wrong number of container responses, expected 1")
|
||||
expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set")
|
||||
expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] == "0.0,0.1,1.2,1.3,2.3,2.4", t, tCase.name, "l0 affinity mask is incorrect: ")
|
||||
expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set")
|
||||
expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] == "0.0,0.1,1.2,1.3,2.3,2.4", t, tCase.name, "l0 affinity mask is incorrect: ")
|
||||
expectTruef(len(resp.ContainerResponses[0].Devices) == 3, t, tCase.name, "wrong number of devices, expected 3")
|
||||
}
|
||||
}
|
||||
@ -701,7 +701,7 @@ func TestCreateFractionalResourceResponseWithMultipleContainersTileEach(t *testi
|
||||
},
|
||||
Env: []v1.EnvVar{
|
||||
{
|
||||
Name: levelZeroHierarchyEnvVar,
|
||||
Name: levelzeroHierarchyEnvVar,
|
||||
Value: hierarchyModeComposite,
|
||||
},
|
||||
},
|
||||
@ -714,7 +714,7 @@ func TestCreateFractionalResourceResponseWithMultipleContainersTileEach(t *testi
|
||||
},
|
||||
Env: []v1.EnvVar{
|
||||
{
|
||||
Name: levelZeroHierarchyEnvVar,
|
||||
Name: levelzeroHierarchyEnvVar,
|
||||
Value: hierarchyModeComposite,
|
||||
},
|
||||
},
|
||||
@ -945,7 +945,7 @@ func TestTileAnnotationParsing(t *testing.T) {
|
||||
if i < len(pt.hierarchys) {
|
||||
pod.Spec.Containers[i].Env = []v1.EnvVar{
|
||||
{
|
||||
Name: levelZeroHierarchyEnvVar,
|
||||
Name: levelzeroHierarchyEnvVar,
|
||||
Value: pt.hierarchys[i],
|
||||
},
|
||||
}
|
||||
|
@ -29,6 +29,7 @@ import (
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice"
|
||||
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils"
|
||||
"github.com/pkg/errors"
|
||||
"k8s.io/klog/v2"
|
||||
@ -60,6 +61,8 @@ type labeler struct {
|
||||
controlDeviceReg *regexp.Regexp
|
||||
labels labelMap
|
||||
|
||||
levelzero levelzeroservice.LevelzeroService
|
||||
|
||||
sysfsDRMDir string
|
||||
labelsChanged bool
|
||||
}
|
||||
@ -163,7 +166,7 @@ func fallback() uint64 {
|
||||
return getEnvVarNumber(memoryOverrideEnv)
|
||||
}
|
||||
|
||||
func GetMemoryAmount(sysfsDrmDir, gpuName string, numTiles uint64) uint64 {
|
||||
func legacyFallback(sysfsDrmDir, gpuName string, numTiles uint64) uint64 {
|
||||
reserved := getEnvVarNumber(memoryReservedEnv)
|
||||
|
||||
filePath := filepath.Join(sysfsDrmDir, gpuName, "lmem_total_bytes")
|
||||
@ -183,6 +186,26 @@ func GetMemoryAmount(sysfsDrmDir, gpuName string, numTiles uint64) uint64 {
|
||||
return totalPerTile*numTiles - reserved
|
||||
}
|
||||
|
||||
func (l *labeler) GetMemoryAmount(sysfsDrmDir, gpuName string, numTiles uint64) uint64 {
|
||||
link, err := os.Readlink(filepath.Join(sysfsDrmDir, gpuName, "device"))
|
||||
if err != nil {
|
||||
return legacyFallback(sysfsDrmDir, gpuName, numTiles)
|
||||
}
|
||||
|
||||
amount := uint64(0)
|
||||
|
||||
if l.levelzero != nil {
|
||||
amount, err = l.levelzero.GetDeviceMemoryAmount(filepath.Base(link))
|
||||
if amount == 0 || err != nil {
|
||||
return legacyFallback(sysfsDrmDir, gpuName, numTiles)
|
||||
}
|
||||
} else {
|
||||
return legacyFallback(sysfsDrmDir, gpuName, numTiles)
|
||||
}
|
||||
|
||||
return amount
|
||||
}
|
||||
|
||||
// GetTileCount reads the tile count.
|
||||
func GetTileCount(cardPath string) (numTiles uint64) {
|
||||
files := []string{}
|
||||
@ -317,7 +340,7 @@ func (l *labeler) createLabels() error {
|
||||
numTiles := GetTileCount(filepath.Join(l.sysfsDRMDir, gpuName))
|
||||
tileCount += int(numTiles)
|
||||
|
||||
memoryAmount := GetMemoryAmount(l.sysfsDRMDir, gpuName, numTiles)
|
||||
memoryAmount := l.GetMemoryAmount(l.sysfsDRMDir, gpuName, numTiles)
|
||||
gpuNumList = append(gpuNumList, gpuName[4:])
|
||||
|
||||
// get numa node of the GPU
|
||||
@ -446,9 +469,11 @@ func CreateAndPrintLabels(sysfsDRMDir string) {
|
||||
|
||||
// Gathers node's GPU labels on channel trigger or timeout, and write them to a file.
|
||||
// The created label file is deleted on exit (process dying).
|
||||
func Run(sysfsDrmDir, nfdFeatureFile string, updateInterval time.Duration, scanResources chan bool) {
|
||||
func Run(sysfsDrmDir, nfdFeatureFile string, updateInterval time.Duration, scanResources chan bool, levelzero levelzeroservice.LevelzeroService, exitFunc func()) {
|
||||
l := newLabeler(sysfsDrmDir)
|
||||
|
||||
l.levelzero = levelzero
|
||||
|
||||
interruptChan := make(chan os.Signal, 1)
|
||||
signal.Notify(interruptChan, syscall.SIGTERM, syscall.SIGINT, syscall.SIGHUP, syscall.SIGQUIT)
|
||||
|
||||
@ -499,6 +524,6 @@ Loop:
|
||||
|
||||
klog.V(1).Info("Stopping GPU labeler")
|
||||
|
||||
// Close the whole application
|
||||
os.Exit(0)
|
||||
// Call exitFunc that might exit the app
|
||||
exitFunc()
|
||||
}
|
||||
|
@ -17,10 +17,15 @@ package labeler
|
||||
import (
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"strconv"
|
||||
"syscall"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/fsnotify/fsnotify"
|
||||
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice"
|
||||
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils"
|
||||
)
|
||||
|
||||
@ -28,6 +33,30 @@ const (
|
||||
sysfsDirectory = "/sys/"
|
||||
)
|
||||
|
||||
type mockL0Service struct {
|
||||
memSize uint64
|
||||
fail bool
|
||||
}
|
||||
|
||||
func (m *mockL0Service) Run(bool) {
|
||||
}
|
||||
func (m *mockL0Service) GetIntelIndices() ([]uint32, error) {
|
||||
return nil, nil
|
||||
}
|
||||
func (m *mockL0Service) GetDeviceHealth(bdfAddress string) (levelzeroservice.DeviceHealth, error) {
|
||||
return levelzeroservice.DeviceHealth{}, nil
|
||||
}
|
||||
func (m *mockL0Service) GetDeviceTemperature(bdfAddress string) (levelzeroservice.DeviceTemperature, error) {
|
||||
return levelzeroservice.DeviceTemperature{}, nil
|
||||
}
|
||||
func (m *mockL0Service) GetDeviceMemoryAmount(bdfAddress string) (uint64, error) {
|
||||
if m.fail {
|
||||
return m.memSize, os.ErrInvalid
|
||||
}
|
||||
|
||||
return m.memSize, nil
|
||||
}
|
||||
|
||||
type testcase struct {
|
||||
capabilityFile map[string][]byte
|
||||
expectedRetval error
|
||||
@ -579,7 +608,7 @@ func getTestCases() []testcase {
|
||||
}
|
||||
}
|
||||
|
||||
func (tc *testcase) createFiles(t *testing.T, sysfs, root string) {
|
||||
func (tc *testcase) createFiles(t *testing.T, sysfs string) {
|
||||
for _, sysfsdir := range tc.sysfsdirs {
|
||||
if err := os.MkdirAll(path.Join(sysfs, sysfsdir), 0750); err != nil {
|
||||
t.Fatalf("Failed to create fake sysfs directory: %+v", err)
|
||||
@ -645,7 +674,7 @@ func TestLabeling(t *testing.T) {
|
||||
}
|
||||
sysfs := path.Join(subroot, "pci0000:00/0000:00:1b.4", sysfsDirectory)
|
||||
|
||||
tc.createFiles(t, sysfs, subroot)
|
||||
tc.createFiles(t, sysfs)
|
||||
|
||||
os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10))
|
||||
os.Setenv(memoryReservedEnv, strconv.FormatUint(tc.memoryReserved, 10))
|
||||
@ -663,3 +692,176 @@ func TestLabeling(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCreateAndRun(t *testing.T) {
|
||||
root, err := os.MkdirTemp("", "test_new_device_plugin")
|
||||
if err != nil {
|
||||
t.Fatalf("can't create temporary directory: %+v", err)
|
||||
}
|
||||
|
||||
defer os.RemoveAll(root)
|
||||
|
||||
tc := getTestCases()[0]
|
||||
|
||||
subroot, err := os.MkdirTemp(root, "tc")
|
||||
if err != nil {
|
||||
t.Fatalf("can't create temporary subroot directory: %+v", err)
|
||||
}
|
||||
|
||||
t.Run("CreateAndPrintLabels", func(t *testing.T) {
|
||||
err := os.MkdirAll(path.Join(subroot, "0"), 0750)
|
||||
if err != nil {
|
||||
t.Fatalf("couldn't create dir: %s", err.Error())
|
||||
}
|
||||
sysfs := path.Join(subroot, "pci0000:00/0000:00:1b.4", sysfsDirectory)
|
||||
|
||||
tc.createFiles(t, sysfs)
|
||||
|
||||
CreateAndPrintLabels(sysfs)
|
||||
})
|
||||
|
||||
waitForFileOp := func(directory, file string, eventType fsnotify.Op, duration time.Duration) bool {
|
||||
watcher, err := fsnotify.NewWatcher()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer watcher.Close()
|
||||
|
||||
if err := watcher.Add(directory); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
timer := time.NewTimer(duration)
|
||||
|
||||
for {
|
||||
select {
|
||||
case event := <-watcher.Events:
|
||||
if filepath.Base(event.Name) == file && event.Has(eventType) {
|
||||
return true
|
||||
}
|
||||
case <-timer.C:
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t.Run("Run", func(t *testing.T) {
|
||||
err := os.MkdirAll(path.Join(subroot, "0"), 0750)
|
||||
if err != nil {
|
||||
t.Fatalf("couldn't create dir: %s", err.Error())
|
||||
}
|
||||
sysfs := path.Join(subroot, "pci0000:00/0000:00:1b.4", sysfsDirectory)
|
||||
|
||||
tc.createFiles(t, sysfs)
|
||||
|
||||
c := make(chan bool, 1)
|
||||
|
||||
nfdLabelBase := "nfd-labelfile.txt"
|
||||
nfdLabelFile := filepath.Join(root, nfdLabelBase)
|
||||
|
||||
go Run(sysfs, nfdLabelFile, time.Millisecond, c, nil, func() {})
|
||||
|
||||
// Wait for the labeling timeout to trigger
|
||||
if !waitForFileOp(root, nfdLabelBase, fsnotify.Create, time.Second*2) {
|
||||
t.Error("Run didn't create label file")
|
||||
}
|
||||
|
||||
err = syscall.Kill(syscall.Getpid(), syscall.SIGHUP)
|
||||
if err != nil {
|
||||
t.Error("Calling Kill failed")
|
||||
}
|
||||
|
||||
// Wait for the labeling timeout to trigger
|
||||
if !waitForFileOp(root, nfdLabelBase, fsnotify.Remove, time.Second*2) {
|
||||
t.Error("Run didn't remove label file")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestL0ServiceUse(t *testing.T) {
|
||||
root, err := os.MkdirTemp("", "test_new_device_plugin")
|
||||
if err != nil {
|
||||
t.Fatalf("can't create temporary directory: %+v", err)
|
||||
}
|
||||
|
||||
defer os.RemoveAll(root)
|
||||
|
||||
pciAddr := path.Join(root, "sys", ".devices", "0000:00:01.0")
|
||||
cardPath := path.Join(root, "sys", "card0")
|
||||
|
||||
err = os.MkdirAll(pciAddr, 0750)
|
||||
if err != nil {
|
||||
t.Fatalf("couldn't create pci dir: %s", err.Error())
|
||||
}
|
||||
|
||||
err = os.MkdirAll(cardPath, 0750)
|
||||
if err != nil {
|
||||
t.Fatalf("couldn't create card dir: %s", err.Error())
|
||||
}
|
||||
|
||||
err = os.Symlink(pciAddr, filepath.Join(cardPath, "device"))
|
||||
if err != nil {
|
||||
t.Fatalf("couldn't create symlink: %s", err.Error())
|
||||
}
|
||||
|
||||
err = os.WriteFile(filepath.Join(root, "sys/card0/device/vendor"), []byte("0x8086"), 0600)
|
||||
if err != nil {
|
||||
t.Fatalf("couldn't write vendor file: %s", err.Error())
|
||||
}
|
||||
|
||||
err = os.MkdirAll(filepath.Join(root, "sys/card0/device/drm"), 0600)
|
||||
if err != nil {
|
||||
t.Fatalf("couldn't create card drm dir: %s", err.Error())
|
||||
}
|
||||
|
||||
t.Run("fetch memory from l0 service", func(t *testing.T) {
|
||||
labeler := newLabeler(filepath.Join(root, "sys"))
|
||||
labeler.levelzero = &mockL0Service{
|
||||
memSize: 12345678,
|
||||
}
|
||||
err = labeler.createLabels()
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("labeler didn't work with l0 service")
|
||||
}
|
||||
|
||||
if labeler.labels["gpu.intel.com/memory.max"] != "12345678" {
|
||||
t.Errorf("labeler didn't get memory amount from l0 service: %v", labeler.labels)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("memory fetch from l0 fails", func(t *testing.T) {
|
||||
labeler := newLabeler(filepath.Join(root, "sys"))
|
||||
labeler.levelzero = &mockL0Service{
|
||||
memSize: 0,
|
||||
fail: true,
|
||||
}
|
||||
|
||||
os.Setenv(memoryOverrideEnv, "87654321")
|
||||
err = labeler.createLabels()
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("labeler didn't work with l0 service")
|
||||
}
|
||||
|
||||
if labeler.labels["gpu.intel.com/memory.max"] != "87654321" {
|
||||
t.Errorf("labeler got an invalid memory amount: %v", labeler.labels)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("memory fetch with nil l0 service", func(t *testing.T) {
|
||||
labeler := newLabeler(filepath.Join(root, "sys"))
|
||||
labeler.levelzero = nil
|
||||
|
||||
os.Setenv(memoryOverrideEnv, "87654321")
|
||||
err = labeler.createLabels()
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("labeler didn't work with l0 service")
|
||||
}
|
||||
|
||||
if labeler.labels["gpu.intel.com/memory.max"] != "87654321" {
|
||||
t.Errorf("labeler got an invalid memory amount: %v", labeler.labels)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
deployments/gpu_plugin/overlays/health/args.yaml (new file, 4 lines)
@ -0,0 +1,4 @@
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/args
|
||||
value:
|
||||
- "-health-management"
|
deployments/gpu_plugin/overlays/health/kustomization.yaml (new file, 6 lines)
@ -0,0 +1,6 @@
|
||||
resources:
|
||||
- ../levelzero
|
||||
patches:
|
||||
- path: args.yaml
|
||||
target:
|
||||
kind: DaemonSet
|
deployments/gpu_plugin/overlays/levelzero/kustomization.yaml (new file, 9 lines)
@ -0,0 +1,9 @@
|
||||
resources:
|
||||
- ../../base
|
||||
patches:
|
||||
- path: l0-mounts.yaml
|
||||
target:
|
||||
kind: DaemonSet
|
||||
- path: levelzero.yaml
|
||||
target:
|
||||
kind: DaemonSet
|
deployments/gpu_plugin/overlays/levelzero/l0-mounts.yaml (new file, 10 lines)
@ -0,0 +1,10 @@
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/volumeMounts/-
|
||||
value:
|
||||
name: levelzerosocket
|
||||
mountPath: /var/lib/levelzero
|
||||
- op: add
|
||||
path: /spec/template/spec/volumes/-
|
||||
value:
|
||||
name: levelzerosocket
|
||||
emptyDir: {}
|
deployments/gpu_plugin/overlays/levelzero/levelzero.yaml (new file, 17 lines)
@ -0,0 +1,17 @@
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/-
|
||||
value:
|
||||
name: intel-gpu-levelzero
|
||||
image: intel/intel-gpu-levelzero:devel
|
||||
imagePullPolicy: IfNotPresent
|
||||
args:
|
||||
- "-v=2"
|
||||
securityContext:
|
||||
readOnlyRootFilesystem: true
|
||||
privileged: true
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
volumeMounts:
|
||||
- name: levelzerosocket
|
||||
mountPath: /var/lib/levelzero
|
deployments/gpu_plugin/overlays/wsl/kustomization.yaml (new file, 9 lines)
@ -0,0 +1,9 @@
|
||||
resources:
|
||||
- ../levelzero
|
||||
patches:
|
||||
- path: wsl_mounts.yaml
|
||||
target:
|
||||
kind: DaemonSet
|
||||
- path: wsl_args.yaml
|
||||
target:
|
||||
kind: DaemonSet
|
deployments/gpu_plugin/overlays/wsl/wsl_args.yaml (new file, 8 lines)
@ -0,0 +1,8 @@
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/args
|
||||
value:
|
||||
- "-wsl"
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/1/args/-
|
||||
value:
|
||||
"-wsl"
|
deployments/gpu_plugin/overlays/wsl/wsl_mounts.yaml (new file, 24 lines)
@ -0,0 +1,24 @@
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/1/volumeMounts/-
|
||||
value:
|
||||
name: wsllib
|
||||
mountPath: /usr/lib/wsl
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/volumeMounts/-
|
||||
value:
|
||||
name: devdxg
|
||||
mountPath: /dev/dxg
|
||||
- op: add
|
||||
path: /spec/template/spec/volumes/-
|
||||
value:
|
||||
name: wsllib
|
||||
hostPath:
|
||||
path: /usr/lib/wsl
|
||||
type: DirectoryOrCreate
|
||||
- op: add
|
||||
path: /spec/template/spec/volumes/-
|
||||
value:
|
||||
name: devdxg
|
||||
hostPath:
|
||||
path: /dev/dxg
|
||||
type: CharDevice
|