gpu: add levelzero sidecar support for plugin and the deployment files

In addition to the levelzero's health data use, this adds support to
scan devices in WSL. Scanning happens by retrieving Intel device
indices from the Level-Zero API.

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
This commit is contained in:
Tuomas Katila 2024-06-06 14:54:11 +03:00 committed by Ukri Niemimuukko
parent 2df9443fda
commit 518a8606ff
16 changed files with 1293 additions and 39 deletions

View File

@ -23,6 +23,7 @@ import (
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
"time"
@ -31,8 +32,10 @@ import (
"k8s.io/klog/v2"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice"
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm"
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler"
gpulevelzero "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero"
dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
cdispec "tags.cncf.io/container-device-interface/specs-go"
)
@ -40,6 +43,8 @@ import (
const (
sysfsDrmDirectory = "/sys/class/drm"
devfsDriDirectory = "/dev/dri"
wslDxgPath = "/dev/dxg"
wslLibPath = "/usr/lib/wsl"
nfdFeatureDir = "/etc/kubernetes/node-feature-discovery/features.d"
resourceFilename = "intel-gpu-resources.txt"
gpuDeviceRE = `^card[0-9]+$`
@ -51,6 +56,7 @@ const (
namespace = "gpu.intel.com"
deviceTypeI915 = "i915"
deviceTypeXe = "xe"
deviceTypeDxg = "dxg"
deviceTypeDefault = deviceTypeI915
// telemetry resource settings.
@ -67,8 +73,11 @@ const (
type cliOptions struct {
preferredAllocationPolicy string
sharedDevNum int
temperatureLimit int
enableMonitoring bool
resourceManagement bool
wslScan bool
healthManagement bool
}
type rmWithMultipleDriversErr struct {
@ -274,11 +283,13 @@ type devicePlugin struct {
scanDone chan bool
scanResources chan bool
resMan rm.ResourceManager
resMan rm.ResourceManager
levelzeroService levelzeroservice.LevelzeroService
sysfsDir string
devfsDir string
bypathDir string
sysfsDir string
devfsDir string
bypathDir string
healthStatuses map[string]string
// Note: If restarting the plugin with a new policy, the allocations for existing pods remain with old policy.
policy preferredAllocationPolicyFunc
@ -300,6 +311,7 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi
scanDone: make(chan bool, 1), // buffered as we may send to it before Scan starts receiving from it
bypathFound: true,
scanResources: make(chan bool, 1),
healthStatuses: make(map[string]string),
}
if options.resourceManagement {
@ -325,15 +337,85 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi
dp.policy = nonePolicy
}
if _, err := os.ReadDir(dp.bypathDir); err != nil {
klog.Warningf("failed to read by-path dir: %+v", err)
if !options.wslScan {
if _, err := os.ReadDir(dp.bypathDir); err != nil {
klog.Warningf("failed to read by-path dir: %+v", err)
dp.bypathFound = false
dp.bypathFound = false
}
}
return dp
}
func logHealthStatusChange(card, newStatus string, statuses map[string]string) {
prevState, found := statuses[card]
if !found {
klog.V(2).Infof("%s: new => %s", card, newStatus)
statuses[card] = newStatus
} else if prevState != newStatus {
klog.V(2).Infof("%s: %s => %s", card, prevState, newStatus)
statuses[card] = newStatus
}
}
func (dp *devicePlugin) healthStatusForCard(cardPath string) string {
if dp.levelzeroService == nil {
return pluginapi.Healthy
}
link, err := os.Readlink(filepath.Join(cardPath, "device"))
if err != nil {
klog.Warning("couldn't read device link for", cardPath)
return pluginapi.Healthy
}
health := pluginapi.Healthy
// Check status changes after the function exits
defer func() { logHealthStatusChange(cardPath, health, dp.healthStatuses) }()
bdfAddr := filepath.Base(link)
dh, err := dp.levelzeroService.GetDeviceHealth(bdfAddr)
if err != nil {
klog.Warningf("Device health retrieval failed: %v", err)
return health
}
// Direct Health indicators
klog.V(4).Infof("Health indicators: Memory=%t, Bus=%t, SoC=%t", dh.Memory, dh.Bus, dh.SoC)
if !dh.Memory || !dh.Bus || !dh.SoC {
health = pluginapi.Unhealthy
return health
}
dt, err := dp.levelzeroService.GetDeviceTemperature(bdfAddr)
// In case of any errors, return the current health status
if err != nil {
klog.Warningf("Device temperature retrieval failed: %v", err)
return health
}
limit := float64(dp.options.temperatureLimit)
// Temperatures for different areas
klog.V(4).Infof("Temperatures: Memory=%.1fC, GPU=%.1fC, Global=%.1fC", dh.MemoryTemperature, dh.GPUTemperature, dh.GlobalTemperature)
if dt.GPU > limit || dt.Global > limit || dt.Memory > limit {
health = pluginapi.Unhealthy
}
return health
}
// Implement the PreferredAllocator interface.
func (dp *devicePlugin) GetPreferredAllocation(rqt *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
if dp.resMan != nil {
@ -369,6 +451,68 @@ func (dp *devicePlugin) GetPreferredAllocation(rqt *pluginapi.PreferredAllocatio
}
func (dp *devicePlugin) Scan(notifier dpapi.Notifier) error {
if dp.options.wslScan {
return dp.wslGpuScan(notifier)
} else {
return dp.sysFsGpuScan(notifier)
}
}
func (dp *devicePlugin) wslGpuScan(notifier dpapi.Notifier) error {
defer dp.scanTicker.Stop()
klog.V(1).Infof("GPU (%s) resource share count = %d", deviceTypeDxg, dp.options.sharedDevNum)
devSpecs := []pluginapi.DeviceSpec{
{
HostPath: wslDxgPath,
ContainerPath: wslDxgPath,
Permissions: "rw",
},
}
mounts := []pluginapi.Mount{
{
ContainerPath: wslLibPath,
HostPath: wslLibPath,
ReadOnly: true,
},
}
for {
indices, err := dp.levelzeroService.GetIntelIndices()
if err == nil {
klog.V(4).Info("Intel Level-Zero indices: ", indices)
devTree := dpapi.NewDeviceTree()
for _, index := range indices {
envs := map[string]string{
rm.LevelzeroAffinityMaskEnvVar: strconv.Itoa(int(index)),
}
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, envs, nil, nil)
for i := 0; i < dp.options.sharedDevNum; i++ {
devID := fmt.Sprintf("card%d-%d", index, i)
devTree.AddDevice(deviceTypeDxg, devID, deviceInfo)
}
}
notifier.Notify(devTree)
} else {
klog.Warning("Failed to get Intel indices from Level-Zero")
}
select {
case <-dp.scanDone:
return nil
case <-dp.scanTicker.C:
}
}
}
func (dp *devicePlugin) sysFsGpuScan(notifier dpapi.Notifier) error {
defer dp.scanTicker.Stop()
klog.V(1).Infof("GPU (%s/%s) resource share count = %d", deviceTypeI915, deviceTypeXe, dp.options.sharedDevNum)
@ -566,7 +710,9 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
mounts, cdiDevices := dp.createMountsAndCDIDevices(cardPath, name, devSpecs)
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, nil, nil, cdiDevices)
health := dp.healthStatusForCard(cardPath)
deviceInfo := dpapi.NewDeviceInfo(health, devSpecs, mounts, nil, nil, cdiDevices)
for i := 0; i < dp.options.sharedDevNum; i++ {
devID := fmt.Sprintf("%s-%d", name, i)
@ -627,7 +773,10 @@ func main() {
flag.StringVar(&prefix, "prefix", "", "Prefix for devfs & sysfs paths")
flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable '*_monitoring' (= all GPUs) resource")
flag.BoolVar(&opts.resourceManagement, "resource-manager", false, "fractional GPU resource management")
flag.BoolVar(&opts.healthManagement, "health-management", false, "enable GPU health management")
flag.BoolVar(&opts.wslScan, "wsl", false, "scan for / use WSL devices")
flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device")
flag.IntVar(&opts.temperatureLimit, "temp-limit", 100, "temperature limit at which device is marked unhealthy")
flag.StringVar(&opts.preferredAllocationPolicy, "allocation-policy", "none", "modes of allocating GPU devices: balanced, packed and none")
flag.Parse()
@ -651,6 +800,34 @@ func main() {
plugin := newDevicePlugin(prefix+sysfsDrmDirectory, prefix+devfsDriDirectory, opts)
if plugin.options.wslScan {
klog.Info("WSL mode requested")
if plugin.options.resourceManagement {
klog.Error("Resource management is not supported within WSL. Please disable resource management.")
os.Exit(1)
}
if plugin.options.enableMonitoring {
klog.Error("Monitoring is not supported within WSL. Please disable monitoring.")
os.Exit(1)
}
if plugin.options.healthManagement {
klog.Error("Health management is not supported within WSL. Please disable health management.")
os.Exit(1)
}
}
if plugin.options.healthManagement || plugin.options.wslScan {
plugin.levelzeroService = levelzeroservice.NewLevelzero(gpulevelzero.DefaultUnixSocketPath)
go plugin.levelzeroService.Run(true)
}
if plugin.options.resourceManagement {
// Start labeler to export labels file for NFD.
nfdFeatureFile := path.Join(nfdFeatureDir, resourceFilename)
@ -659,7 +836,10 @@ func main() {
// Labeler catches OS signals and calls os.Exit() after receiving any.
go labeler.Run(prefix+sysfsDrmDirectory, nfdFeatureFile,
labelerMaxInterval, plugin.scanResources)
labelerMaxInterval, plugin.scanResources, plugin.levelzeroService, func() {
// Exit the whole app when labeler exits
os.Exit(0)
})
}
manager := dpapi.NewManager(namespace, plugin)

View File

@ -27,6 +27,7 @@ import (
"k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
"k8s.io/utils/strings/slices"
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice"
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm"
dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
cdispec "tags.cncf.io/container-device-interface/specs-go"
@ -41,6 +42,7 @@ type mockNotifier struct {
scanDone chan bool
i915Count int
xeCount int
dxgCount int
i915monitorCount int
xeMonitorCount int
}
@ -50,6 +52,7 @@ func (n *mockNotifier) Notify(newDeviceTree dpapi.DeviceTree) {
n.xeCount = len(newDeviceTree[deviceTypeXe])
n.xeMonitorCount = len(newDeviceTree[deviceTypeXe+monitorSuffix])
n.i915Count = len(newDeviceTree[deviceTypeI915])
n.dxgCount = len(newDeviceTree[deviceTypeDxg])
n.i915monitorCount = len(newDeviceTree[deviceTypeDefault+monitorSuffix])
n.scanDone <- true
@ -72,18 +75,63 @@ func (m *mockResourceManager) SetTileCountPerCard(count uint64) {
m.tileCount = count
}
type mockL0Service struct {
indices []uint32
memSize uint64
healthy bool
fail bool
}
func (m *mockL0Service) Run(keep bool) {
}
func (m *mockL0Service) Stop() {
}
func (m *mockL0Service) GetIntelIndices() ([]uint32, error) {
if m.fail {
return m.indices, errors.Errorf("error, error")
}
return m.indices, nil
}
func (m *mockL0Service) GetDeviceHealth(bdfAddress string) (levelzeroservice.DeviceHealth, error) {
if m.fail {
return levelzeroservice.DeviceHealth{}, errors.Errorf("error, error")
}
return levelzeroservice.DeviceHealth{Memory: m.healthy, Bus: m.healthy, SoC: m.healthy}, nil
}
func (m *mockL0Service) GetDeviceTemperature(bdfAddress string) (levelzeroservice.DeviceTemperature, error) {
if m.fail {
return levelzeroservice.DeviceTemperature{}, errors.Errorf("error, error")
}
return levelzeroservice.DeviceTemperature{Global: 35.0, GPU: 35.0, Memory: 35.0}, nil
}
func (m *mockL0Service) GetDeviceMemoryAmount(bdfAddress string) (uint64, error) {
if m.fail {
return m.memSize, errors.Errorf("error, error")
}
return m.memSize, nil
}
type TestCaseDetails struct {
name string
// possible mock l0 service
l0mock levelzeroservice.LevelzeroService
// test-case environment
sysfsdirs []string
pciAddresses map[string]string
sysfsfiles map[string][]byte
symlinkfiles map[string]string
name string
sysfsdirs []string
devfsdirs []string
// how plugin should interpret it
options cliOptions
// what the result should be (i915)
expectedI915Devs int
expectedI915Monitors int
// what the result should be (dxg)
expectedDxgDevs int
// what the result should be (xe)
expectedXeDevs int
expectedXeMonitors int
@ -99,6 +147,33 @@ func createTestFiles(root string, tc TestCaseDetails) (string, string, error) {
}
}
if err := os.MkdirAll(sysfs, 0750); err != nil {
return "", "", errors.Wrap(err, "Failed to create fake base sysfs directory")
}
if len(tc.pciAddresses) > 0 {
if err := os.MkdirAll(filepath.Join(sysfs, ".devices"), 0750); err != nil {
return "", "", errors.Wrap(err, "Failed to create fake pci address base")
}
for pci, card := range tc.pciAddresses {
fullPci := filepath.Join(sysfs, ".devices", pci)
cardPath := filepath.Join(sysfs, card)
if err := os.MkdirAll(fullPci, 0750); err != nil {
return "", "", errors.Wrap(err, "Failed to create fake pci address entry")
}
if err := os.MkdirAll(cardPath, 0750); err != nil {
return "", "", errors.Wrap(err, "Failed to create fake card entry")
}
if err := os.Symlink(fullPci, filepath.Join(sysfs, card, "device")); err != nil {
return "", "", errors.Wrap(err, "Failed to create fake pci address symlinks")
}
}
}
for _, sysfsdir := range tc.sysfsdirs {
if err := os.MkdirAll(path.Join(sysfs, sysfsdir), 0750); err != nil {
return "", "", errors.Wrap(err, "Failed to create fake device directory")
@ -444,6 +519,176 @@ func TestScan(t *testing.T) {
}
}
func TestScanWithHealth(t *testing.T) {
tcases := []TestCaseDetails{
{
name: "one device with no symlink",
sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"),
},
devfsdirs: []string{
"card0",
"by-path/pci-0000:00:00.0-card",
"by-path/pci-0000:00:00.0-render",
},
expectedI915Devs: 1,
},
{
name: "one device with proper symlink",
pciAddresses: map[string]string{"0000:00:00.0": "card0"},
sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"),
},
devfsdirs: []string{
"card0",
"by-path/pci-0000:00:00.0-card",
"by-path/pci-0000:00:00.0-render",
},
expectedI915Devs: 1,
l0mock: &mockL0Service{
healthy: true,
},
},
{
name: "one unhealthy device with proper symlink",
pciAddresses: map[string]string{"0000:00:00.0": "card0"},
sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"),
},
devfsdirs: []string{
"card0",
"by-path/pci-0000:00:00.0-card",
"by-path/pci-0000:00:00.0-render",
},
expectedI915Devs: 1,
l0mock: &mockL0Service{
healthy: false,
},
},
{
name: "one device with proper symlink returns error",
pciAddresses: map[string]string{"0000:00:00.0": "card0"},
sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"),
},
devfsdirs: []string{
"card0",
"by-path/pci-0000:00:00.0-card",
"by-path/pci-0000:00:00.0-render",
},
expectedI915Devs: 1,
l0mock: &mockL0Service{
fail: true,
},
},
}
for _, tc := range tcases {
if tc.options.sharedDevNum == 0 {
tc.options.sharedDevNum = 1
}
t.Run(tc.name, func(t *testing.T) {
root, err := os.MkdirTemp("", "test_new_device_plugin")
if err != nil {
t.Fatalf("can't create temporary directory: %+v", err)
}
// dirs/files need to be removed for the next test
defer os.RemoveAll(root)
sysfs, devfs, err := createTestFiles(root, tc)
if err != nil {
t.Errorf("unexpected error: %+v", err)
}
plugin := newDevicePlugin(sysfs, devfs, tc.options)
plugin.levelzeroService = tc.l0mock
notifier := &mockNotifier{
scanDone: plugin.scanDone,
}
err = plugin.Scan(notifier)
// Scans in GPU plugin never fail
if err != nil {
t.Errorf("unexpected error: %+v", err)
}
if tc.expectedI915Devs != notifier.i915Count {
t.Errorf("Expected %d, discovered %d devices (i915)",
tc.expectedI915Devs, notifier.i915Count)
}
if tc.expectedI915Monitors != notifier.i915monitorCount {
t.Errorf("Expected %d, discovered %d monitors (i915)",
tc.expectedI915Monitors, notifier.i915monitorCount)
}
})
}
}
func TestScanWsl(t *testing.T) {
tcases := []TestCaseDetails{
{
name: "one wsl device",
expectedDxgDevs: 1,
l0mock: &mockL0Service{
indices: []uint32{0},
},
},
{
name: "four wsl device",
expectedDxgDevs: 4,
l0mock: &mockL0Service{
indices: []uint32{0, 1, 2, 3},
},
},
}
for _, tc := range tcases {
if tc.options.sharedDevNum == 0 {
tc.options.sharedDevNum = 1
}
t.Run(tc.name, func(t *testing.T) {
root, err := os.MkdirTemp("", "test_new_device_plugin")
if err != nil {
t.Fatalf("can't create temporary directory: %+v", err)
}
// dirs/files need to be removed for the next test
defer os.RemoveAll(root)
sysfs, devfs, err := createTestFiles(root, tc)
if err != nil {
t.Errorf("unexpected error: %+v", err)
}
plugin := newDevicePlugin(sysfs, devfs, tc.options)
plugin.options.wslScan = true
plugin.levelzeroService = tc.l0mock
notifier := &mockNotifier{
scanDone: plugin.scanDone,
}
err = plugin.Scan(notifier)
// Scans in GPU plugin never fail
if err != nil {
t.Errorf("unexpected error: %+v", err)
}
if tc.expectedDxgDevs != notifier.dxgCount {
t.Errorf("Expected %d, discovered %d devices (dxg)",
tc.expectedI915Devs, notifier.i915Count)
}
})
}
}
func TestScanFails(t *testing.T) {
tc := TestCaseDetails{
name: "xe and i915 devices with rm will fail",
@ -582,11 +827,11 @@ func createBypathTestFiles(t *testing.T, card, root, linkFile string, bypathFile
byPath := path.Join(root, "by-path")
if linkFile != "" {
if err := os.MkdirAll(filepath.Dir(devPath), os.ModePerm); err != nil {
if err := os.MkdirAll(filepath.Dir(devPath), 0700); err != nil {
t.Fatal("Couldn't create test dev dir", err)
}
if err := os.MkdirAll(filepath.Dir(drmPath), os.ModePerm); err != nil {
if err := os.MkdirAll(filepath.Dir(drmPath), 0700); err != nil {
t.Fatal("Couldn't create test drm dir", err)
}
@ -600,7 +845,7 @@ func createBypathTestFiles(t *testing.T, card, root, linkFile string, bypathFile
}
if len(bypathFiles) > 0 {
if err := os.MkdirAll(byPath, os.ModePerm); err != nil {
if err := os.MkdirAll(byPath, 0700); err != nil {
t.Fatal("Mkdir failed:", byPath)
}

View File

@ -0,0 +1,208 @@
// Copyright 2024 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package levelzeroservice
import (
"context"
lz "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero"
"google.golang.org/grpc"
"google.golang.org/grpc/connectivity"
"google.golang.org/grpc/credentials/insecure"
"k8s.io/klog/v2"
)
type LevelzeroService interface {
Run(bool)
GetIntelIndices() ([]uint32, error)
GetDeviceHealth(bdfAddress string) (DeviceHealth, error)
GetDeviceTemperature(bdfAddress string) (DeviceTemperature, error)
GetDeviceMemoryAmount(bdfAddress string) (uint64, error)
}
type DeviceHealth struct {
Memory bool
Bus bool
SoC bool
GlobalTemperature float64
GPUTemperature float64
MemoryTemperature float64
}
type DeviceTemperature struct {
Global float64
GPU float64
Memory float64
}
type clientNotReadyErr struct{}
func (e *clientNotReadyErr) Error() string {
return "client is not (yet) ready"
}
func NewLevelzero(socket string) LevelzeroService {
return &levelzero{
socketPath: socket,
ctx: context.Background(),
conn: nil,
client: nil,
}
}
type levelzero struct {
client lz.LevelzeroClient
ctx context.Context
conn *grpc.ClientConn
socketPath string
}
func (l *levelzero) Run(keep bool) {
url := "unix://" + l.socketPath
klog.V(3).Info("Starting Level-Zero client. Connecting to: ", url)
conn, err := grpc.NewClient(url, grpc.WithTransportCredentials(insecure.NewCredentials()))
if err != nil {
klog.Error("Failed to connect to socket", err)
return
}
ctx := context.Background()
l.conn = conn
for {
state := l.conn.GetState()
if state == connectivity.Idle {
conn.Connect()
}
if state == connectivity.Ready {
klog.V(2).Info("Connection ready")
l.client = lz.NewLevelzeroClient(conn)
if !keep {
break
}
}
if !conn.WaitForStateChange(ctx, state) {
continue
}
}
}
func (l *levelzero) isClientReady() bool {
return l.client != nil
}
func (l *levelzero) GetIntelIndices() ([]uint32, error) {
if !l.isClientReady() {
return []uint32{}, &clientNotReadyErr{}
}
cli := l.client
indices, err := cli.GetIntelIndices(l.ctx, &lz.GetIntelIndicesMessage{})
if err != nil || indices == nil {
return []uint32{}, err
}
if indices.Error != nil && indices.Error.Errorcode != 0 {
klog.Warningf("indices request returned internal error: 0x%X (%s)", indices.Error.Errorcode, indices.Error.Description)
}
return indices.Indices, nil
}
func (l *levelzero) GetDeviceHealth(bdfAddress string) (DeviceHealth, error) {
if !l.isClientReady() {
return DeviceHealth{}, &clientNotReadyErr{}
}
cli := l.client
did := lz.DeviceId{
BdfAddress: bdfAddress,
}
health, err := cli.GetDeviceHealth(l.ctx, &did)
if err != nil || health == nil {
return DeviceHealth{}, err
}
if health.Error != nil && health.Error.Errorcode != 0 {
klog.Warningf("health request returned internal error: 0x%X (%s)", health.Error.Errorcode, health.Error.Description)
}
return DeviceHealth{
Memory: health.MemoryOk,
Bus: health.BusOk,
SoC: health.SocOk,
}, nil
}
func (l *levelzero) GetDeviceTemperature(bdfAddress string) (DeviceTemperature, error) {
if !l.isClientReady() {
return DeviceTemperature{}, &clientNotReadyErr{}
}
cli := l.client
did := lz.DeviceId{
BdfAddress: bdfAddress,
}
temps, err := cli.GetDeviceTemperature(l.ctx, &did)
if err != nil || temps == nil {
return DeviceTemperature{}, err
}
if temps.Error != nil && temps.Error.Errorcode != 0 {
klog.Warningf("temperature request returned internal error: 0x%X (%s)", temps.Error.Errorcode, temps.Error.Description)
}
return DeviceTemperature{
Global: temps.Global,
GPU: temps.Gpu,
Memory: temps.Memory,
}, nil
}
func (l *levelzero) GetDeviceMemoryAmount(bdfAddress string) (uint64, error) {
if !l.isClientReady() {
return 0, &clientNotReadyErr{}
}
cli := l.client
did := lz.DeviceId{
BdfAddress: bdfAddress,
}
memSize, err := cli.GetDeviceMemoryAmount(l.ctx, &did)
if err != nil || memSize == nil {
return 0, err
}
if memSize.Error != nil && memSize.Error.Errorcode != 0 {
klog.Warningf("memory request returned internal error: 0x%X (%s)", memSize.Error.Errorcode, memSize.Error.Description)
}
return memSize.MemorySize, nil
}

View File

@ -0,0 +1,307 @@
// Copyright 2024 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package levelzeroservice
import (
"context"
"flag"
"log"
"net"
"os"
"path/filepath"
"testing"
lz "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero"
"google.golang.org/grpc"
)
const (
NoError = iota
InternalError = iota
ExternalError = iota
)
func init() {
_ = flag.Set("v", "4") //Enable debug output
}
type mockServer struct {
lz.UnimplementedLevelzeroServer
failRequest int
}
func (m *mockServer) serve(socketPath string) {
lis, err := net.Listen("unix", socketPath)
if err != nil {
log.Fatalf("failed to listen: %v", err)
}
s := grpc.NewServer()
lz.RegisterLevelzeroServer(s, m)
go func() {
if err := s.Serve(lis); err != nil {
log.Fatalf("failed to serve: %v", err)
}
}()
}
func (m *mockServer) GetDeviceHealth(c context.Context, deviceid *lz.DeviceId) (*lz.DeviceHealth, error) {
if m.failRequest == ExternalError {
return nil, os.ErrInvalid
}
health := &lz.DeviceHealth{
BusOk: true,
MemoryOk: true,
Error: nil,
}
if m.failRequest == InternalError {
health.MemoryOk = false
health.Error = &lz.Error{
Description: "error error",
Errorcode: 99,
}
}
return health, nil
}
func (m *mockServer) GetDeviceTemperature(c context.Context, deviceid *lz.DeviceId) (*lz.DeviceTemperature, error) {
if m.failRequest == ExternalError {
return nil, os.ErrInvalid
}
temps := &lz.DeviceTemperature{
Global: 35.0,
Gpu: 35.0,
Memory: 35.0,
Error: nil,
}
if m.failRequest == InternalError {
temps.Global = -999.0
temps.Gpu = -999.0
temps.Memory = -999.0
temps.Error = &lz.Error{
Description: "error error",
Errorcode: 99,
}
}
return temps, nil
}
func (m *mockServer) GetIntelIndices(c context.Context, msg *lz.GetIntelIndicesMessage) (*lz.DeviceIndices, error) {
if m.failRequest == ExternalError {
return nil, os.ErrInvalid
}
ret := lz.DeviceIndices{
Indices: []uint32{0},
Error: nil,
}
if m.failRequest == InternalError {
ret.Indices = []uint32{}
ret.Error = &lz.Error{
Description: "error error",
Errorcode: 99,
}
}
return &ret, nil
}
func (m *mockServer) GetDeviceMemoryAmount(c context.Context, deviceid *lz.DeviceId) (*lz.DeviceMemoryAmount, error) {
if m.failRequest == ExternalError {
return nil, os.ErrInvalid
}
ret := lz.DeviceMemoryAmount{
MemorySize: 1000,
Error: nil,
}
if m.failRequest == InternalError {
ret.MemorySize = 0
ret.Error = &lz.Error{
Description: "error error",
Errorcode: 99,
}
}
return &ret, nil
}
type testcase struct {
name string
fail int
}
var tcases = []testcase{
{
name: "normal flow",
fail: NoError,
},
{
name: "fail flow - internal",
fail: InternalError,
},
{
name: "fail flow - external",
fail: ExternalError,
},
}
func TestGetDeviceHealth(t *testing.T) {
for _, tc := range tcases {
t.Run(tc.name, func(t *testing.T) {
d, err := os.MkdirTemp("", "testinglevelzero*")
if err != nil {
t.Fatal("failed to create tmp directory")
}
defer os.RemoveAll(d)
sockPath := filepath.Join(d, "server.sock")
mock := mockServer{
failRequest: tc.fail,
}
mock.serve(sockPath)
n := NewLevelzero(sockPath)
n.Run(false)
dh, err := n.GetDeviceHealth("0000:00:00.1")
if tc.fail == NoError && err != nil {
t.Error("GetDeviceHealth returned an error:", err)
}
if tc.fail == NoError && (!dh.Memory || !dh.Bus) {
t.Error("Call to device health returned unhealthy", dh, tc.fail)
}
if tc.fail == ExternalError && err == nil {
t.Error("GetDeviceHealth returned nil and expected error")
}
})
}
}
func TestGetIndices(t *testing.T) {
for _, tc := range tcases {
t.Run(tc.name, func(t *testing.T) {
d, err := os.MkdirTemp("", "testinglevelzero*")
if err != nil {
t.Fatal("failed to create tmp directory")
}
defer os.RemoveAll(d)
sockPath := filepath.Join(d, "server.sock")
mock := mockServer{
failRequest: tc.fail,
}
mock.serve(sockPath)
n := NewLevelzero(sockPath)
n.Run(false)
indices, err := n.GetIntelIndices()
if tc.fail == NoError && err != nil {
t.Error("GetIntelIndices returned error:", err)
}
if tc.fail == ExternalError && err == nil {
t.Error("GetIntelIndices returned nil and expected error")
}
if tc.fail == NoError && len(indices) != 1 {
t.Error("Wrong number of indices received", indices)
}
if tc.fail != NoError && len(indices) != 0 {
t.Error("Wrong number of indices received", indices)
}
})
}
}
func TestGetMemoryAmount(t *testing.T) {
for _, tc := range tcases {
t.Run(tc.name, func(t *testing.T) {
d, err := os.MkdirTemp("", "testinglevelzero*")
if err != nil {
t.Fatal("failed to create tmp directory")
}
defer os.RemoveAll(d)
sockPath := filepath.Join(d, "server.sock")
mock := mockServer{
failRequest: tc.fail,
}
mock.serve(sockPath)
n := NewLevelzero(sockPath)
n.Run(false)
memSize, err := n.GetDeviceMemoryAmount("0000:11:22.3")
if tc.fail == NoError && err != nil {
t.Error("TestGetMemoryAmount returned error:", err)
}
if tc.fail == ExternalError && err == nil {
t.Error("TestGetMemoryAmount returned nil and expected error")
}
if tc.fail == NoError && memSize != 1000 {
t.Error("Wrong mem size received", memSize)
}
})
}
}
func TestAccessBeforeReady(t *testing.T) {
n := NewLevelzero("/tmp/foobar.sock")
_, err := n.GetDeviceMemoryAmount("")
if err == nil {
t.Error("Got non-error for memory amount, expected error")
}
_, err = n.GetDeviceHealth("")
if err == nil {
t.Error("Got non-error for health, expected error")
}
_, err = n.GetIntelIndices()
if err == nil {
t.Error("Got non-error for indices, expected error")
}
}

View File

@ -51,8 +51,8 @@ const (
gasCardAnnotation = "gas-container-cards"
gasTileAnnotation = "gas-container-tiles"
levelZeroAffinityMaskEnvVar = "ZE_AFFINITY_MASK"
levelZeroHierarchyEnvVar = "ZE_FLAT_DEVICE_HIERARCHY"
LevelzeroAffinityMaskEnvVar = "ZE_AFFINITY_MASK"
levelzeroHierarchyEnvVar = "ZE_FLAT_DEVICE_HIERARCHY"
hierarchyModeComposite = "COMPOSITE"
hierarchyModeFlat = "FLAT"
@ -796,7 +796,7 @@ func (rm *resourceManager) createAllocateResponse(deviceIds []string, tileAffini
cresp.Envs = make(map[string]string)
}
cresp.Envs[levelZeroAffinityMaskEnvVar] = tileAffinityMask
cresp.Envs[LevelzeroAffinityMaskEnvVar] = tileAffinityMask
}
allocateResponse.ContainerResponses = append(allocateResponse.ContainerResponses, &cresp)
@ -851,7 +851,7 @@ func containerCards(pod *v1.Pod, gpuUsingContainerIndex int) []string {
// Guesses level zero hierarchy mode for the container. Defaults to the new "flat" mode
// if no mode is set in the container's env variables.
func guessLevelZeroHierarchyMode(pod *v1.Pod, containerIndex int) string {
func guessLevelzeroHierarchyMode(pod *v1.Pod, containerIndex int) string {
klog.V(4).Infof("Checking pod %s envs", pod.Name)
if containerIndex < len(pod.Spec.Containers) {
@ -859,7 +859,7 @@ func guessLevelZeroHierarchyMode(pod *v1.Pod, containerIndex int) string {
if c.Env != nil {
for _, env := range c.Env {
if env.Name == levelZeroHierarchyEnvVar {
if env.Name == levelzeroHierarchyEnvVar {
switch env.Value {
// Check that the value is valid.
case hierarchyModeComposite:
@ -959,7 +959,7 @@ func containerTileAffinityMask(pod *v1.Pod, gpuUsingContainerIndex, tilesPerCard
}
if i == gpuUsingContainerIndex {
return convertTileInfoToEnvMask(containerTileInfo, tilesPerCard, guessLevelZeroHierarchyMode(pod, containerIndex))
return convertTileInfoToEnvMask(containerTileInfo, tilesPerCard, guessLevelzeroHierarchyMode(pod, containerIndex))
}
i++

View File

@ -472,7 +472,7 @@ func TestCreateFractionalResourceResponseWithOneCardTwoTiles(t *testing.T) {
},
Env: []v1.EnvVar{
{
Name: levelZeroHierarchyEnvVar,
Name: levelzeroHierarchyEnvVar,
Value: hierarchyModeComposite,
},
},
@ -522,8 +522,8 @@ func TestCreateFractionalResourceResponseWithOneCardTwoTiles(t *testing.T) {
// check response
expectTruef(len(resp.ContainerResponses) == 1, t, tCase.name, "wrong number of container responses, expected 1")
expectTruef(len(resp.ContainerResponses[0].Envs) == 2, t, tCase.name, "wrong number of env variables in container response, expected 2")
expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set")
expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] == "0.0,0.1", t, tCase.name, "l0 affinity mask is incorrect")
expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set")
expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] == "0.0,0.1", t, tCase.name, "l0 affinity mask is incorrect")
expectTruef(len(resp.ContainerResponses[0].Devices) == 1, t, tCase.name, "wrong number of devices, expected 1")
}
@ -545,7 +545,7 @@ func TestCreateFractionalResourceResponseWithTwoCardsOneTile(t *testing.T) {
},
Env: []v1.EnvVar{
{
Name: levelZeroHierarchyEnvVar,
Name: levelzeroHierarchyEnvVar,
Value: hierarchyModeComposite,
},
},
@ -598,8 +598,8 @@ func TestCreateFractionalResourceResponseWithTwoCardsOneTile(t *testing.T) {
} else {
// check response
expectTruef(len(resp.ContainerResponses) == 1, t, tCase.name, "wrong number of container responses, expected 1")
expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set")
expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] == "0.3,1.4", t, tCase.name, "l0 affinity mask is incorrect: ")
expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set")
expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] == "0.3,1.4", t, tCase.name, "l0 affinity mask is incorrect: ")
expectTruef(len(resp.ContainerResponses[0].Devices) == 2, t, tCase.name, "wrong number of devices, expected 2")
}
}
@ -623,7 +623,7 @@ func TestCreateFractionalResourceResponseWithThreeCardsTwoTiles(t *testing.T) {
},
Env: []v1.EnvVar{
{
Name: levelZeroHierarchyEnvVar,
Name: levelzeroHierarchyEnvVar,
Value: hierarchyModeComposite,
},
},
@ -676,8 +676,8 @@ func TestCreateFractionalResourceResponseWithThreeCardsTwoTiles(t *testing.T) {
} else {
// check response
expectTruef(len(resp.ContainerResponses) == 1, t, tCase.name, "wrong number of container responses, expected 1")
expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set")
expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] == "0.0,0.1,1.2,1.3,2.3,2.4", t, tCase.name, "l0 affinity mask is incorrect: ")
expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set")
expectTruef(resp.ContainerResponses[0].Envs[LevelzeroAffinityMaskEnvVar] == "0.0,0.1,1.2,1.3,2.3,2.4", t, tCase.name, "l0 affinity mask is incorrect: ")
expectTruef(len(resp.ContainerResponses[0].Devices) == 3, t, tCase.name, "wrong number of devices, expected 3")
}
}
@ -701,7 +701,7 @@ func TestCreateFractionalResourceResponseWithMultipleContainersTileEach(t *testi
},
Env: []v1.EnvVar{
{
Name: levelZeroHierarchyEnvVar,
Name: levelzeroHierarchyEnvVar,
Value: hierarchyModeComposite,
},
},
@ -714,7 +714,7 @@ func TestCreateFractionalResourceResponseWithMultipleContainersTileEach(t *testi
},
Env: []v1.EnvVar{
{
Name: levelZeroHierarchyEnvVar,
Name: levelzeroHierarchyEnvVar,
Value: hierarchyModeComposite,
},
},
@ -945,7 +945,7 @@ func TestTileAnnotationParsing(t *testing.T) {
if i < len(pt.hierarchys) {
pod.Spec.Containers[i].Env = []v1.EnvVar{
{
Name: levelZeroHierarchyEnvVar,
Name: levelzeroHierarchyEnvVar,
Value: pt.hierarchys[i],
},
}

View File

@ -29,6 +29,7 @@ import (
"syscall"
"time"
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice"
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils"
"github.com/pkg/errors"
"k8s.io/klog/v2"
@ -60,6 +61,8 @@ type labeler struct {
controlDeviceReg *regexp.Regexp
labels labelMap
levelzero levelzeroservice.LevelzeroService
sysfsDRMDir string
labelsChanged bool
}
@ -163,7 +166,7 @@ func fallback() uint64 {
return getEnvVarNumber(memoryOverrideEnv)
}
func GetMemoryAmount(sysfsDrmDir, gpuName string, numTiles uint64) uint64 {
func legacyFallback(sysfsDrmDir, gpuName string, numTiles uint64) uint64 {
reserved := getEnvVarNumber(memoryReservedEnv)
filePath := filepath.Join(sysfsDrmDir, gpuName, "lmem_total_bytes")
@ -183,6 +186,26 @@ func GetMemoryAmount(sysfsDrmDir, gpuName string, numTiles uint64) uint64 {
return totalPerTile*numTiles - reserved
}
func (l *labeler) GetMemoryAmount(sysfsDrmDir, gpuName string, numTiles uint64) uint64 {
link, err := os.Readlink(filepath.Join(sysfsDrmDir, gpuName, "device"))
if err != nil {
return legacyFallback(sysfsDrmDir, gpuName, numTiles)
}
amount := uint64(0)
if l.levelzero != nil {
amount, err = l.levelzero.GetDeviceMemoryAmount(filepath.Base(link))
if amount == 0 || err != nil {
return legacyFallback(sysfsDrmDir, gpuName, numTiles)
}
} else {
return legacyFallback(sysfsDrmDir, gpuName, numTiles)
}
return amount
}
// GetTileCount reads the tile count.
func GetTileCount(cardPath string) (numTiles uint64) {
files := []string{}
@ -317,7 +340,7 @@ func (l *labeler) createLabels() error {
numTiles := GetTileCount(filepath.Join(l.sysfsDRMDir, gpuName))
tileCount += int(numTiles)
memoryAmount := GetMemoryAmount(l.sysfsDRMDir, gpuName, numTiles)
memoryAmount := l.GetMemoryAmount(l.sysfsDRMDir, gpuName, numTiles)
gpuNumList = append(gpuNumList, gpuName[4:])
// get numa node of the GPU
@ -446,9 +469,11 @@ func CreateAndPrintLabels(sysfsDRMDir string) {
// Gathers node's GPU labels on channel trigger or timeout, and write them to a file.
// The created label file is deleted on exit (process dying).
func Run(sysfsDrmDir, nfdFeatureFile string, updateInterval time.Duration, scanResources chan bool) {
func Run(sysfsDrmDir, nfdFeatureFile string, updateInterval time.Duration, scanResources chan bool, levelzero levelzeroservice.LevelzeroService, exitFunc func()) {
l := newLabeler(sysfsDrmDir)
l.levelzero = levelzero
interruptChan := make(chan os.Signal, 1)
signal.Notify(interruptChan, syscall.SIGTERM, syscall.SIGINT, syscall.SIGHUP, syscall.SIGQUIT)
@ -499,6 +524,6 @@ Loop:
klog.V(1).Info("Stopping GPU labeler")
// Close the whole application
os.Exit(0)
// Call exitFunc that might exit the app
exitFunc()
}

View File

@ -17,10 +17,15 @@ package labeler
import (
"os"
"path"
"path/filepath"
"reflect"
"strconv"
"syscall"
"testing"
"time"
"github.com/fsnotify/fsnotify"
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice"
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils"
)
@ -28,6 +33,30 @@ const (
sysfsDirectory = "/sys/"
)
type mockL0Service struct {
memSize uint64
fail bool
}
func (m *mockL0Service) Run(bool) {
}
func (m *mockL0Service) GetIntelIndices() ([]uint32, error) {
return nil, nil
}
func (m *mockL0Service) GetDeviceHealth(bdfAddress string) (levelzeroservice.DeviceHealth, error) {
return levelzeroservice.DeviceHealth{}, nil
}
func (m *mockL0Service) GetDeviceTemperature(bdfAddress string) (levelzeroservice.DeviceTemperature, error) {
return levelzeroservice.DeviceTemperature{}, nil
}
func (m *mockL0Service) GetDeviceMemoryAmount(bdfAddress string) (uint64, error) {
if m.fail {
return m.memSize, os.ErrInvalid
}
return m.memSize, nil
}
type testcase struct {
capabilityFile map[string][]byte
expectedRetval error
@ -579,7 +608,7 @@ func getTestCases() []testcase {
}
}
func (tc *testcase) createFiles(t *testing.T, sysfs, root string) {
func (tc *testcase) createFiles(t *testing.T, sysfs string) {
for _, sysfsdir := range tc.sysfsdirs {
if err := os.MkdirAll(path.Join(sysfs, sysfsdir), 0750); err != nil {
t.Fatalf("Failed to create fake sysfs directory: %+v", err)
@ -645,7 +674,7 @@ func TestLabeling(t *testing.T) {
}
sysfs := path.Join(subroot, "pci0000:00/0000:00:1b.4", sysfsDirectory)
tc.createFiles(t, sysfs, subroot)
tc.createFiles(t, sysfs)
os.Setenv(memoryOverrideEnv, strconv.FormatUint(tc.memoryOverride, 10))
os.Setenv(memoryReservedEnv, strconv.FormatUint(tc.memoryReserved, 10))
@ -663,3 +692,176 @@ func TestLabeling(t *testing.T) {
})
}
}
func TestCreateAndRun(t *testing.T) {
root, err := os.MkdirTemp("", "test_new_device_plugin")
if err != nil {
t.Fatalf("can't create temporary directory: %+v", err)
}
defer os.RemoveAll(root)
tc := getTestCases()[0]
subroot, err := os.MkdirTemp(root, "tc")
if err != nil {
t.Fatalf("can't create temporary subroot directory: %+v", err)
}
t.Run("CreateAndPrintLabels", func(t *testing.T) {
err := os.MkdirAll(path.Join(subroot, "0"), 0750)
if err != nil {
t.Fatalf("couldn't create dir: %s", err.Error())
}
sysfs := path.Join(subroot, "pci0000:00/0000:00:1b.4", sysfsDirectory)
tc.createFiles(t, sysfs)
CreateAndPrintLabels(sysfs)
})
waitForFileOp := func(directory, file string, eventType fsnotify.Op, duration time.Duration) bool {
watcher, err := fsnotify.NewWatcher()
if err != nil {
t.Fatal(err)
}
defer watcher.Close()
if err := watcher.Add(directory); err != nil {
t.Fatal(err)
}
timer := time.NewTimer(duration)
for {
select {
case event := <-watcher.Events:
if filepath.Base(event.Name) == file && event.Has(eventType) {
return true
}
case <-timer.C:
return false
}
}
}
t.Run("Run", func(t *testing.T) {
err := os.MkdirAll(path.Join(subroot, "0"), 0750)
if err != nil {
t.Fatalf("couldn't create dir: %s", err.Error())
}
sysfs := path.Join(subroot, "pci0000:00/0000:00:1b.4", sysfsDirectory)
tc.createFiles(t, sysfs)
c := make(chan bool, 1)
nfdLabelBase := "nfd-labelfile.txt"
nfdLabelFile := filepath.Join(root, nfdLabelBase)
go Run(sysfs, nfdLabelFile, time.Millisecond, c, nil, func() {})
// Wait for the labeling timeout to trigger
if !waitForFileOp(root, nfdLabelBase, fsnotify.Create, time.Second*2) {
t.Error("Run didn't create label file")
}
err = syscall.Kill(syscall.Getpid(), syscall.SIGHUP)
if err != nil {
t.Error("Calling Kill failed")
}
// Wait for the labeling timeout to trigger
if !waitForFileOp(root, nfdLabelBase, fsnotify.Remove, time.Second*2) {
t.Error("Run didn't remove label file")
}
})
}
func TestL0ServiceUse(t *testing.T) {
root, err := os.MkdirTemp("", "test_new_device_plugin")
if err != nil {
t.Fatalf("can't create temporary directory: %+v", err)
}
defer os.RemoveAll(root)
pciAddr := path.Join(root, "sys", ".devices", "0000:00:01.0")
cardPath := path.Join(root, "sys", "card0")
err = os.MkdirAll(pciAddr, 0750)
if err != nil {
t.Fatalf("couldn't create pci dir: %s", err.Error())
}
err = os.MkdirAll(cardPath, 0750)
if err != nil {
t.Fatalf("couldn't create card dir: %s", err.Error())
}
err = os.Symlink(pciAddr, filepath.Join(cardPath, "device"))
if err != nil {
t.Fatalf("couldn't create symlink: %s", err.Error())
}
err = os.WriteFile(filepath.Join(root, "sys/card0/device/vendor"), []byte("0x8086"), 0600)
if err != nil {
t.Fatalf("couldn't write vendor file: %s", err.Error())
}
err = os.MkdirAll(filepath.Join(root, "sys/card0/device/drm"), 0600)
if err != nil {
t.Fatalf("couldn't create card drm dir: %s", err.Error())
}
t.Run("fetch memory from l0 service", func(t *testing.T) {
labeler := newLabeler(filepath.Join(root, "sys"))
labeler.levelzero = &mockL0Service{
memSize: 12345678,
}
err = labeler.createLabels()
if err != nil {
t.Errorf("labeler didn't work with l0 service")
}
if labeler.labels["gpu.intel.com/memory.max"] != "12345678" {
t.Errorf("labeler didn't get memory amount from l0 service: %v", labeler.labels)
}
})
t.Run("memory fetch from l0 fails", func(t *testing.T) {
labeler := newLabeler(filepath.Join(root, "sys"))
labeler.levelzero = &mockL0Service{
memSize: 0,
fail: true,
}
os.Setenv(memoryOverrideEnv, "87654321")
err = labeler.createLabels()
if err != nil {
t.Errorf("labeler didn't work with l0 service")
}
if labeler.labels["gpu.intel.com/memory.max"] != "87654321" {
t.Errorf("labeler got an invalid memory amount: %v", labeler.labels)
}
})
t.Run("memory fetch with nil l0 service", func(t *testing.T) {
labeler := newLabeler(filepath.Join(root, "sys"))
labeler.levelzero = nil
os.Setenv(memoryOverrideEnv, "87654321")
err = labeler.createLabels()
if err != nil {
t.Errorf("labeler didn't work with l0 service")
}
if labeler.labels["gpu.intel.com/memory.max"] != "87654321" {
t.Errorf("labeler got an invalid memory amount: %v", labeler.labels)
}
})
}

View File

@ -0,0 +1,4 @@
- op: add
path: /spec/template/spec/containers/0/args
value:
- "-health-management"

View File

@ -0,0 +1,6 @@
resources:
- ../levelzero
patches:
- path: args.yaml
target:
kind: DaemonSet

View File

@ -0,0 +1,9 @@
resources:
- ../../base
patches:
- path: l0-mounts.yaml
target:
kind: DaemonSet
- path: levelzero.yaml
target:
kind: DaemonSet

View File

@ -0,0 +1,10 @@
- op: add
path: /spec/template/spec/containers/0/volumeMounts/-
value:
name: levelzerosocket
mountPath: /var/lib/levelzero
- op: add
path: /spec/template/spec/volumes/-
value:
name: levelzerosocket
emptyDir: {}

View File

@ -0,0 +1,17 @@
- op: add
path: /spec/template/spec/containers/-
value:
name: intel-gpu-levelzero
image: intel/intel-gpu-levelzero:devel
imagePullPolicy: IfNotPresent
args:
- "-v=2"
securityContext:
readOnlyRootFilesystem: true
privileged: true
capabilities:
drop:
- ALL
volumeMounts:
- name: levelzerosocket
mountPath: /var/lib/levelzero

View File

@ -0,0 +1,9 @@
resources:
- ../levelzero
patches:
- path: wsl_mounts.yaml
target:
kind: DaemonSet
- path: wsl_args.yaml
target:
kind: DaemonSet

View File

@ -0,0 +1,8 @@
- op: add
path: /spec/template/spec/containers/0/args
value:
- "-wsl"
- op: add
path: /spec/template/spec/containers/1/args/-
value:
"-wsl"

View File

@ -0,0 +1,24 @@
- op: add
path: /spec/template/spec/containers/1/volumeMounts/-
value:
name: wsllib
mountPath: /usr/lib/wsl
- op: add
path: /spec/template/spec/containers/0/volumeMounts/-
value:
name: devdxg
mountPath: /dev/dxg
- op: add
path: /spec/template/spec/volumes/-
value:
name: wsllib
hostPath:
path: /usr/lib/wsl
type: DirectoryOrCreate
- op: add
path: /spec/template/spec/volumes/-
value:
name: devdxg
hostPath:
path: /dev/dxg
type: CharDevice