intel-device-plugins-for-ku.../cmd/gpu_plugin/gpu_plugin_test.go
Tuomas Katila 518a8606ff gpu: add levelzero sidecar support for plugin and the deployment files
In addition to using Level Zero's health data, this adds support for
scanning devices in WSL. Scanning works by retrieving the Intel device
indices from the Level Zero API.

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
2024-09-19 19:14:15 +03:00

1212 lines
35 KiB
Go

// Copyright 2017-2023 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"flag"
"os"
"path"
"path/filepath"
"reflect"
"sort"
"testing"
"github.com/pkg/errors"
"k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
"k8s.io/utils/strings/slices"
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/levelzeroservice"
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm"
dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
cdispec "tags.cncf.io/container-device-interface/specs-go"
)
// init sets the "v" flag to 4 for the whole test binary so that debug output
// is enabled. The result of flag.Set is discarded on purpose.
func init() {
	const debugLevel = "4"

	_ = flag.Set("v", debugLevel) // Enable debug output
}
// mockNotifier implements Notifier interface.
//
// It remembers how many devices of each resource type were present in the
// last device tree passed to Notify and reports completion on scanDone.
type mockNotifier struct {
	scanDone chan bool

	i915Count, xeCount, dxgCount     int
	i915monitorCount, xeMonitorCount int
}

// Notify stops plugin Scan.
//
// It stores the per-type device counts found in newDeviceTree and then sends
// on scanDone; the tests in this file wire scanDone to the plugin's own
// scanDone channel.
func (n *mockNotifier) Notify(newDeviceTree dpapi.DeviceTree) {
	countOf := func(resource string) int {
		return len(newDeviceTree[resource])
	}

	n.i915Count = countOf(deviceTypeI915)
	n.i915monitorCount = countOf(deviceTypeDefault + monitorSuffix)
	n.xeCount = countOf(deviceTypeXe)
	n.xeMonitorCount = countOf(deviceTypeXe + monitorSuffix)
	n.dxgCount = countOf(deviceTypeDxg)

	n.scanDone <- true
}
// mockResourceManager is a stand-in resource manager for the tests. It only
// records the tile count it is given; both allocation methods return an empty
// response together with a UseDefaultMethodError.
type mockResourceManager struct {
	// tileCount is the last value passed to SetTileCountPerCard.
	tileCount uint64
}

// CreateFractionalResourceResponse ignores the request and returns an empty
// response with a UseDefaultMethodError.
func (m *mockResourceManager) CreateFractionalResourceResponse(_ *v1beta1.AllocateRequest) (*v1beta1.AllocateResponse, error) {
	resp := new(v1beta1.AllocateResponse)

	return resp, &dpapi.UseDefaultMethodError{}
}

// SetDevInfos is a no-op in the mock.
func (m *mockResourceManager) SetDevInfos(_ rm.DeviceInfoMap) {
}

// GetPreferredFractionalAllocation ignores the request and returns an empty
// response with a UseDefaultMethodError.
func (m *mockResourceManager) GetPreferredFractionalAllocation(_ *v1beta1.PreferredAllocationRequest) (*v1beta1.PreferredAllocationResponse, error) {
	resp := new(v1beta1.PreferredAllocationResponse)

	return resp, &dpapi.UseDefaultMethodError{}
}

// SetTileCountPerCard stores count so that tests can inspect it afterwards.
func (m *mockResourceManager) SetTileCountPerCard(count uint64) {
	m.tileCount = count
}
// mockL0Service is a canned Level-Zero service used by the scan tests.
type mockL0Service struct {
	// indices is returned by GetIntelIndices.
	indices []uint32
	// memSize is returned by GetDeviceMemoryAmount.
	memSize uint64
	// healthy is reported for every health attribute by GetDeviceHealth.
	healthy bool
	// fail makes every query method return an error.
	fail bool
}

// Run does nothing in the mock.
func (m *mockL0Service) Run(keep bool) {
}

// Stop does nothing in the mock.
func (m *mockL0Service) Stop() {
}

// GetIntelIndices returns the configured indices, together with an error when
// fail is set.
func (m *mockL0Service) GetIntelIndices() ([]uint32, error) {
	var err error
	if m.fail {
		err = errors.Errorf("error, error")
	}

	return m.indices, err
}

// GetDeviceHealth reports the configured healthy flag for memory, bus and SoC,
// or a zero value and an error when fail is set.
func (m *mockL0Service) GetDeviceHealth(bdfAddress string) (levelzeroservice.DeviceHealth, error) {
	if !m.fail {
		health := levelzeroservice.DeviceHealth{Memory: m.healthy, Bus: m.healthy, SoC: m.healthy}

		return health, nil
	}

	return levelzeroservice.DeviceHealth{}, errors.Errorf("error, error")
}

// GetDeviceTemperature reports a fixed 35.0 for every sensor, or a zero value
// and an error when fail is set.
func (m *mockL0Service) GetDeviceTemperature(bdfAddress string) (levelzeroservice.DeviceTemperature, error) {
	if !m.fail {
		temps := levelzeroservice.DeviceTemperature{Global: 35.0, GPU: 35.0, Memory: 35.0}

		return temps, nil
	}

	return levelzeroservice.DeviceTemperature{}, errors.Errorf("error, error")
}

// GetDeviceMemoryAmount returns the configured memSize, together with an error
// when fail is set.
func (m *mockL0Service) GetDeviceMemoryAmount(bdfAddress string) (uint64, error) {
	var err error
	if m.fail {
		err = errors.Errorf("error, error")
	}

	return m.memSize, err
}
// TestCaseDetails describes one scan scenario: the fake sysfs/devfs layout to
// create, the plugin options to use and the device counts the scan is expected
// to report.
type TestCaseDetails struct {
	// possible mock l0 service
	l0mock levelzeroservice.LevelzeroService

	// test-case environment

	// pciAddresses maps a PCI address (created under "<sysfs>/.devices") to a
	// card directory whose "device" entry is symlinked to it.
	pciAddresses map[string]string
	// sysfsfiles maps a file path relative to sysfs to its content.
	sysfsfiles map[string][]byte
	// symlinkfiles maps a symlink path to its target directory, both relative
	// to sysfs.
	symlinkfiles map[string]string
	name         string
	// sysfsdirs and devfsdirs are directories created under sysfs and devfs.
	sysfsdirs, devfsdirs []string

	// how plugin should interpret it
	options cliOptions

	// what the result should be (i915)
	expectedI915Devs, expectedI915Monitors int

	// what the result should be (dxg)
	expectedDxgDevs int

	// what the result should be (xe)
	expectedXeDevs, expectedXeMonitors int
}
// createTestFiles builds the fake sysfs ("<root>/sys") and devfs ("<root>/dev")
// trees described by tc and returns their paths.
//
// Creation order: devfs directories, the sysfs base, PCI address entries with
// their card "device" symlinks, sysfs directories, sysfs files and finally the
// symlinks listed in tc.symlinkfiles. The first failure is returned wrapped
// with a message naming the step; both paths are empty in that case.
func createTestFiles(root string, tc TestCaseDetails) (string, string, error) {
	sysfs := path.Join(root, "sys")
	devfs := path.Join(root, "dev")

	// mkdir creates dir (and parents) and wraps any failure with failure.
	mkdir := func(dir, failure string) error {
		if err := os.MkdirAll(dir, 0750); err != nil {
			return errors.Wrap(err, failure)
		}

		return nil
	}

	for _, dir := range tc.devfsdirs {
		if err := mkdir(path.Join(devfs, dir), "Failed to create fake device directory"); err != nil {
			return "", "", err
		}
	}

	if err := mkdir(sysfs, "Failed to create fake base sysfs directory"); err != nil {
		return "", "", err
	}

	if len(tc.pciAddresses) > 0 {
		if err := mkdir(filepath.Join(sysfs, ".devices"), "Failed to create fake pci address base"); err != nil {
			return "", "", err
		}

		for pciAddr, cardName := range tc.pciAddresses {
			pciDir := filepath.Join(sysfs, ".devices", pciAddr)
			cardDir := filepath.Join(sysfs, cardName)

			if err := mkdir(pciDir, "Failed to create fake pci address entry"); err != nil {
				return "", "", err
			}

			if err := mkdir(cardDir, "Failed to create fake card entry"); err != nil {
				return "", "", err
			}

			// <card>/device points at the PCI address directory.
			if err := os.Symlink(pciDir, filepath.Join(cardDir, "device")); err != nil {
				return "", "", errors.Wrap(err, "Failed to create fake pci address symlinks")
			}
		}
	}

	for _, dir := range tc.sysfsdirs {
		if err := mkdir(path.Join(sysfs, dir), "Failed to create fake device directory"); err != nil {
			return "", "", err
		}
	}

	for name, content := range tc.sysfsfiles {
		if err := os.WriteFile(path.Join(sysfs, name), content, 0600); err != nil {
			return "", "", errors.Wrap(err, "Failed to create fake vendor file")
		}
	}

	for link, target := range tc.symlinkfiles {
		targetDir := path.Join(sysfs, target)

		if err := mkdir(targetDir, "Failed to create fake driver file."); err != nil {
			return "", "", err
		}

		if err := os.Symlink(targetDir, path.Join(sysfs, link)); err != nil {
			return "", "", errors.Wrap(err, "Failed to create fake driver symlink file.")
		}
	}

	return sysfs, devfs, nil
}
func TestNewDevicePlugin(t *testing.T) {
if newDevicePlugin("", "", cliOptions{sharedDevNum: 2, resourceManagement: false}) == nil {
t.Error("Failed to create plugin")
}
if newDevicePlugin("", "", cliOptions{sharedDevNum: 2, resourceManagement: true}) != nil {
t.Error("Unexpectedly managed to create resource management enabled plugin inside unit tests")
}
}
// TestGetPreferredAllocation exercises the "none", "balanced" and "packed"
// preferred-allocation policies, plus a request asking for more devices than
// are available and one with too few separate cards.
func TestGetPreferredAllocation(t *testing.T) {
	// request wraps a single container request into a full allocation request.
	request := func(cr *v1beta1.ContainerPreferredAllocationRequest) *v1beta1.PreferredAllocationRequest {
		return &v1beta1.PreferredAllocationRequest{
			ContainerRequests: []*v1beta1.ContainerPreferredAllocationRequest{cr},
		}
	}

	rqt := request(&v1beta1.ContainerPreferredAllocationRequest{
		AvailableDeviceIDs: []string{"card0-4", "card0-2", "card1-1", "card2-3", "card2-4", "card2-1", "card1-0", "card1-4", "card3-4", "card1-2", "card0-1", "card2-0", "card2-2", "card1-3", "card3-0", "card3-3", "card0-3", "card0-0", "card3-1", "card3-2"},
		AllocationSize:     4,
	})
	rqtNotEnough := request(&v1beta1.ContainerPreferredAllocationRequest{
		AvailableDeviceIDs: []string{"card0-1", "card0-2", "card0-3", "card1-1"},
		AllocationSize:     3,
	})
	rqtErr := request(&v1beta1.ContainerPreferredAllocationRequest{
		AvailableDeviceIDs: []string{"card0-4", "card1-1", "card2-3", "card2-4", "card2-1"},
		AllocationSize:     6,
	})

	// preferred builds a fresh plugin with the given policy and returns its
	// response for req; the returned error is ignored.
	preferred := func(policy string, req *v1beta1.PreferredAllocationRequest) *v1beta1.PreferredAllocationResponse {
		plugin := newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: policy})
		resp, _ := plugin.GetPreferredAllocation(req)

		return resp
	}

	ids := preferred("none", rqt).ContainerResponses[0].DeviceIDs
	sort.Strings(ids)

	if want := []string{"card0-4", "card1-1", "card2-3", "card3-4"}; !reflect.DeepEqual(ids, want) {
		t.Error("Unexpected return value for none preferred allocation", ids)
	}

	ids = preferred("balanced", rqt).ContainerResponses[0].DeviceIDs
	if want := []string{"card0-0", "card1-0", "card2-0", "card3-0"}; !reflect.DeepEqual(ids, want) {
		t.Error("Unexpected return value for balanced preferred allocation", ids)
	}

	ids = preferred("packed", rqt).ContainerResponses[0].DeviceIDs
	if want := []string{"card0-0", "card0-1", "card0-2", "card0-3"}; !reflect.DeepEqual(ids, want) {
		t.Error("Unexpected return value for packed preferred allocation", ids)
	}

	if resp := preferred("none", rqtErr); resp != nil {
		t.Error("Fail to handle the input error that req.AllocationSize is greater than len(req.AvailableDeviceIDs).")
	}

	ids = preferred("none", rqtNotEnough).ContainerResponses[0].DeviceIDs
	sort.Strings(ids)

	if want := []string{"card0-1", "card0-2", "card1-1"}; !reflect.DeepEqual(ids, want) {
		t.Error("Unexpected return value for none preferred allocation with too few separate devices",
			ids)
	}
}
// TestAllocate checks that Allocate answers with a UseDefaultMethodError both
// with the plugin's initial resource manager and with the mock one installed.
func TestAllocate(t *testing.T) {
	plugin := newDevicePlugin("", "", cliOptions{sharedDevNum: 2, resourceManagement: false})

	// expectDefaultMethod sends an empty allocate request and verifies the
	// concrete type of the returned error.
	expectDefaultMethod := func() {
		_, err := plugin.Allocate(&v1beta1.AllocateRequest{})
		if _, isDefault := err.(*dpapi.UseDefaultMethodError); !isDefault {
			t.Errorf("Unexpected return value: %+v", err)
		}
	}

	expectDefaultMethod()

	// mock the rm
	plugin.resMan = &mockResourceManager{}

	expectDefaultMethod()
}
// TestScan runs plugin.Scan against a series of fake sysfs/devfs layouts and
// compares the i915 and xe device and monitor counts delivered to the notifier
// with what each scenario expects.
func TestScan(t *testing.T) {
	// Vendor file contents shared by the scenarios below.
	intelVendor := []byte("0x8086")
	otherVendor := []byte("0xbeef")

	scenarios := []TestCaseDetails{
		{
			name: "no sysfs mounted",
		},
		{
			name:      "no device installed",
			sysfsdirs: []string{"card0"},
		},
		{
			name:      "missing dev node",
			sysfsdirs: []string{"card0/device"},
			sysfsfiles: map[string][]byte{
				"card0/device/vendor": intelVendor,
			},
		},
		{
			name:      "one device",
			sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
			sysfsfiles: map[string][]byte{
				"card0/device/vendor": intelVendor,
			},
			devfsdirs: []string{
				"card0",
				"by-path/pci-0000:00:00.0-card",
				"by-path/pci-0000:00:00.0-render",
			},
			expectedI915Devs: 1,
		},
		{
			name:      "one device with xe driver",
			sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
			sysfsfiles: map[string][]byte{
				"card0/device/vendor": intelVendor,
			},
			symlinkfiles: map[string]string{
				"card0/device/driver": "drivers/xe",
			},
			devfsdirs: []string{
				"card0",
				"by-path/pci-0000:00:00.0-card",
				"by-path/pci-0000:00:00.0-render",
			},
			expectedXeDevs: 1,
		},
		{
			name:      "two devices with xe driver and monitoring",
			sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64", "card1/device/drm/card1"},
			sysfsfiles: map[string][]byte{
				"card0/device/vendor": intelVendor,
				"card1/device/vendor": intelVendor,
			},
			symlinkfiles: map[string]string{
				"card0/device/driver": "drivers/xe",
				"card1/device/driver": "drivers/xe",
			},
			devfsdirs: []string{
				"card0",
				"by-path/pci-0000:00:00.0-card",
				"by-path/pci-0000:00:00.0-render",
				"card1",
				"by-path/pci-0000:00:01.0-card",
				"by-path/pci-0000:00:01.0-render",
			},
			options:            cliOptions{enableMonitoring: true},
			expectedXeDevs:     2,
			expectedXeMonitors: 1,
		},
		{
			name:      "two devices with xe and i915 drivers",
			sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64", "card1/device/drm/card1"},
			sysfsfiles: map[string][]byte{
				"card0/device/vendor": intelVendor,
				"card1/device/vendor": intelVendor,
			},
			symlinkfiles: map[string]string{
				"card0/device/driver": "drivers/xe",
				"card1/device/driver": "drivers/i915",
			},
			devfsdirs: []string{
				"card0",
				"by-path/pci-0000:00:00.0-card",
				"by-path/pci-0000:00:00.0-render",
				"card1",
				"by-path/pci-0000:00:01.0-card",
				"by-path/pci-0000:00:01.0-render",
			},
			options:              cliOptions{enableMonitoring: true},
			expectedXeDevs:       1,
			expectedXeMonitors:   1,
			expectedI915Devs:     1,
			expectedI915Monitors: 1,
		},
		{
			name:      "sriov-1-pf-no-vfs + monitoring",
			sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
			sysfsfiles: map[string][]byte{
				"card0/device/vendor":       intelVendor,
				"card0/device/sriov_numvfs": []byte("0"),
			},
			devfsdirs:            []string{"card0"},
			options:              cliOptions{enableMonitoring: true},
			expectedI915Devs:     1,
			expectedI915Monitors: 1,
		},
		{
			name: "two sysfs records but one dev node",
			sysfsdirs: []string{
				"card0/device/drm/card0",
				"card1/device/drm/card1",
			},
			sysfsfiles: map[string][]byte{
				"card0/device/vendor": intelVendor,
				"card1/device/vendor": intelVendor,
			},
			devfsdirs:        []string{"card0"},
			expectedI915Devs: 1,
		},
		{
			name: "sriov-1-pf-and-2-vfs",
			sysfsdirs: []string{
				"card0/device/drm/card0",
				"card1/device/drm/card1",
				"card2/device/drm/card2",
			},
			sysfsfiles: map[string][]byte{
				"card0/device/vendor":       intelVendor,
				"card0/device/sriov_numvfs": []byte("2"),
				"card1/device/vendor":       intelVendor,
				"card2/device/vendor":       intelVendor,
			},
			devfsdirs:        []string{"card0", "card1", "card2"},
			expectedI915Devs: 2,
		},
		{
			name: "two devices with 13 shares + monitoring",
			sysfsdirs: []string{
				"card0/device/drm/card0",
				"card1/device/drm/card1",
			},
			sysfsfiles: map[string][]byte{
				"card0/device/vendor": intelVendor,
				"card1/device/vendor": intelVendor,
			},
			devfsdirs:            []string{"card0", "card1"},
			options:              cliOptions{sharedDevNum: 13, enableMonitoring: true},
			expectedI915Devs:     26,
			expectedI915Monitors: 1,
		},
		{
			name:      "wrong vendor",
			sysfsdirs: []string{"card0/device/drm/card0"},
			sysfsfiles: map[string][]byte{
				"card0/device/vendor": otherVendor,
			},
			devfsdirs: []string{"card0"},
		},
		{
			name:      "wrong vendor with 13 shares + monitoring",
			sysfsdirs: []string{"card0/device/drm/card0"},
			sysfsfiles: map[string][]byte{
				"card0/device/vendor": otherVendor,
			},
			devfsdirs: []string{"card0"},
			options:   cliOptions{sharedDevNum: 13, enableMonitoring: true},
		},
		{
			name:      "no sysfs records",
			sysfsdirs: []string{"non_gpu_card"},
		},
	}

	for _, scenario := range scenarios {
		// Scenarios that leave the share count unset run with one share.
		if scenario.options.sharedDevNum == 0 {
			scenario.options.sharedDevNum = 1
		}

		t.Run(scenario.name, func(t *testing.T) {
			tmpRoot, err := os.MkdirTemp("", "test_new_device_plugin")
			if err != nil {
				t.Fatalf("Can't create temporary directory: %+v", err)
			}
			// dirs/files need to be removed for the next test
			defer os.RemoveAll(tmpRoot)

			sysfs, devfs, err := createTestFiles(tmpRoot, scenario)
			if err != nil {
				t.Errorf("Unexpected error: %+v", err)
			}

			plugin := newDevicePlugin(sysfs, devfs, scenario.options)
			notifier := &mockNotifier{scanDone: plugin.scanDone}

			// Scans in GPU plugin never fail
			if err = plugin.Scan(notifier); err != nil {
				t.Errorf("Unexpected error: %+v", err)
			}

			checks := []struct {
				format    string
				want, got int
			}{
				{"Expected %d, discovered %d devices (i915)", scenario.expectedI915Devs, notifier.i915Count},
				{"Expected %d, discovered %d monitors (i915)", scenario.expectedI915Monitors, notifier.i915monitorCount},
				{"Expected %d, discovered %d devices (XE)", scenario.expectedXeDevs, notifier.xeCount},
				{"Expected %d, discovered %d monitors (XE)", scenario.expectedXeMonitors, notifier.xeMonitorCount},
			}
			for _, check := range checks {
				if check.want != check.got {
					t.Errorf(check.format, check.want, check.got)
				}
			}
		})
	}
}
// TestScanWithHealth scans a single i915 device with different Level-Zero
// service mocks (none, healthy, unhealthy, failing) and checks the device and
// monitor counts delivered to the notifier.
func TestScanWithHealth(t *testing.T) {
	intelVendor := []byte("0x8086")

	scenarios := []TestCaseDetails{
		{
			name:      "one device with no symlink",
			sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
			sysfsfiles: map[string][]byte{
				"card0/device/vendor": intelVendor,
			},
			devfsdirs: []string{
				"card0",
				"by-path/pci-0000:00:00.0-card",
				"by-path/pci-0000:00:00.0-render",
			},
			expectedI915Devs: 1,
		},
		{
			name:         "one device with proper symlink",
			pciAddresses: map[string]string{"0000:00:00.0": "card0"},
			sysfsdirs:    []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
			sysfsfiles: map[string][]byte{
				"card0/device/vendor": intelVendor,
			},
			devfsdirs: []string{
				"card0",
				"by-path/pci-0000:00:00.0-card",
				"by-path/pci-0000:00:00.0-render",
			},
			expectedI915Devs: 1,
			l0mock: &mockL0Service{
				healthy: true,
			},
		},
		{
			name:         "one unhealthy device with proper symlink",
			pciAddresses: map[string]string{"0000:00:00.0": "card0"},
			sysfsdirs:    []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
			sysfsfiles: map[string][]byte{
				"card0/device/vendor": intelVendor,
			},
			devfsdirs: []string{
				"card0",
				"by-path/pci-0000:00:00.0-card",
				"by-path/pci-0000:00:00.0-render",
			},
			expectedI915Devs: 1,
			l0mock: &mockL0Service{
				healthy: false,
			},
		},
		{
			name:         "one device with proper symlink returns error",
			pciAddresses: map[string]string{"0000:00:00.0": "card0"},
			sysfsdirs:    []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
			sysfsfiles: map[string][]byte{
				"card0/device/vendor": intelVendor,
			},
			devfsdirs: []string{
				"card0",
				"by-path/pci-0000:00:00.0-card",
				"by-path/pci-0000:00:00.0-render",
			},
			expectedI915Devs: 1,
			l0mock: &mockL0Service{
				fail: true,
			},
		},
	}

	for _, scenario := range scenarios {
		// Scenarios that leave the share count unset run with one share.
		if scenario.options.sharedDevNum == 0 {
			scenario.options.sharedDevNum = 1
		}

		t.Run(scenario.name, func(t *testing.T) {
			tmpRoot, err := os.MkdirTemp("", "test_new_device_plugin")
			if err != nil {
				t.Fatalf("can't create temporary directory: %+v", err)
			}
			// dirs/files need to be removed for the next test
			defer os.RemoveAll(tmpRoot)

			sysfs, devfs, err := createTestFiles(tmpRoot, scenario)
			if err != nil {
				t.Errorf("unexpected error: %+v", err)
			}

			plugin := newDevicePlugin(sysfs, devfs, scenario.options)
			plugin.levelzeroService = scenario.l0mock

			notifier := &mockNotifier{scanDone: plugin.scanDone}

			// Scans in GPU plugin never fail
			if err = plugin.Scan(notifier); err != nil {
				t.Errorf("unexpected error: %+v", err)
			}

			checks := []struct {
				format    string
				want, got int
			}{
				{"Expected %d, discovered %d devices (i915)", scenario.expectedI915Devs, notifier.i915Count},
				{"Expected %d, discovered %d monitors (i915)", scenario.expectedI915Monitors, notifier.i915monitorCount},
			}
			for _, check := range checks {
				if check.want != check.got {
					t.Errorf(check.format, check.want, check.got)
				}
			}
		})
	}
}
// TestScanWsl runs the scan with options.wslScan enabled and a mocked
// Level-Zero service, and checks that the number of dxg devices delivered to
// the notifier matches the expectation of each test case.
func TestScanWsl(t *testing.T) {
	tcases := []TestCaseDetails{
		{
			name:            "one wsl device",
			expectedDxgDevs: 1,
			l0mock: &mockL0Service{
				indices: []uint32{0},
			},
		},
		{
			name:            "four wsl device",
			expectedDxgDevs: 4,
			l0mock: &mockL0Service{
				indices: []uint32{0, 1, 2, 3},
			},
		},
	}

	for _, tc := range tcases {
		if tc.options.sharedDevNum == 0 {
			tc.options.sharedDevNum = 1
		}

		t.Run(tc.name, func(t *testing.T) {
			// t.TempDir gives every subtest its own directory and removes it
			// when the subtest finishes.
			root := t.TempDir()

			sysfs, devfs, err := createTestFiles(root, tc)
			if err != nil {
				// Without fixtures the scan would run against empty paths, so
				// stop here instead of producing misleading follow-up errors.
				t.Fatalf("unexpected error: %+v", err)
			}

			plugin := newDevicePlugin(sysfs, devfs, tc.options)
			plugin.options.wslScan = true
			plugin.levelzeroService = tc.l0mock

			notifier := &mockNotifier{
				scanDone: plugin.scanDone,
			}

			err = plugin.Scan(notifier)
			// Scans in GPU plugin never fail
			if err != nil {
				t.Errorf("unexpected error: %+v", err)
			}

			// Report the dxg values that are actually being compared.
			if tc.expectedDxgDevs != notifier.dxgCount {
				t.Errorf("Expected %d, discovered %d devices (dxg)",
					tc.expectedDxgDevs, notifier.dxgCount)
			}
		})
	}
}
// TestScanFails checks that Scan returns an error when an xe and an i915
// device are present while a resource manager is installed.
func TestScanFails(t *testing.T) {
	mixedDrivers := TestCaseDetails{
		name:      "xe and i915 devices with rm will fail",
		sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64", "card1/device/drm/card1"},
		sysfsfiles: map[string][]byte{
			"card0/device/vendor": []byte("0x8086"),
			"card1/device/vendor": []byte("0x8086"),
		},
		symlinkfiles: map[string]string{
			"card0/device/driver": "drivers/xe",
			"card1/device/driver": "drivers/i915",
		},
		devfsdirs: []string{
			"card0",
			"card1",
		},
	}

	t.Run(mixedDrivers.name, func(t *testing.T) {
		tmpRoot, err := os.MkdirTemp("", "test_new_device_plugin")
		if err != nil {
			t.Fatalf("Can't create temporary directory: %+v", err)
		}
		// dirs/files need to be removed for the next test
		defer os.RemoveAll(tmpRoot)

		sysfs, devfs, err := createTestFiles(tmpRoot, mixedDrivers)
		if err != nil {
			t.Errorf("Unexpected error: %+v", err)
		}

		plugin := newDevicePlugin(sysfs, devfs, mixedDrivers.options)
		plugin.resMan = &mockResourceManager{}

		notifier := &mockNotifier{scanDone: plugin.scanDone}

		if scanErr := plugin.Scan(notifier); scanErr == nil {
			t.Error("Unexpected nil error")
		}
	})
}
// TestScanWithRmAndTiles scans xe devices with a mock resource manager
// installed and checks the tile count handed to it: 2 when both cards have two
// tiles, 0 when the cards have different tile counts.
func TestScanWithRmAndTiles(t *testing.T) {
	scenarios := []struct {
		details   TestCaseDetails
		wantTiles uint64
	}{
		{
			details: TestCaseDetails{
				name: "two tile xe devices with rm enabled - homogeneous",
				sysfsdirs: []string{
					"card0/device/drm/card0",
					"card1/device/drm/card1",
					"card0/device/tile0/gt0",
					"card0/device/tile1/gt1",
					"card1/device/tile0/gt0",
					"card1/device/tile1/gt1",
				},
				sysfsfiles: map[string][]byte{
					"card0/device/vendor": []byte("0x8086"),
					"card1/device/vendor": []byte("0x8086"),
				},
				symlinkfiles: map[string]string{
					"card0/device/driver": "drivers/xe",
					"card1/device/driver": "drivers/xe",
				},
				devfsdirs: []string{
					"card0",
					"card1",
				},
			},
			wantTiles: 2,
		},
		{
			details: TestCaseDetails{
				name: "2 & 1 tile xe devices with rm enabled - heterogeneous",
				sysfsdirs: []string{
					"card0/device/drm/card0",
					"card1/device/drm/card1",
					"card0/device/tile0/gt0",
					"card0/device/tile1/gt1",
					"card1/device/tile0/gt0",
				},
				sysfsfiles: map[string][]byte{
					"card0/device/vendor": []byte("0x8086"),
					"card1/device/vendor": []byte("0x8086"),
				},
				symlinkfiles: map[string]string{
					"card0/device/driver": "drivers/xe",
					"card1/device/driver": "drivers/xe",
				},
				devfsdirs: []string{
					"card0",
					"card1",
				},
			},
			wantTiles: 0,
		},
	}

	for _, scenario := range scenarios {
		t.Run(scenario.details.name, func(t *testing.T) {
			tmpRoot, err := os.MkdirTemp("", "test_new_device_plugin")
			if err != nil {
				t.Fatalf("Can't create temporary directory: %+v", err)
			}
			// dirs/files need to be removed for the next test
			defer os.RemoveAll(tmpRoot)

			sysfs, devfs, err := createTestFiles(tmpRoot, scenario.details)
			if err != nil {
				t.Errorf("Unexpected error: %+v", err)
			}

			plugin := newDevicePlugin(sysfs, devfs, scenario.details.options)

			resMan := &mockResourceManager{}
			plugin.resMan = resMan

			notifier := &mockNotifier{scanDone: plugin.scanDone}

			if scanErr := plugin.Scan(notifier); scanErr != nil {
				t.Error("Unexpected error")
			}

			if resMan.tileCount != scenario.wantTiles {
				t.Error("Unexpected tilecount for RM")
			}
		})
	}
}
// Would be nice to combine these with the overall Scan unit tests.

// createBypathTestFiles prepares the files needed by the by-path tests below
// root and returns the drm path ("<root>/sys/class/drm/<card>") and the
// by-path directory ("<root>/by-path").
//
// When linkFile is non-empty, a one-byte file is written to
// "<root>/sys/<linkFile>" and the drm path is created as a symlink to it.
// When bypathFiles is non-empty, the by-path directory is created and filled
// with one-byte files of those names. Any failure aborts the test via t.Fatal.
func createBypathTestFiles(t *testing.T, card, root, linkFile string, bypathFiles []string) (string, string) {
	var (
		drmPath = path.Join(root, "sys/class/drm/", card)
		devPath = path.Join(root, "sys", linkFile)
		byPath  = path.Join(root, "by-path")
	)

	if linkFile != "" {
		parents := []struct {
			dir     string
			failure string
		}{
			{filepath.Dir(devPath), "Couldn't create test dev dir"},
			{filepath.Dir(drmPath), "Couldn't create test drm dir"},
		}
		for _, parent := range parents {
			if err := os.MkdirAll(parent.dir, 0o700); err != nil {
				t.Fatal(parent.failure, err)
			}
		}

		if err := os.WriteFile(devPath, []byte{0}, 0o600); err != nil {
			t.Fatal("Couldn't create card file", err)
		}

		if err := os.Symlink(devPath, drmPath); err != nil {
			t.Fatal("Couldn't create symlink between pci path and sysfs drm path")
		}
	}

	if len(bypathFiles) == 0 {
		return drmPath, byPath
	}

	if err := os.MkdirAll(byPath, 0o700); err != nil {
		t.Fatal("Mkdir failed:", byPath)
	}

	for _, name := range bypathFiles {
		target := path.Join(byPath, name)
		if err := os.WriteFile(target, []byte{1}, 0o600); err != nil {
			t.Fatal("WriteFile failed:", target)
		}
	}

	return drmPath, byPath
}
func TestBypath(t *testing.T) {
type testData struct {
desc string
linkpath string
bypathFiles []string
pciAddrOk bool
mountCount int
}
const cardName string = "card0"
tds := []testData{
{
"card with two by-path files",
"00.10.2/00.334.302/0.0.1.00/0000:0f:05.0/drm/" + cardName,
[]string{"pci-0000:0f:05.0-card", "pci-0000:0f:05.0-render"},
true,
2,
},
{
"different by-path files",
"00.10.2/00.334.302/0.0.1.00/0000:ff:05.0/drm/" + cardName,
[]string{"pci-0000:0f:05.0-card", "pci-0000:0f:05.0-render"},
true,
0,
},
{
"invalid pci address",
"00.10.2/00.334.302/0.0.1.00/000:ff:05.1/drm/" + cardName,
[]string{"pci-0000:0f:05.0-card", "pci-0000:0f:05.0-render"},
false,
0,
},
{
"symlink without card",
"00.10.2/00.334.302/0.0.1.00/0000:0f:05.0/drm",
[]string{"pci-0000:0f:05.0-card", "pci-0000:0f:05.0-render"},
false,
0,
},
{
"no symlink",
"",
[]string{"pci-0000:0f:05.0-card", "pci-0000:0f:05.0-render"},
false,
0,
},
{
"no by-path files",
"00.10.2/00.334.302/0.0.1.00/0000:0f:05.0/drm/" + cardName,
[]string{},
true,
0,
},
}
for _, td := range tds {
root, err := os.MkdirTemp("", "test_bypath_mounting")
if err != nil {
t.Fatalf("Can't create temporary directory: %+v", err)
}
// dirs/files need to be removed for the next test
defer os.RemoveAll(root)
plugin := newDevicePlugin("/", "/", cliOptions{})
drmPath, byPath := createBypathTestFiles(t, cardName, root, td.linkpath, td.bypathFiles)
pciAddr, pciErr := plugin.pciAddressForCard(drmPath, cardName)
if pciErr != nil && td.pciAddrOk {
t.Errorf("%s: failed to retrieve pci address when it should have", td.desc)
}
if pciErr != nil {
continue
}
mounts := plugin.bypathMountsForPci(pciAddr, byPath)
if len(mounts) != td.mountCount {
t.Errorf("%s: Wrong number of mounts %d vs. %d", td.desc, len(mounts), td.mountCount)
}
absPaths := []string{}
for _, link := range td.bypathFiles {
absPaths = append(absPaths, path.Join(byPath, link))
}
for _, mount := range mounts {
if !slices.Contains(absPaths, mount.ContainerPath) {
t.Errorf("%s: containerpath is incorrect: %s", td.desc, mount.ContainerPath)
}
if !slices.Contains(absPaths, mount.HostPath) {
t.Errorf("%s: hostpath is incorrect: %s", td.desc, mount.HostPath)
}
}
}
}
// TestPciDeviceForCard checks that pciDeviceIDForCard returns the content of
// "<card>/device/device" and reports an error when that file is missing.
func TestPciDeviceForCard(t *testing.T) {
	root, err := os.MkdirTemp("", "test_pci_device_for_card")
	if err != nil {
		t.Fatalf("Can't create temporary directory: %+v", err)
	}
	// dirs/files need to be removed for the next test
	defer os.RemoveAll(root)

	const deviceID = "0x5959"

	drmDir := filepath.Join(path.Join(root, "sys"), "class", "drm")

	// fakeCard creates "<drmDir>/<card>/device/<file>" holding deviceID and
	// returns the card directory.
	fakeCard := func(card, file string) string {
		cardDir := filepath.Join(drmDir, card)
		deviceDir := filepath.Join(cardDir, "device")

		if mkErr := os.MkdirAll(deviceDir, 0750); mkErr != nil {
			t.Fatalf("Card device path creation failed: %+v", mkErr)
		}

		if wrErr := os.WriteFile(filepath.Join(deviceDir, file), []byte(deviceID), 0o600); wrErr != nil {
			t.Fatalf("Device id write failed: %+v", wrErr)
		}

		return cardDir
	}

	got, err := pciDeviceIDForCard(fakeCard("card0", "device"))
	if err != nil {
		t.Errorf("Failed to get device id for card: %+v", err)
	}

	if got != deviceID {
		t.Errorf("Wrong id received %s vs %s", got, deviceID)
	}

	// Check bad device
	if badID, badErr := pciDeviceIDForCard(fakeCard("card1", "devicebad")); badErr == nil {
		t.Errorf("ID received when it shouldn't be possible: %s", badID)
	}
}
// symlinkItem describes one symlink to create: new is the link path and old is
// the path it points to, both relative to a base directory.
type symlinkItem struct {
	old, new string
}
// createSymlinks creates every link in links below base. A link target that
// does not exist yet is created as a directory, and the parent directory of
// the link itself is created as needed. Any failure aborts the test.
func createSymlinks(t *testing.T, base string, links []symlinkItem) {
	for _, item := range links {
		target := filepath.Join(base, item.old)
		linkPath := filepath.Join(base, item.new)

		// Only create the target when it cannot be stat'ed, so that existing
		// files and directories are left untouched.
		if _, statErr := os.Stat(target); statErr != nil {
			mkErr := os.MkdirAll(target, 0o750)
			if mkErr != nil && !errors.Is(mkErr, os.ErrExist) {
				t.Fatalf("Failed to create symlink base dir: %+v", mkErr)
			}
		}

		if err := os.MkdirAll(filepath.Dir(linkPath), 0o750); err != nil {
			t.Fatal("Failed to create symlink new dir", err)
		}

		if err := os.Symlink(target, linkPath); err != nil {
			t.Fatal("Failed to create symlink from old to new", err)
		}
	}
}
// createFiles writes every entry of files below base, creating parent
// directories as needed. Keys are paths relative to base and values are the
// file contents. Any failure aborts the test.
func createFiles(t *testing.T, base string, files map[string][]byte) {
	for name, data := range files {
		target := filepath.Join(base, name)

		if err := os.MkdirAll(filepath.Dir(target), 0o750); err != nil {
			t.Fatal("Failed to create dev directories", err)
		}

		if err := os.WriteFile(target, data, 0o600); err != nil {
			t.Fatal("Failed to create dev file", err)
		}
	}
}
// createDirs creates every directory in dirs (and its parents) below base.
// Any failure aborts the test.
func createDirs(t *testing.T, base string, dirs []string) {
	for i := range dirs {
		target := filepath.Join(base, dirs[i])

		err := os.MkdirAll(target, 0o750)
		if err != nil {
			t.Fatal("Failed to create sysfs directories", err)
		}
	}
}
func TestCDIDeviceInclusion(t *testing.T) {
root, err := os.MkdirTemp("", "test_cdidevice")
if err != nil {
t.Fatalf("Can't create temporary directory: %+v", err)
}
// dirs/files need to be removed for the next test
defer os.RemoveAll(root)
sysfs := path.Join(root, "sys")
devfs := path.Join(root, "dev")
sysfslinks := []symlinkItem{
{"/0042:01:02.0", "/class/drm/card0"},
{"/0042:01:05.0", "/class/drm/card1"},
{"driver/i915", "/class/drm/card0/device/driver"},
{"driver/xe", "/class/drm/card1/device/driver"},
}
devfslinks := []symlinkItem{
{"/dri/card0", "/dri/by-path/pci-0042:01:02.0-card"},
{"/dri/renderD128", "/dri/by-path/pci-0042:01:02.0-render"},
{"/dri/card1", "/dri/by-path/pci-0042:01:05.0-card"},
{"/dri/renderD129", "/dri/by-path/pci-0042:01:05.0-render"},
}
sysfsDirs := []string{
"class/drm/card0/device/drm/card0",
"class/drm/card0/device/drm/renderD128",
"class/drm/card1/device/drm/card1",
"class/drm/card1/device/drm/renderD129",
}
sysfsFiles := map[string][]byte{
"class/drm/card0/device/device": []byte("0x9a49"),
"class/drm/card0/device/vendor": []byte("0x8086"),
"class/drm/card1/device/device": []byte("0x9a48"),
"class/drm/card1/device/vendor": []byte("0x8086"),
}
devfsfiles := map[string][]byte{
"/dri/card0": []byte("1"),
"/dri/renderD128": []byte("1"),
"/dri/card1": []byte("1"),
"/dri/renderD129": []byte("1"),
}
createSymlinks(t, sysfs, sysfslinks)
createFiles(t, devfs, devfsfiles)
createFiles(t, sysfs, sysfsFiles)
createDirs(t, sysfs, sysfsDirs)
createSymlinks(t, devfs, devfslinks)
plugin := newDevicePlugin(sysfs+"/class/drm", devfs+"/dri", cliOptions{sharedDevNum: 1})
plugin.bypathFound = true
tree, err := plugin.scan()
if err != nil {
t.Error("Failed to get device id for card")
}
refTree := dpapi.NewDeviceTree()
refTree.AddDevice("i915", "card0-0", dpapi.NewDeviceInfo("Healthy", []v1beta1.DeviceSpec{
{ContainerPath: devfs + "/dri/card0", HostPath: devfs + "/dri/card0", Permissions: "rw"},
{ContainerPath: devfs + "/dri/renderD128", HostPath: devfs + "/dri/renderD128", Permissions: "rw"},
}, []v1beta1.Mount{
{ContainerPath: devfs + "/dri/by-path/pci-0042:01:02.0-card", HostPath: devfs + "/dri/by-path/pci-0042:01:02.0-card", ReadOnly: true},
{ContainerPath: devfs + "/dri/by-path/pci-0042:01:02.0-render", HostPath: devfs + "/dri/by-path/pci-0042:01:02.0-render", ReadOnly: true},
}, nil, nil, &cdispec.Spec{
Version: dpapi.CDIVersion,
Kind: dpapi.CDIVendor + "/gpu",
Devices: []cdispec.Device{
{
Name: "card0",
ContainerEdits: cdispec.ContainerEdits{
DeviceNodes: []*cdispec.DeviceNode{
{Path: devfs + "/dri/card0", HostPath: devfs + "/dri/card0", Permissions: "rw"},
{Path: devfs + "/dri/renderD128", HostPath: devfs + "/dri/renderD128", Permissions: "rw"},
},
Mounts: []*cdispec.Mount{
{
HostPath: devfs + "/dri/by-path/pci-0042:01:02.0-card",
ContainerPath: devfs + "/dri/by-path/pci-0042:01:02.0-card",
Options: []string{"bind", "ro"},
Type: "none",
},
{
HostPath: devfs + "/dri/by-path/pci-0042:01:02.0-render",
ContainerPath: devfs + "/dri/by-path/pci-0042:01:02.0-render",
Options: []string{"bind", "ro"},
Type: "none",
},
},
},
},
},
}))
refTree.AddDevice("xe", "card1-0", dpapi.NewDeviceInfo("Healthy", []v1beta1.DeviceSpec{
{ContainerPath: devfs + "/dri/card1", HostPath: devfs + "/dri/card1", Permissions: "rw"},
{ContainerPath: devfs + "/dri/renderD129", HostPath: devfs + "/dri/renderD129", Permissions: "rw"},
}, []v1beta1.Mount{
{ContainerPath: devfs + "/dri/by-path/pci-0042:01:05.0-card", HostPath: devfs + "/dri/by-path/pci-0042:01:05.0-card", ReadOnly: true},
{ContainerPath: devfs + "/dri/by-path/pci-0042:01:05.0-render", HostPath: devfs + "/dri/by-path/pci-0042:01:05.0-render", ReadOnly: true},
}, nil, nil, &cdispec.Spec{
Version: dpapi.CDIVersion,
Kind: dpapi.CDIVendor + "/gpu",
Devices: []cdispec.Device{
{
Name: "card1",
ContainerEdits: cdispec.ContainerEdits{
DeviceNodes: []*cdispec.DeviceNode{
{Path: devfs + "/dri/card1", HostPath: devfs + "/dri/card1", Permissions: "rw"},
{Path: devfs + "/dri/renderD129", HostPath: devfs + "/dri/renderD129", Permissions: "rw"},
},
Mounts: []*cdispec.Mount{
{
HostPath: devfs + "/dri/by-path/pci-0042:01:05.0-card",
ContainerPath: devfs + "/dri/by-path/pci-0042:01:05.0-card",
Options: []string{"bind", "ro"},
Type: "none",
},
{
HostPath: devfs + "/dri/by-path/pci-0042:01:05.0-render",
ContainerPath: devfs + "/dri/by-path/pci-0042:01:05.0-render",
Options: []string{"bind", "ro"},
Type: "none",
},
},
},
},
},
}))
if !reflect.DeepEqual(tree, refTree) {
t.Error("Received device tree isn't expected\n", tree, "\n", refTree)
}
if tree.DeviceTypeCount("i915") != 1 {
t.Error("Invalid count for device (i915)")
}
if tree.DeviceTypeCount("xe") != 1 {
t.Error("Invalid count for device (xe)")
}
}