gpu: add support for CDI devices

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
This commit is contained in:
Tuomas Katila 2024-08-20 14:02:30 +03:00
parent 13e00f0302
commit 402fb8d9cd
5 changed files with 377 additions and 23 deletions

View File

@ -16,6 +16,7 @@ Table of Contents
* [Running GPU plugin as non-root](#running-gpu-plugin-as-non-root)
* [Labels created by GPU plugin](#labels-created-by-gpu-plugin)
* [SR-IOV use with the plugin](#sr-iov-use-with-the-plugin)
* [CDI support](#cdi-support)
* [KMD and UMD](#kmd-and-umd)
* [Issues with media workloads on multi-GPU setups](#issues-with-media-workloads-on-multi-gpu-setups)
* [Workaround for QSV and VA-API](#workaround-for-qsv-and-va-api)
@ -218,6 +219,19 @@ GPU plugin does __not__ setup SR-IOV. It has to be configured by the cluster adm
GPU plugin does, however, support provisioning Virtual Functions (VFs) to containers for an SR-IOV enabled GPU. When the plugin detects a GPU with SR-IOV VFs configured, it will only provision the VFs and leave the PF device on the host.
### CDI support
GPU plugin supports [CDI](https://github.com/container-orchestrated-devices/container-device-interface) to provide device details to the container. It does not yet provide any benefits compared to the traditional Kubernetes Device Plugin API. The CDI device specs will improve in the future with features that are not possible with the Device Plugin API.
To enable CDI support, the container runtime has to support it. The support varies depending on the version:
* CRI-O supports CDI by default from v1.24.0 onwards.
* Containerd supports CDI from 1.7.0 onwards. The 2.0.0 release will enable it by default.
* Docker supports CDI from v25 onwards.
Kubernetes has included CDI support since the 1.28 release. In 1.28, it needs to be enabled via the `DevicePluginCDIDevices` feature gate. From 1.29 onwards, the feature is enabled by default.
> *NOTE*: To use CDI outside of Kubernetes, for example with Docker or Podman, CDI specs can be generated with the [Intel CDI specs generator](https://github.com/intel/intel-resource-drivers-for-kubernetes/releases/tag/specs-generator-v0.1.0).
### KMD and UMD
There are 3 different Kernel Mode Drivers (KMD) available: `i915 upstream`, `i915 backport` and `xe`:

View File

@ -34,6 +34,7 @@ import (
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm"
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler"
dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
cdispec "tags.cncf.io/container-device-interface/specs-go"
)
const (
@ -202,13 +203,10 @@ func packedPolicy(req *pluginapi.ContainerPreferredAllocationRequest) []string {
return deviceIds
}
// Returns a slice of by-path Mounts for a cardPath&Name.
// by-path files are searched from the given bypathDir.
// In the by-path dir, any files that start with "pci-<pci addr>" will be added to mounts.
func (dp *devicePlugin) bypathMountsForPci(cardPath, cardName, bypathDir string) []pluginapi.Mount {
func (dp *devicePlugin) pciAddressForCard(cardPath, cardName string) (string, error) {
linkPath, err := os.Readlink(cardPath)
if err != nil {
return nil
return "", err
}
// Fetches the pci address for a drm card by reading the
@ -220,9 +218,27 @@ func (dp *devicePlugin) bypathMountsForPci(cardPath, cardName, bypathDir string)
if !dp.pciAddressReg.MatchString(pciAddress) {
klog.Warningf("Invalid pci address for %s: %s", cardPath, pciAddress)
return nil
return "", os.ErrInvalid
}
return pciAddress, nil
}
// pciDeviceIDForCard reads the "device/device" file below cardPath and
// returns its first line. When the file cannot be read, the read error is
// returned unchanged together with an empty string.
func pciDeviceIDForCard(cardPath string) (string, error) {
	raw, readErr := os.ReadFile(filepath.Join(cardPath, "device", "device"))
	if readErr != nil {
		return "", readErr
	}

	// Only the text before the first newline is kept.
	firstLine, _, _ := strings.Cut(string(raw), "\n")

	return firstLine, nil
}
// Returns a slice of by-path Mounts for a pciAddress.
// by-path files are searched from the given bypathDir.
// In the by-path dir, any files that start with "pci-<pci addr>" will be added to mounts.
func (dp *devicePlugin) bypathMountsForPci(pciAddress, bypathDir string) []pluginapi.Mount {
files, err := os.ReadDir(bypathDir)
if err != nil {
klog.Warningf("Failed to read by-path directory: %+v", err)
@ -481,6 +497,45 @@ func (dp *devicePlugin) createDeviceSpecsFromDrmFiles(cardPath string) []plugina
return specs
}
// createMountsAndCDIDevices builds the mounts and the CDI spec for one card.
//
// The mounts come from bypathMountsForPci when dp.bypathFound is set and the
// PCI address of the card can be resolved; otherwise an empty slice is used.
// The returned spec contains a single CDI device named after the card. Every
// entry of devSpecs is mirrored as a CDI device node and every mount as a
// read-only bind mount.
func (dp *devicePlugin) createMountsAndCDIDevices(cardPath, name string, devSpecs []pluginapi.DeviceSpec) ([]pluginapi.Mount, *cdispec.Spec) {
	mounts := []pluginapi.Mount{}

	if dp.bypathFound {
		pciAddr, pciErr := dp.pciAddressForCard(cardPath, name)
		if pciErr == nil {
			mounts = dp.bypathMountsForPci(pciAddr, dp.bypathDir)
		}
	}

	device := cdispec.Device{Name: name}

	for i := range devSpecs {
		node := &cdispec.DeviceNode{
			HostPath:    devSpecs[i].HostPath,
			Path:        devSpecs[i].ContainerPath,
			Permissions: devSpecs[i].Permissions,
		}
		device.ContainerEdits.DeviceNodes = append(device.ContainerEdits.DeviceNodes, node)
	}

	for i := range mounts {
		bindMount := &cdispec.Mount{
			HostPath:      mounts[i].HostPath,
			ContainerPath: mounts[i].ContainerPath,
			Type:          "none",
			Options:       []string{"bind", "ro"},
		}
		device.ContainerEdits.Mounts = append(device.ContainerEdits.Mounts, bindMount)
	}

	return mounts, &cdispec.Spec{
		Version: dpapi.CDIVersion,
		Kind:    dpapi.CDIVendor + "/gpu",
		Devices: []cdispec.Device{device},
	}
}
func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
files, err := os.ReadDir(dp.sysfsDir)
if err != nil {
@ -509,12 +564,9 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
continue
}
mounts := []pluginapi.Mount{}
if dp.bypathFound {
mounts = dp.bypathMountsForPci(cardPath, name, dp.bypathDir)
}
mounts, cdiDevices := dp.createMountsAndCDIDevices(cardPath, name, devSpecs)
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, nil, nil, nil)
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, nil, nil, cdiDevices)
for i := 0; i < dp.options.sharedDevNum; i++ {
devID := fmt.Sprintf("%s-%d", name, i)

View File

@ -29,6 +29,7 @@ import (
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm"
dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
cdispec "tags.cncf.io/container-device-interface/specs-go"
)
func init() {
@ -402,14 +403,14 @@ func TestScan(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
root, err := os.MkdirTemp("", "test_new_device_plugin")
if err != nil {
t.Fatalf("can't create temporary directory: %+v", err)
t.Fatalf("Can't create temporary directory: %+v", err)
}
// dirs/files need to be removed for the next test
defer os.RemoveAll(root)
sysfs, devfs, err := createTestFiles(root, tc)
if err != nil {
t.Errorf("unexpected error: %+v", err)
t.Errorf("Unexpected error: %+v", err)
}
plugin := newDevicePlugin(sysfs, devfs, tc.options)
@ -421,7 +422,7 @@ func TestScan(t *testing.T) {
err = plugin.Scan(notifier)
// Scans in GPU plugin never fail
if err != nil {
t.Errorf("unexpected error: %+v", err)
t.Errorf("Unexpected error: %+v", err)
}
if tc.expectedI915Devs != notifier.i915Count {
t.Errorf("Expected %d, discovered %d devices (i915)",
@ -464,14 +465,14 @@ func TestScanFails(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
root, err := os.MkdirTemp("", "test_new_device_plugin")
if err != nil {
t.Fatalf("can't create temporary directory: %+v", err)
t.Fatalf("Can't create temporary directory: %+v", err)
}
// dirs/files need to be removed for the next test
defer os.RemoveAll(root)
sysfs, devfs, err := createTestFiles(root, tc)
if err != nil {
t.Errorf("unexpected error: %+v", err)
t.Errorf("Unexpected error: %+v", err)
}
plugin := newDevicePlugin(sysfs, devfs, tc.options)
@ -484,7 +485,7 @@ func TestScanFails(t *testing.T) {
err = plugin.Scan(notifier)
if err == nil {
t.Error("unexpected nil error")
t.Error("Unexpected nil error")
}
})
}
@ -544,14 +545,14 @@ func TestScanWithRmAndTiles(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
root, err := os.MkdirTemp("", "test_new_device_plugin")
if err != nil {
t.Fatalf("can't create temporary directory: %+v", err)
t.Fatalf("Can't create temporary directory: %+v", err)
}
// dirs/files need to be removed for the next test
defer os.RemoveAll(root)
sysfs, devfs, err := createTestFiles(root, tc)
if err != nil {
t.Errorf("unexpected error: %+v", err)
t.Errorf("Unexpected error: %+v", err)
}
plugin := newDevicePlugin(sysfs, devfs, tc.options)
@ -565,10 +566,10 @@ func TestScanWithRmAndTiles(t *testing.T) {
err = plugin.Scan(notifier)
if err != nil {
t.Error("unexpected error")
t.Error("Unexpected error")
}
if rm.tileCount != expectedTileCounts[i] {
t.Error("unexpected tilecount for RM")
t.Error("Unexpected tilecount for RM")
}
})
}
@ -618,6 +619,7 @@ func TestBypath(t *testing.T) {
desc string
linkpath string
bypathFiles []string
pciAddrOk bool
mountCount int
}
@ -628,36 +630,42 @@ func TestBypath(t *testing.T) {
"card with two by-path files",
"00.10.2/00.334.302/0.0.1.00/0000:0f:05.0/drm/" + cardName,
[]string{"pci-0000:0f:05.0-card", "pci-0000:0f:05.0-render"},
true,
2,
},
{
"different by-path files",
"00.10.2/00.334.302/0.0.1.00/0000:ff:05.0/drm/" + cardName,
[]string{"pci-0000:0f:05.0-card", "pci-0000:0f:05.0-render"},
true,
0,
},
{
"invalid pci address",
"00.10.2/00.334.302/0.0.1.00/000:ff:05.1/drm/" + cardName,
[]string{"pci-0000:0f:05.0-card", "pci-0000:0f:05.0-render"},
false,
0,
},
{
"symlink without card",
"00.10.2/00.334.302/0.0.1.00/0000:0f:05.0/drm",
[]string{"pci-0000:0f:05.0-card", "pci-0000:0f:05.0-render"},
false,
0,
},
{
"no symlink",
"",
[]string{"pci-0000:0f:05.0-card", "pci-0000:0f:05.0-render"},
false,
0,
},
{
"no by-path files",
"00.10.2/00.334.302/0.0.1.00/0000:0f:05.0/drm/" + cardName,
[]string{},
true,
0,
},
}
@ -665,7 +673,7 @@ func TestBypath(t *testing.T) {
for _, td := range tds {
root, err := os.MkdirTemp("", "test_bypath_mounting")
if err != nil {
t.Fatalf("can't create temporary directory: %+v", err)
t.Fatalf("Can't create temporary directory: %+v", err)
}
// dirs/files need to be removed for the next test
defer os.RemoveAll(root)
@ -674,7 +682,17 @@ func TestBypath(t *testing.T) {
drmPath, byPath := createBypathTestFiles(t, cardName, root, td.linkpath, td.bypathFiles)
mounts := plugin.bypathMountsForPci(drmPath, cardName, byPath)
pciAddr, pciErr := plugin.pciAddressForCard(drmPath, cardName)
if pciErr != nil && td.pciAddrOk {
t.Errorf("%s: failed to retrieve pci address when it should have", td.desc)
}
if pciErr != nil {
continue
}
mounts := plugin.bypathMountsForPci(pciAddr, byPath)
if len(mounts) != td.mountCount {
t.Errorf("%s: Wrong number of mounts %d vs. %d", td.desc, len(mounts), td.mountCount)
@ -696,3 +714,253 @@ func TestBypath(t *testing.T) {
}
}
}
// TestPciDeviceForCard checks that pciDeviceIDForCard returns the content of
// a card's "device/device" file, and that it reports an error when that file
// does not exist.
func TestPciDeviceForCard(t *testing.T) {
	// t.TempDir is cleaned up automatically when the test finishes.
	root := t.TempDir()

	sysfs := path.Join(root, "sys")

	cardPath := filepath.Join(sysfs, "class", "drm", "card0")
	cardDevicePath := filepath.Join(cardPath, "device")

	if err := os.MkdirAll(cardDevicePath, 0o750); err != nil {
		t.Fatalf("Card device path creation failed: %+v", err)
	}

	data := "0x5959"

	if err := os.WriteFile(filepath.Join(cardDevicePath, "device"), []byte(data), 0o600); err != nil {
		t.Fatalf("Device id write failed: %+v", err)
	}

	id, err := pciDeviceIDForCard(cardPath)
	if err != nil {
		t.Errorf("Failed to get device id for card: %+v", err)
	}

	if id != data {
		t.Errorf("Wrong id received %s vs %s", id, data)
	}

	// Check bad device: the id file has the wrong name, so the lookup must fail.
	cardPath = filepath.Join(sysfs, "class", "drm", "card1")
	cardDevicePath = filepath.Join(cardPath, "device")

	if err := os.MkdirAll(cardDevicePath, 0o750); err != nil {
		t.Fatalf("Card device path creation failed: %+v", err)
	}

	if err := os.WriteFile(filepath.Join(cardDevicePath, "devicebad"), []byte(data), 0o600); err != nil {
		t.Fatalf("Device id write failed: %+v", err)
	}

	id, err = pciDeviceIDForCard(cardPath)
	if err == nil {
		t.Errorf("ID received when it shouldn't be possible: %s", id)
	}
}
// symlinkItem describes one symbolic link for a test file tree.
// createSymlinks joins both paths with its base directory.
type symlinkItem struct {
	// old is the path the link points to; new is the path of the link itself.
	old, new string
}
// createSymlinks creates every link in links below base.
//
// For each item, the link target (base/old) is created as a directory when
// os.Stat fails on it, the parent directory of the link (base/new) is
// created, and a symlink from base/new to base/old is made. Any failure
// aborts the calling test.
func createSymlinks(t *testing.T, base string, links []symlinkItem) {
	// Report failures at the caller's line rather than inside this helper.
	t.Helper()

	for _, link := range links {
		linkOld := filepath.Join(base, link.old)
		linkNew := filepath.Join(base, link.new)

		if _, err := os.Stat(linkOld); err != nil {
			if err := os.MkdirAll(linkOld, 0o750); err != nil && !errors.Is(err, os.ErrExist) {
				t.Fatalf("Failed to create symlink base dir: %+v", err)
			}
		}

		d := filepath.Dir(linkNew)
		if err := os.MkdirAll(d, 0o750); err != nil {
			t.Fatal("Failed to create symlink new dir", err)
		}

		if err := os.Symlink(linkOld, linkNew); err != nil {
			t.Fatal("Failed to create symlink from old to new", err)
		}
	}
}
// createFiles writes every entry of files below base. The map key is the
// file path relative to base and the value is the file content. Missing
// parent directories are created first. Any failure aborts the calling test.
func createFiles(t *testing.T, base string, files map[string][]byte) {
	// Report failures at the caller's line rather than inside this helper.
	t.Helper()

	for file, content := range files {
		fp := filepath.Join(base, file)

		if err := os.MkdirAll(filepath.Dir(fp), 0o750); err != nil {
			t.Fatal("Failed to create dev directories", err)
		}

		if err := os.WriteFile(fp, content, 0o600); err != nil {
			t.Fatal("Failed to create dev file", err)
		}
	}
}
// createDirs creates every directory in dirs (including missing parents)
// below base. Any failure aborts the calling test.
func createDirs(t *testing.T, base string, dirs []string) {
	// Report failures at the caller's line rather than inside this helper.
	t.Helper()

	for _, dir := range dirs {
		if err := os.MkdirAll(filepath.Join(base, dir), 0o750); err != nil {
			t.Fatal("Failed to create sysfs directories", err)
		}
	}
}
func TestCDIDeviceInclusion(t *testing.T) {
root, err := os.MkdirTemp("", "test_cdidevice")
if err != nil {
t.Fatalf("Can't create temporary directory: %+v", err)
}
// dirs/files need to be removed for the next test
defer os.RemoveAll(root)
sysfs := path.Join(root, "sys")
devfs := path.Join(root, "dev")
sysfslinks := []symlinkItem{
{"/0042:01:02.0", "/class/drm/card0"},
{"/0042:01:05.0", "/class/drm/card1"},
{"driver/i915", "/class/drm/card0/device/driver"},
{"driver/xe", "/class/drm/card1/device/driver"},
}
devfslinks := []symlinkItem{
{"/dri/card0", "/dri/by-path/pci-0042:01:02.0-card"},
{"/dri/renderD128", "/dri/by-path/pci-0042:01:02.0-render"},
{"/dri/card1", "/dri/by-path/pci-0042:01:05.0-card"},
{"/dri/renderD129", "/dri/by-path/pci-0042:01:05.0-render"},
}
sysfsDirs := []string{
"class/drm/card0/device/drm/card0",
"class/drm/card0/device/drm/renderD128",
"class/drm/card1/device/drm/card1",
"class/drm/card1/device/drm/renderD129",
}
sysfsFiles := map[string][]byte{
"class/drm/card0/device/device": []byte("0x9a49"),
"class/drm/card0/device/vendor": []byte("0x8086"),
"class/drm/card1/device/device": []byte("0x9a48"),
"class/drm/card1/device/vendor": []byte("0x8086"),
}
devfsfiles := map[string][]byte{
"/dri/card0": []byte("1"),
"/dri/renderD128": []byte("1"),
"/dri/card1": []byte("1"),
"/dri/renderD129": []byte("1"),
}
createSymlinks(t, sysfs, sysfslinks)
createFiles(t, devfs, devfsfiles)
createFiles(t, sysfs, sysfsFiles)
createDirs(t, sysfs, sysfsDirs)
createSymlinks(t, devfs, devfslinks)
plugin := newDevicePlugin(sysfs+"/class/drm", devfs+"/dri", cliOptions{sharedDevNum: 1})
plugin.bypathFound = true
tree, err := plugin.scan()
if err != nil {
t.Error("Failed to get device id for card")
}
refTree := dpapi.NewDeviceTree()
refTree.AddDevice("i915", "card0-0", dpapi.NewDeviceInfo("Healthy", []v1beta1.DeviceSpec{
{ContainerPath: devfs + "/dri/card0", HostPath: devfs + "/dri/card0", Permissions: "rw"},
{ContainerPath: devfs + "/dri/renderD128", HostPath: devfs + "/dri/renderD128", Permissions: "rw"},
}, []v1beta1.Mount{
{ContainerPath: devfs + "/dri/by-path/pci-0042:01:02.0-card", HostPath: devfs + "/dri/by-path/pci-0042:01:02.0-card", ReadOnly: true},
{ContainerPath: devfs + "/dri/by-path/pci-0042:01:02.0-render", HostPath: devfs + "/dri/by-path/pci-0042:01:02.0-render", ReadOnly: true},
}, nil, nil, &cdispec.Spec{
Version: dpapi.CDIVersion,
Kind: dpapi.CDIVendor + "/gpu",
Devices: []cdispec.Device{
{
Name: "card0",
ContainerEdits: cdispec.ContainerEdits{
DeviceNodes: []*cdispec.DeviceNode{
{Path: devfs + "/dri/card0", HostPath: devfs + "/dri/card0", Permissions: "rw"},
{Path: devfs + "/dri/renderD128", HostPath: devfs + "/dri/renderD128", Permissions: "rw"},
},
Mounts: []*cdispec.Mount{
{
HostPath: devfs + "/dri/by-path/pci-0042:01:02.0-card",
ContainerPath: devfs + "/dri/by-path/pci-0042:01:02.0-card",
Options: []string{"bind", "ro"},
Type: "none",
},
{
HostPath: devfs + "/dri/by-path/pci-0042:01:02.0-render",
ContainerPath: devfs + "/dri/by-path/pci-0042:01:02.0-render",
Options: []string{"bind", "ro"},
Type: "none",
},
},
},
},
},
}))
refTree.AddDevice("xe", "card1-0", dpapi.NewDeviceInfo("Healthy", []v1beta1.DeviceSpec{
{ContainerPath: devfs + "/dri/card1", HostPath: devfs + "/dri/card1", Permissions: "rw"},
{ContainerPath: devfs + "/dri/renderD129", HostPath: devfs + "/dri/renderD129", Permissions: "rw"},
}, []v1beta1.Mount{
{ContainerPath: devfs + "/dri/by-path/pci-0042:01:05.0-card", HostPath: devfs + "/dri/by-path/pci-0042:01:05.0-card", ReadOnly: true},
{ContainerPath: devfs + "/dri/by-path/pci-0042:01:05.0-render", HostPath: devfs + "/dri/by-path/pci-0042:01:05.0-render", ReadOnly: true},
}, nil, nil, &cdispec.Spec{
Version: dpapi.CDIVersion,
Kind: dpapi.CDIVendor + "/gpu",
Devices: []cdispec.Device{
{
Name: "card1",
ContainerEdits: cdispec.ContainerEdits{
DeviceNodes: []*cdispec.DeviceNode{
{Path: devfs + "/dri/card1", HostPath: devfs + "/dri/card1", Permissions: "rw"},
{Path: devfs + "/dri/renderD129", HostPath: devfs + "/dri/renderD129", Permissions: "rw"},
},
Mounts: []*cdispec.Mount{
{
HostPath: devfs + "/dri/by-path/pci-0042:01:05.0-card",
ContainerPath: devfs + "/dri/by-path/pci-0042:01:05.0-card",
Options: []string{"bind", "ro"},
Type: "none",
},
{
HostPath: devfs + "/dri/by-path/pci-0042:01:05.0-render",
ContainerPath: devfs + "/dri/by-path/pci-0042:01:05.0-render",
Options: []string{"bind", "ro"},
Type: "none",
},
},
},
},
},
}))
if !reflect.DeepEqual(tree, refTree) {
t.Error("Received device tree isn't expected\n", tree, "\n", refTree)
}
if tree.DeviceTypeCount("i915") != 1 {
t.Error("Invalid count for device (i915)")
}
if tree.DeviceTypeCount("xe") != 1 {
t.Error("Invalid count for device (xe)")
}
}

View File

@ -45,6 +45,8 @@ spec:
readOnly: true
- name: kubeletsockets
mountPath: /var/lib/kubelet/device-plugins
- name: cdipath
mountPath: /var/run/cdi
volumes:
- name: devfs
hostPath:
@ -55,5 +57,9 @@ spec:
- name: kubeletsockets
hostPath:
path: /var/lib/kubelet/device-plugins
- name: cdipath
hostPath:
path: /var/run/cdi
type: DirectoryOrCreate
nodeSelector:
kubernetes.io/arch: amd64

View File

@ -39,6 +39,7 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet
yes := true
no := false
directoryOrCreate := v1.HostPathDirectoryOrCreate
maxUnavailable := intstr.FromInt(1)
maxSurge := intstr.FromInt(0)
@ -120,6 +121,10 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet
Name: "kubeletsockets",
MountPath: "/var/lib/kubelet/device-plugins",
},
{
Name: "cdipath",
MountPath: "/var/run/cdi",
},
},
},
},
@ -149,6 +154,15 @@ func (c *controller) newDaemonSetExpected(rawObj client.Object) *apps.DaemonSet
},
},
},
{
Name: "cdipath",
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{
Path: "/var/run/cdi",
Type: &directoryOrCreate,
},
},
},
},
},
},