mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00
qat: add heartbeat check and use that as a device healthiness indicator
Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
This commit is contained in:
parent
7ebb43bcc3
commit
c7162df440
@ -67,3 +67,4 @@ issues:
|
|||||||
# Until the testing package allows pinning variables disable scopelint
|
# Until the testing package allows pinning variables disable scopelint
|
||||||
# for tests. See https://github.com/kyoh86/scopelint/issues/4.
|
# for tests. See https://github.com/kyoh86/scopelint/issues/4.
|
||||||
- scopelint
|
- scopelint
|
||||||
|
- gocognit
|
||||||
|
@ -393,6 +393,37 @@ func readDeviceConfiguration(pfDev string) string {
|
|||||||
return devCfg.Section("GENERAL").Key("ServicesEnabled").String()
|
return devCfg.Section("GENERAL").Key("ServicesEnabled").String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getDeviceHealthiness(device string, lookup map[string]string) string {
|
||||||
|
healthiness := pluginapi.Healthy
|
||||||
|
|
||||||
|
pfDev, err := filepath.EvalSymlinks(filepath.Join(device, "physfn"))
|
||||||
|
if err != nil {
|
||||||
|
klog.Warningf("failed to get PF device ID for %s: %q", filepath.Base(device), err)
|
||||||
|
return healthiness
|
||||||
|
}
|
||||||
|
|
||||||
|
// VFs share one PF, so all the VFs should return the same result.
|
||||||
|
if _, found := lookup[pfDev]; found {
|
||||||
|
return lookup[pfDev]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try to find the PF's heartbeat status. If unable to, return Healthy.
|
||||||
|
driver := getCurrentDriver(pfDev)
|
||||||
|
|
||||||
|
hbStatusFile := filepath.Join(filepath.Dir(filepath.Join(pfDev, "../../")), "kernel/debug",
|
||||||
|
fmt.Sprintf("qat_%s_%s/heartbeat/status", driver, filepath.Base(pfDev)))
|
||||||
|
|
||||||
|
// If status reads "-1", the device is considered bad:
|
||||||
|
// https://github.com/torvalds/linux/blob/v6.6-rc5/Documentation/ABI/testing/debugfs-driver-qat
|
||||||
|
if data, err := os.ReadFile(hbStatusFile); err == nil && string(data) == "-1" {
|
||||||
|
healthiness = pluginapi.Unhealthy
|
||||||
|
}
|
||||||
|
|
||||||
|
lookup[pfDev] = healthiness
|
||||||
|
|
||||||
|
return healthiness
|
||||||
|
}
|
||||||
|
|
||||||
func getDeviceCapabilities(device string) (string, error) {
|
func getDeviceCapabilities(device string) (string, error) {
|
||||||
devID, err := getDeviceID(device)
|
devID, err := getDeviceID(device)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -583,6 +614,8 @@ func (dp *DevicePlugin) scan() (dpapi.DeviceTree, error) {
|
|||||||
devTree := dpapi.NewDeviceTree()
|
devTree := dpapi.NewDeviceTree()
|
||||||
n := 0
|
n := 0
|
||||||
|
|
||||||
|
pfHealthLookup := map[string]string{}
|
||||||
|
|
||||||
for _, vfDevice := range dp.getVfDevices() {
|
for _, vfDevice := range dp.getVfDevices() {
|
||||||
vfBdf := filepath.Base(vfDevice)
|
vfBdf := filepath.Base(vfDevice)
|
||||||
|
|
||||||
@ -610,14 +643,16 @@ func (dp *DevicePlugin) scan() (dpapi.DeviceTree, error) {
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
klog.V(1).Infof("Device %s with %s capabilities found", vfBdf, cap)
|
healthiness := getDeviceHealthiness(vfDevice, pfHealthLookup)
|
||||||
|
|
||||||
|
klog.V(1).Infof("Device %s with %s capabilities found (%s)", vfBdf, cap, healthiness)
|
||||||
|
|
||||||
n = n + 1
|
n = n + 1
|
||||||
envs := map[string]string{
|
envs := map[string]string{
|
||||||
fmt.Sprintf("%s%d", envVarPrefix, n): vfBdf,
|
fmt.Sprintf("%s%d", envVarPrefix, n): vfBdf,
|
||||||
}
|
}
|
||||||
|
|
||||||
devinfo := dpapi.NewDeviceInfo(pluginapi.Healthy, dp.getDpdkDeviceSpecs(dpdkDeviceName), dp.getDpdkMounts(dpdkDeviceName), envs, nil)
|
devinfo := dpapi.NewDeviceInfo(healthiness, dp.getDpdkDeviceSpecs(dpdkDeviceName), dp.getDpdkMounts(dpdkDeviceName), envs, nil)
|
||||||
|
|
||||||
devTree.AddDevice(cap, vfBdf, devinfo)
|
devTree.AddDevice(cap, vfBdf, devinfo)
|
||||||
}
|
}
|
||||||
|
@ -16,6 +16,7 @@ package dpdkdrv
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"flag"
|
"flag"
|
||||||
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
"reflect"
|
"reflect"
|
||||||
@ -162,15 +163,16 @@ func TestGetPreferredAllocation(t *testing.T) {
|
|||||||
|
|
||||||
func TestScan(t *testing.T) {
|
func TestScan(t *testing.T) {
|
||||||
tcases := []struct {
|
tcases := []struct {
|
||||||
name string
|
name string
|
||||||
dpdkDriver string
|
dpdkDriver string
|
||||||
dirs []string
|
dirs []string
|
||||||
files map[string][]byte
|
files map[string][]byte
|
||||||
symlinks map[string]string
|
symlinks map[string]string
|
||||||
kernelVfDrivers []string
|
kernelVfDrivers []string
|
||||||
expectedErr bool
|
expectedErr bool
|
||||||
maxDevNum int
|
maxDevNum int
|
||||||
expectedDevNum int
|
expectedDevNum int
|
||||||
|
expectedUnhealthyNum int
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
name: "No error returned for uninitialized device plugin",
|
name: "No error returned for uninitialized device plugin",
|
||||||
@ -519,7 +521,119 @@ func TestScan(t *testing.T) {
|
|||||||
maxDevNum: 2,
|
maxDevNum: 2,
|
||||||
expectedDevNum: 2,
|
expectedDevNum: 2,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "vfio-pci DPDKdriver with no kernel bound driver and where vfdevID is equal to qatDevId (4941) heartbeat status bad",
|
||||||
|
dpdkDriver: "vfio-pci",
|
||||||
|
kernelVfDrivers: []string{"4xxxvf"},
|
||||||
|
dirs: []string{
|
||||||
|
"sys/bus/pci/drivers/4xxx",
|
||||||
|
"sys/bus/pci/drivers/vfio-pci",
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0",
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0/qat",
|
||||||
|
"sys/kernel/debug/qat_4xxx_0000:02:00.0",
|
||||||
|
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat",
|
||||||
|
"sys/bus/pci/devices/0000:02:01.0",
|
||||||
|
},
|
||||||
|
files: map[string][]byte{
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0/device": []byte("0x4940"),
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0/qat/state": []byte("up"),
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0/qat/cfg_services": []byte("sym;asym"),
|
||||||
|
"sys/bus/pci/devices/0000:02:01.0/device": []byte("0x4941"),
|
||||||
|
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat/status": []byte("-1"),
|
||||||
|
},
|
||||||
|
symlinks: map[string]string{
|
||||||
|
"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
|
||||||
|
"sys/bus/pci/devices/0000:02:01.0/physfn": "sys/devices/pci0000:02/0000:02:00.0",
|
||||||
|
"sys/bus/pci/drivers/4xxx/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
|
||||||
|
"sys/bus/pci/devices/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0/virtfn0": "sys/bus/pci/devices/0000:02:01.0",
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0/driver": "sys/bus/pci/drivers/4xxx",
|
||||||
|
},
|
||||||
|
maxDevNum: 1,
|
||||||
|
expectedDevNum: 1,
|
||||||
|
expectedUnhealthyNum: 1,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "vfio-pci DPDKdriver with no kernel bound driver and where vfdevID is equal to qatDevId (4941) heartbeat status good",
|
||||||
|
dpdkDriver: "vfio-pci",
|
||||||
|
kernelVfDrivers: []string{"4xxxvf"},
|
||||||
|
dirs: []string{
|
||||||
|
"sys/bus/pci/drivers/4xxx",
|
||||||
|
"sys/bus/pci/drivers/vfio-pci",
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0",
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0/qat",
|
||||||
|
"sys/kernel/debug/qat_4xxx_0000:02:00.0",
|
||||||
|
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat",
|
||||||
|
"sys/bus/pci/devices/0000:02:01.0",
|
||||||
|
},
|
||||||
|
files: map[string][]byte{
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0/device": []byte("0x4940"),
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0/qat/state": []byte("up"),
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0/qat/cfg_services": []byte("sym;asym"),
|
||||||
|
"sys/bus/pci/devices/0000:02:01.0/device": []byte("0x4941"),
|
||||||
|
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat/status": []byte("0"),
|
||||||
|
},
|
||||||
|
symlinks: map[string]string{
|
||||||
|
"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
|
||||||
|
"sys/bus/pci/devices/0000:02:01.0/physfn": "sys/devices/pci0000:02/0000:02:00.0",
|
||||||
|
"sys/bus/pci/drivers/4xxx/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
|
||||||
|
"sys/bus/pci/devices/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0/virtfn0": "sys/bus/pci/devices/0000:02:01.0",
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0/driver": "sys/bus/pci/drivers/4xxx",
|
||||||
|
},
|
||||||
|
maxDevNum: 1,
|
||||||
|
expectedDevNum: 1,
|
||||||
|
expectedUnhealthyNum: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "vfio-pci DPDKdriver with no kernel bound driver and where vfDevID is equal to qatDevId (37c9) heartbeat status bad",
|
||||||
|
dpdkDriver: "vfio-pci",
|
||||||
|
kernelVfDrivers: []string{"c6xxvf"},
|
||||||
|
dirs: []string{
|
||||||
|
"sys/bus/pci/drivers/c6xx",
|
||||||
|
"sys/bus/pci/drivers/vfio-pci",
|
||||||
|
"sys/bus/pci/devices/0000:02:01.0",
|
||||||
|
"sys/bus/pci/devices/0000:02:01.1",
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0",
|
||||||
|
"sys/kernel/debug/qat_c6xx_0000:02:00.0/heartbeat",
|
||||||
|
},
|
||||||
|
files: map[string][]byte{
|
||||||
|
"sys/bus/pci/devices/0000:02:01.0/device": []byte("0x37c9"),
|
||||||
|
"sys/bus/pci/devices/0000:02:01.1/device": []byte("0x37c9"),
|
||||||
|
"sys/kernel/debug/qat_c6xx_0000:02:00.0/heartbeat/status": []byte("-1"),
|
||||||
|
},
|
||||||
|
symlinks: map[string]string{
|
||||||
|
"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
|
||||||
|
"sys/bus/pci/devices/0000:02:01.0/physfn": "sys/devices/pci0000:02/0000:02:00.0",
|
||||||
|
"sys/bus/pci/devices/0000:02:01.1/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
|
||||||
|
"sys/bus/pci/devices/0000:02:01.1/physfn": "sys/devices/pci0000:02/0000:02:00.0",
|
||||||
|
"sys/bus/pci/drivers/c6xx/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
|
||||||
|
"sys/bus/pci/devices/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0/virtfn0": "sys/bus/pci/devices/0000:02:01.0",
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0/virtfn1": "sys/bus/pci/devices/0000:02:01.1",
|
||||||
|
"sys/devices/pci0000:02/0000:02:00.0/driver": "sys/bus/pci/drivers/c6xx",
|
||||||
|
},
|
||||||
|
maxDevNum: 3,
|
||||||
|
expectedDevNum: 2,
|
||||||
|
expectedUnhealthyNum: 2,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// countUnhealthyDevices iterates over every entry of every group in the
// device tree and returns how many of them have a "state" field whose
// formatted value equals pluginapi.Unhealthy.
countUnhealthyDevices := func(tree dpapi.DeviceTree) int {
	count := 0

	for _, devices := range tree {
		for _, info := range devices {
			// "state" is a lower-case (unexported) field name, so it is
			// looked up via reflection and compared by its printed form.
			state := reflect.ValueOf(info).FieldByName("state")
			if fmt.Sprintf("%+v", state) != pluginapi.Unhealthy {
				continue
			}

			count++
		}
	}

	return count
}
|
||||||
|
|
||||||
for _, tt := range tcases {
|
for _, tt := range tcases {
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
tmpdir, err := os.MkdirTemp("/tmp/", "qatplugin-TestScanPrivate-*")
|
tmpdir, err := os.MkdirTemp("/tmp/", "qatplugin-TestScanPrivate-*")
|
||||||
@ -560,6 +674,10 @@ func TestScan(t *testing.T) {
|
|||||||
t.Errorf("expected %d, but got %d devices", tt.expectedDevNum, devNum)
|
t.Errorf("expected %d, but got %d devices", tt.expectedDevNum, devNum)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if unhealtyNum := countUnhealthyDevices(fN.tree); unhealtyNum != tt.expectedUnhealthyNum {
|
||||||
|
t.Errorf("expected %d, but got %d unhealthy devices", tt.expectedUnhealthyNum, unhealtyNum)
|
||||||
|
}
|
||||||
|
|
||||||
if err = os.RemoveAll(tmpdir); err != nil {
|
if err = os.RemoveAll(tmpdir); err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user