mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00
qat: add heartbeat check and use that as a device healthiness indicator
Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
This commit is contained in:
parent
7ebb43bcc3
commit
c7162df440
@ -67,3 +67,4 @@ issues:
|
||||
# Until the testing package allows pinning variables disable scopelint
|
||||
# for tests. See https://github.com/kyoh86/scopelint/issues/4.
|
||||
- scopelint
|
||||
- gocognit
|
||||
|
@ -393,6 +393,37 @@ func readDeviceConfiguration(pfDev string) string {
|
||||
return devCfg.Section("GENERAL").Key("ServicesEnabled").String()
|
||||
}
|
||||
|
||||
func getDeviceHealthiness(device string, lookup map[string]string) string {
|
||||
healthiness := pluginapi.Healthy
|
||||
|
||||
pfDev, err := filepath.EvalSymlinks(filepath.Join(device, "physfn"))
|
||||
if err != nil {
|
||||
klog.Warningf("failed to get PF device ID for %s: %q", filepath.Base(device), err)
|
||||
return healthiness
|
||||
}
|
||||
|
||||
// VFs share one PF, so all the VFs should return the same result.
|
||||
if _, found := lookup[pfDev]; found {
|
||||
return lookup[pfDev]
|
||||
}
|
||||
|
||||
// Try to find the PF's heartbeat status. If unable to, return Healthy.
|
||||
driver := getCurrentDriver(pfDev)
|
||||
|
||||
hbStatusFile := filepath.Join(filepath.Dir(filepath.Join(pfDev, "../../")), "kernel/debug",
|
||||
fmt.Sprintf("qat_%s_%s/heartbeat/status", driver, filepath.Base(pfDev)))
|
||||
|
||||
// If status reads "-1", the device is considered bad:
|
||||
// https://github.com/torvalds/linux/blob/v6.6-rc5/Documentation/ABI/testing/debugfs-driver-qat
|
||||
if data, err := os.ReadFile(hbStatusFile); err == nil && string(data) == "-1" {
|
||||
healthiness = pluginapi.Unhealthy
|
||||
}
|
||||
|
||||
lookup[pfDev] = healthiness
|
||||
|
||||
return healthiness
|
||||
}
|
||||
|
||||
func getDeviceCapabilities(device string) (string, error) {
|
||||
devID, err := getDeviceID(device)
|
||||
if err != nil {
|
||||
@ -583,6 +614,8 @@ func (dp *DevicePlugin) scan() (dpapi.DeviceTree, error) {
|
||||
devTree := dpapi.NewDeviceTree()
|
||||
n := 0
|
||||
|
||||
pfHealthLookup := map[string]string{}
|
||||
|
||||
for _, vfDevice := range dp.getVfDevices() {
|
||||
vfBdf := filepath.Base(vfDevice)
|
||||
|
||||
@ -610,14 +643,16 @@ func (dp *DevicePlugin) scan() (dpapi.DeviceTree, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
klog.V(1).Infof("Device %s with %s capabilities found", vfBdf, cap)
|
||||
healthiness := getDeviceHealthiness(vfDevice, pfHealthLookup)
|
||||
|
||||
klog.V(1).Infof("Device %s with %s capabilities found (%s)", vfBdf, cap, healthiness)
|
||||
|
||||
n = n + 1
|
||||
envs := map[string]string{
|
||||
fmt.Sprintf("%s%d", envVarPrefix, n): vfBdf,
|
||||
}
|
||||
|
||||
devinfo := dpapi.NewDeviceInfo(pluginapi.Healthy, dp.getDpdkDeviceSpecs(dpdkDeviceName), dp.getDpdkMounts(dpdkDeviceName), envs, nil)
|
||||
devinfo := dpapi.NewDeviceInfo(healthiness, dp.getDpdkDeviceSpecs(dpdkDeviceName), dp.getDpdkMounts(dpdkDeviceName), envs, nil)
|
||||
|
||||
devTree.AddDevice(cap, vfBdf, devinfo)
|
||||
}
|
||||
|
@ -16,6 +16,7 @@ package dpdkdrv
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
"reflect"
|
||||
@ -162,15 +163,16 @@ func TestGetPreferredAllocation(t *testing.T) {
|
||||
|
||||
func TestScan(t *testing.T) {
|
||||
tcases := []struct {
|
||||
name string
|
||||
dpdkDriver string
|
||||
dirs []string
|
||||
files map[string][]byte
|
||||
symlinks map[string]string
|
||||
kernelVfDrivers []string
|
||||
expectedErr bool
|
||||
maxDevNum int
|
||||
expectedDevNum int
|
||||
name string
|
||||
dpdkDriver string
|
||||
dirs []string
|
||||
files map[string][]byte
|
||||
symlinks map[string]string
|
||||
kernelVfDrivers []string
|
||||
expectedErr bool
|
||||
maxDevNum int
|
||||
expectedDevNum int
|
||||
expectedUnhealthyNum int
|
||||
}{
|
||||
{
|
||||
name: "No error returned for uninitialized device plugin",
|
||||
@ -519,7 +521,119 @@ func TestScan(t *testing.T) {
|
||||
maxDevNum: 2,
|
||||
expectedDevNum: 2,
|
||||
},
|
||||
{
|
||||
name: "vfio-pci DPDKdriver with no kernel bound driver and where vfdevID is equal to qatDevId (4941) heartbeat status bad",
|
||||
dpdkDriver: "vfio-pci",
|
||||
kernelVfDrivers: []string{"4xxxvf"},
|
||||
dirs: []string{
|
||||
"sys/bus/pci/drivers/4xxx",
|
||||
"sys/bus/pci/drivers/vfio-pci",
|
||||
"sys/devices/pci0000:02/0000:02:00.0",
|
||||
"sys/devices/pci0000:02/0000:02:00.0/qat",
|
||||
"sys/kernel/debug/qat_4xxx_0000:02:00.0",
|
||||
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat",
|
||||
"sys/bus/pci/devices/0000:02:01.0",
|
||||
},
|
||||
files: map[string][]byte{
|
||||
"sys/devices/pci0000:02/0000:02:00.0/device": []byte("0x4940"),
|
||||
"sys/devices/pci0000:02/0000:02:00.0/qat/state": []byte("up"),
|
||||
"sys/devices/pci0000:02/0000:02:00.0/qat/cfg_services": []byte("sym;asym"),
|
||||
"sys/bus/pci/devices/0000:02:01.0/device": []byte("0x4941"),
|
||||
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat/status": []byte("-1"),
|
||||
},
|
||||
symlinks: map[string]string{
|
||||
"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
|
||||
"sys/bus/pci/devices/0000:02:01.0/physfn": "sys/devices/pci0000:02/0000:02:00.0",
|
||||
"sys/bus/pci/drivers/4xxx/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
|
||||
"sys/bus/pci/devices/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
|
||||
"sys/devices/pci0000:02/0000:02:00.0/virtfn0": "sys/bus/pci/devices/0000:02:01.0",
|
||||
"sys/devices/pci0000:02/0000:02:00.0/driver": "sys/bus/pci/drivers/4xxx",
|
||||
},
|
||||
maxDevNum: 1,
|
||||
expectedDevNum: 1,
|
||||
expectedUnhealthyNum: 1,
|
||||
},
|
||||
{
|
||||
name: "vfio-pci DPDKdriver with no kernel bound driver and where vfdevID is equal to qatDevId (4941) heartbeat status good",
|
||||
dpdkDriver: "vfio-pci",
|
||||
kernelVfDrivers: []string{"4xxxvf"},
|
||||
dirs: []string{
|
||||
"sys/bus/pci/drivers/4xxx",
|
||||
"sys/bus/pci/drivers/vfio-pci",
|
||||
"sys/devices/pci0000:02/0000:02:00.0",
|
||||
"sys/devices/pci0000:02/0000:02:00.0/qat",
|
||||
"sys/kernel/debug/qat_4xxx_0000:02:00.0",
|
||||
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat",
|
||||
"sys/bus/pci/devices/0000:02:01.0",
|
||||
},
|
||||
files: map[string][]byte{
|
||||
"sys/devices/pci0000:02/0000:02:00.0/device": []byte("0x4940"),
|
||||
"sys/devices/pci0000:02/0000:02:00.0/qat/state": []byte("up"),
|
||||
"sys/devices/pci0000:02/0000:02:00.0/qat/cfg_services": []byte("sym;asym"),
|
||||
"sys/bus/pci/devices/0000:02:01.0/device": []byte("0x4941"),
|
||||
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat/status": []byte("0"),
|
||||
},
|
||||
symlinks: map[string]string{
|
||||
"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
|
||||
"sys/bus/pci/devices/0000:02:01.0/physfn": "sys/devices/pci0000:02/0000:02:00.0",
|
||||
"sys/bus/pci/drivers/4xxx/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
|
||||
"sys/bus/pci/devices/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
|
||||
"sys/devices/pci0000:02/0000:02:00.0/virtfn0": "sys/bus/pci/devices/0000:02:01.0",
|
||||
"sys/devices/pci0000:02/0000:02:00.0/driver": "sys/bus/pci/drivers/4xxx",
|
||||
},
|
||||
maxDevNum: 1,
|
||||
expectedDevNum: 1,
|
||||
expectedUnhealthyNum: 0,
|
||||
},
|
||||
{
|
||||
name: "vfio-pci DPDKdriver with no kernel bound driver and where vfDevID is equal to qatDevId (37c9) heartbeat status bad",
|
||||
dpdkDriver: "vfio-pci",
|
||||
kernelVfDrivers: []string{"c6xxvf"},
|
||||
dirs: []string{
|
||||
"sys/bus/pci/drivers/c6xx",
|
||||
"sys/bus/pci/drivers/vfio-pci",
|
||||
"sys/bus/pci/devices/0000:02:01.0",
|
||||
"sys/bus/pci/devices/0000:02:01.1",
|
||||
"sys/devices/pci0000:02/0000:02:00.0",
|
||||
"sys/kernel/debug/qat_c6xx_0000:02:00.0/heartbeat",
|
||||
},
|
||||
files: map[string][]byte{
|
||||
"sys/bus/pci/devices/0000:02:01.0/device": []byte("0x37c9"),
|
||||
"sys/bus/pci/devices/0000:02:01.1/device": []byte("0x37c9"),
|
||||
"sys/kernel/debug/qat_c6xx_0000:02:00.0/heartbeat/status": []byte("-1"),
|
||||
},
|
||||
symlinks: map[string]string{
|
||||
"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
|
||||
"sys/bus/pci/devices/0000:02:01.0/physfn": "sys/devices/pci0000:02/0000:02:00.0",
|
||||
"sys/bus/pci/devices/0000:02:01.1/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
|
||||
"sys/bus/pci/devices/0000:02:01.1/physfn": "sys/devices/pci0000:02/0000:02:00.0",
|
||||
"sys/bus/pci/drivers/c6xx/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
|
||||
"sys/bus/pci/devices/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
|
||||
"sys/devices/pci0000:02/0000:02:00.0/virtfn0": "sys/bus/pci/devices/0000:02:01.0",
|
||||
"sys/devices/pci0000:02/0000:02:00.0/virtfn1": "sys/bus/pci/devices/0000:02:01.1",
|
||||
"sys/devices/pci0000:02/0000:02:00.0/driver": "sys/bus/pci/drivers/c6xx",
|
||||
},
|
||||
maxDevNum: 3,
|
||||
expectedDevNum: 2,
|
||||
expectedUnhealthyNum: 2,
|
||||
},
|
||||
}
|
||||
|
||||
// countUnhealthyDevices walks every entry of every group in the device tree
// and returns how many of them have a "state" field that formats to
// pluginapi.Unhealthy. The field is looked up by name through reflection.
countUnhealthyDevices := func(tree dpapi.DeviceTree) int {
	unhealthy := 0

	for _, devices := range tree {
		for _, info := range devices {
			state := reflect.ValueOf(info).FieldByName("state")
			if fmt.Sprintf("%+v", state) != pluginapi.Unhealthy {
				continue
			}

			unhealthy++
		}
	}

	return unhealthy
}
|
||||
|
||||
for _, tt := range tcases {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
tmpdir, err := os.MkdirTemp("/tmp/", "qatplugin-TestScanPrivate-*")
|
||||
@ -560,6 +674,10 @@ func TestScan(t *testing.T) {
|
||||
t.Errorf("expected %d, but got %d devices", tt.expectedDevNum, devNum)
|
||||
}
|
||||
|
||||
if unhealtyNum := countUnhealthyDevices(fN.tree); unhealtyNum != tt.expectedUnhealthyNum {
|
||||
t.Errorf("expected %d, but got %d unhealthy devices", tt.expectedUnhealthyNum, unhealtyNum)
|
||||
}
|
||||
|
||||
if err = os.RemoveAll(tmpdir); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user