qat: add heartbeat check and use that as a device healthiness indicator

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
This commit is contained in:
Tuomas Katila 2023-10-09 14:37:59 +03:00
parent 7ebb43bcc3
commit c7162df440
3 changed files with 165 additions and 11 deletions

View File

@ -67,3 +67,4 @@ issues:
# Until the testing package allows pinning variables, disable scopelint
# for tests. See https://github.com/kyoh86/scopelint/issues/4.
- scopelint
- gocognit

View File

@ -393,6 +393,37 @@ func readDeviceConfiguration(pfDev string) string {
return devCfg.Section("GENERAL").Key("ServicesEnabled").String()
}
func getDeviceHealthiness(device string, lookup map[string]string) string {
healthiness := pluginapi.Healthy
pfDev, err := filepath.EvalSymlinks(filepath.Join(device, "physfn"))
if err != nil {
klog.Warningf("failed to get PF device ID for %s: %q", filepath.Base(device), err)
return healthiness
}
// VFs share one PF, so all the VFs should return the same result.
if _, found := lookup[pfDev]; found {
return lookup[pfDev]
}
// Try to find the PF's heartbeat status. If unable to, return Healthy.
driver := getCurrentDriver(pfDev)
hbStatusFile := filepath.Join(filepath.Dir(filepath.Join(pfDev, "../../")), "kernel/debug",
fmt.Sprintf("qat_%s_%s/heartbeat/status", driver, filepath.Base(pfDev)))
// If status reads "-1", the device is considered bad:
// https://github.com/torvalds/linux/blob/v6.6-rc5/Documentation/ABI/testing/debugfs-driver-qat
if data, err := os.ReadFile(hbStatusFile); err == nil && string(data) == "-1" {
healthiness = pluginapi.Unhealthy
}
lookup[pfDev] = healthiness
return healthiness
}
func getDeviceCapabilities(device string) (string, error) {
devID, err := getDeviceID(device)
if err != nil {
@ -583,6 +614,8 @@ func (dp *DevicePlugin) scan() (dpapi.DeviceTree, error) {
devTree := dpapi.NewDeviceTree()
n := 0
pfHealthLookup := map[string]string{}
for _, vfDevice := range dp.getVfDevices() {
vfBdf := filepath.Base(vfDevice)
@ -610,14 +643,16 @@ func (dp *DevicePlugin) scan() (dpapi.DeviceTree, error) {
return nil, err
}
klog.V(1).Infof("Device %s with %s capabilities found", vfBdf, cap)
healthiness := getDeviceHealthiness(vfDevice, pfHealthLookup)
klog.V(1).Infof("Device %s with %s capabilities found (%s)", vfBdf, cap, healthiness)
n = n + 1
envs := map[string]string{
fmt.Sprintf("%s%d", envVarPrefix, n): vfBdf,
}
devinfo := dpapi.NewDeviceInfo(pluginapi.Healthy, dp.getDpdkDeviceSpecs(dpdkDeviceName), dp.getDpdkMounts(dpdkDeviceName), envs, nil)
devinfo := dpapi.NewDeviceInfo(healthiness, dp.getDpdkDeviceSpecs(dpdkDeviceName), dp.getDpdkMounts(dpdkDeviceName), envs, nil)
devTree.AddDevice(cap, vfBdf, devinfo)
}

View File

@ -16,6 +16,7 @@ package dpdkdrv
import (
"flag"
"fmt"
"os"
"path"
"reflect"
@ -162,15 +163,16 @@ func TestGetPreferredAllocation(t *testing.T) {
func TestScan(t *testing.T) {
tcases := []struct {
name string
dpdkDriver string
dirs []string
files map[string][]byte
symlinks map[string]string
kernelVfDrivers []string
expectedErr bool
maxDevNum int
expectedDevNum int
name string
dpdkDriver string
dirs []string
files map[string][]byte
symlinks map[string]string
kernelVfDrivers []string
expectedErr bool
maxDevNum int
expectedDevNum int
expectedUnhealthyNum int
}{
{
name: "No error returned for uninitialized device plugin",
@ -519,7 +521,119 @@ func TestScan(t *testing.T) {
maxDevNum: 2,
expectedDevNum: 2,
},
{
name: "vfio-pci DPDKdriver with no kernel bound driver and where vfdevID is equal to qatDevId (4941) heartbeat status bad",
dpdkDriver: "vfio-pci",
kernelVfDrivers: []string{"4xxxvf"},
dirs: []string{
"sys/bus/pci/drivers/4xxx",
"sys/bus/pci/drivers/vfio-pci",
"sys/devices/pci0000:02/0000:02:00.0",
"sys/devices/pci0000:02/0000:02:00.0/qat",
"sys/kernel/debug/qat_4xxx_0000:02:00.0",
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat",
"sys/bus/pci/devices/0000:02:01.0",
},
files: map[string][]byte{
"sys/devices/pci0000:02/0000:02:00.0/device": []byte("0x4940"),
"sys/devices/pci0000:02/0000:02:00.0/qat/state": []byte("up"),
"sys/devices/pci0000:02/0000:02:00.0/qat/cfg_services": []byte("sym;asym"),
"sys/bus/pci/devices/0000:02:01.0/device": []byte("0x4941"),
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat/status": []byte("-1"),
},
symlinks: map[string]string{
"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
"sys/bus/pci/devices/0000:02:01.0/physfn": "sys/devices/pci0000:02/0000:02:00.0",
"sys/bus/pci/drivers/4xxx/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
"sys/bus/pci/devices/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
"sys/devices/pci0000:02/0000:02:00.0/virtfn0": "sys/bus/pci/devices/0000:02:01.0",
"sys/devices/pci0000:02/0000:02:00.0/driver": "sys/bus/pci/drivers/4xxx",
},
maxDevNum: 1,
expectedDevNum: 1,
expectedUnhealthyNum: 1,
},
{
name: "vfio-pci DPDKdriver with no kernel bound driver and where vfdevID is equal to qatDevId (4941) heartbeat status good",
dpdkDriver: "vfio-pci",
kernelVfDrivers: []string{"4xxxvf"},
dirs: []string{
"sys/bus/pci/drivers/4xxx",
"sys/bus/pci/drivers/vfio-pci",
"sys/devices/pci0000:02/0000:02:00.0",
"sys/devices/pci0000:02/0000:02:00.0/qat",
"sys/kernel/debug/qat_4xxx_0000:02:00.0",
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat",
"sys/bus/pci/devices/0000:02:01.0",
},
files: map[string][]byte{
"sys/devices/pci0000:02/0000:02:00.0/device": []byte("0x4940"),
"sys/devices/pci0000:02/0000:02:00.0/qat/state": []byte("up"),
"sys/devices/pci0000:02/0000:02:00.0/qat/cfg_services": []byte("sym;asym"),
"sys/bus/pci/devices/0000:02:01.0/device": []byte("0x4941"),
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat/status": []byte("0"),
},
symlinks: map[string]string{
"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
"sys/bus/pci/devices/0000:02:01.0/physfn": "sys/devices/pci0000:02/0000:02:00.0",
"sys/bus/pci/drivers/4xxx/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
"sys/bus/pci/devices/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
"sys/devices/pci0000:02/0000:02:00.0/virtfn0": "sys/bus/pci/devices/0000:02:01.0",
"sys/devices/pci0000:02/0000:02:00.0/driver": "sys/bus/pci/drivers/4xxx",
},
maxDevNum: 1,
expectedDevNum: 1,
expectedUnhealthyNum: 0,
},
{
name: "vfio-pci DPDKdriver with no kernel bound driver and where vfDevID is equal to qatDevId (37c9) heartbeat status bad",
dpdkDriver: "vfio-pci",
kernelVfDrivers: []string{"c6xxvf"},
dirs: []string{
"sys/bus/pci/drivers/c6xx",
"sys/bus/pci/drivers/vfio-pci",
"sys/bus/pci/devices/0000:02:01.0",
"sys/bus/pci/devices/0000:02:01.1",
"sys/devices/pci0000:02/0000:02:00.0",
"sys/kernel/debug/qat_c6xx_0000:02:00.0/heartbeat",
},
files: map[string][]byte{
"sys/bus/pci/devices/0000:02:01.0/device": []byte("0x37c9"),
"sys/bus/pci/devices/0000:02:01.1/device": []byte("0x37c9"),
"sys/kernel/debug/qat_c6xx_0000:02:00.0/heartbeat/status": []byte("-1"),
},
symlinks: map[string]string{
"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
"sys/bus/pci/devices/0000:02:01.0/physfn": "sys/devices/pci0000:02/0000:02:00.0",
"sys/bus/pci/devices/0000:02:01.1/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
"sys/bus/pci/devices/0000:02:01.1/physfn": "sys/devices/pci0000:02/0000:02:00.0",
"sys/bus/pci/drivers/c6xx/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
"sys/bus/pci/devices/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
"sys/devices/pci0000:02/0000:02:00.0/virtfn0": "sys/bus/pci/devices/0000:02:01.0",
"sys/devices/pci0000:02/0000:02:00.0/virtfn1": "sys/bus/pci/devices/0000:02:01.1",
"sys/devices/pci0000:02/0000:02:00.0/driver": "sys/bus/pci/drivers/c6xx",
},
maxDevNum: 3,
expectedDevNum: 2,
expectedUnhealthyNum: 2,
},
}
// countUnhealthyDevices walks every device stored in the tree and returns how
// many of them have a "state" field whose formatted value equals
// pluginapi.Unhealthy. The field is looked up by name through reflection.
countUnhealthyDevices := func(tree dpapi.DeviceTree) int {
	count := 0

	for _, devices := range tree {
		for _, info := range devices {
			state := reflect.ValueOf(info).FieldByName("state")
			if fmt.Sprintf("%+v", state) != pluginapi.Unhealthy {
				continue
			}

			count++
		}
	}

	return count
}
for _, tt := range tcases {
t.Run(tt.name, func(t *testing.T) {
tmpdir, err := os.MkdirTemp("/tmp/", "qatplugin-TestScanPrivate-*")
@ -560,6 +674,10 @@ func TestScan(t *testing.T) {
t.Errorf("expected %d, but got %d devices", tt.expectedDevNum, devNum)
}
if unhealtyNum := countUnhealthyDevices(fN.tree); unhealtyNum != tt.expectedUnhealthyNum {
t.Errorf("expected %d, but got %d unhealthy devices", tt.expectedUnhealthyNum, unhealtyNum)
}
if err = os.RemoveAll(tmpdir); err != nil {
t.Fatal(err)
}