From c7162df440c4df369cbd066efcde38fbb4e4c018 Mon Sep 17 00:00:00 2001
From: Tuomas Katila
Date: Mon, 9 Oct 2023 14:37:59 +0300
Subject: [PATCH] qat: add heartbeat check and use that as a device healthiness indicator

Signed-off-by: Tuomas Katila
---
 .golangci.yml                          |   1 +
 cmd/qat_plugin/dpdkdrv/dpdkdrv.go      |  46 ++++++-
 cmd/qat_plugin/dpdkdrv/dpdkdrv_test.go | 138 +++++++++++++++++++++++--
 3 files changed, 174 insertions(+), 11 deletions(-)

diff --git a/.golangci.yml b/.golangci.yml
index abd6af85..da9285c1 100644
--- a/.golangci.yml
+++ b/.golangci.yml
@@ -67,3 +67,4 @@ issues:
         # Until the testing package allows pinning variables disable scopelint
         # for tests. See https://github.com/kyoh86/scopelint/issues/4.
         - scopelint
+        - gocognit
diff --git a/cmd/qat_plugin/dpdkdrv/dpdkdrv.go b/cmd/qat_plugin/dpdkdrv/dpdkdrv.go
index abb59c68..c2966a33 100644
--- a/cmd/qat_plugin/dpdkdrv/dpdkdrv.go
+++ b/cmd/qat_plugin/dpdkdrv/dpdkdrv.go
@@ -393,6 +393,44 @@ func readDeviceConfiguration(pfDev string) string {
 	return devCfg.Section("GENERAL").Key("ServicesEnabled").String()
 }
 
+// getDeviceHealthiness returns the health state of the VF found at the sysfs path
+// device, based on the heartbeat status of its parent PF. Results are cached in
+// lookup, keyed by the resolved PF path, so that all VFs of a PF share one answer.
+// The device is reported as Healthy whenever the PF or its heartbeat status file
+// cannot be resolved or read.
+func getDeviceHealthiness(device string, lookup map[string]string) string {
+	healthiness := pluginapi.Healthy
+
+	pfDev, err := filepath.EvalSymlinks(filepath.Join(device, "physfn"))
+	if err != nil {
+		klog.Warningf("failed to get PF device ID for %s: %q", filepath.Base(device), err)
+		return healthiness
+	}
+
+	// VFs share one PF, so all the VFs should return the same result.
+	if cached, found := lookup[pfDev]; found {
+		return cached
+	}
+
+	// Try to find the PF's heartbeat status. If unable to, return Healthy.
+	driver := getCurrentDriver(pfDev)
+
+	hbStatusFile := filepath.Join(filepath.Dir(filepath.Join(pfDev, "../../")), "kernel/debug",
+		fmt.Sprintf("qat_%s_%s/heartbeat/status", driver, filepath.Base(pfDev)))
+
+	// If status reads "-1", the device is considered bad:
+	// https://github.com/torvalds/linux/blob/v6.6-rc5/Documentation/ABI/testing/debugfs-driver-qat
+	// Only the leading "-1" is compared, because the value read from debugfs may be
+	// followed by a trailing newline or NUL byte.
+	if data, err := os.ReadFile(hbStatusFile); err == nil && len(data) >= 2 && string(data[:2]) == "-1" {
+		healthiness = pluginapi.Unhealthy
+	}
+
+	lookup[pfDev] = healthiness
+
+	return healthiness
+}
+
 func getDeviceCapabilities(device string) (string, error) {
 	devID, err := getDeviceID(device)
 	if err != nil {
@@ -583,6 +621,8 @@ func (dp *DevicePlugin) scan() (dpapi.DeviceTree, error) {
 	devTree := dpapi.NewDeviceTree()
 	n := 0
 
+	pfHealthLookup := map[string]string{}
+
 	for _, vfDevice := range dp.getVfDevices() {
 		vfBdf := filepath.Base(vfDevice)
 
@@ -610,14 +650,16 @@ func (dp *DevicePlugin) scan() (dpapi.DeviceTree, error) {
 			return nil, err
 		}
 
-		klog.V(1).Infof("Device %s with %s capabilities found", vfBdf, cap)
+		healthiness := getDeviceHealthiness(vfDevice, pfHealthLookup)
+
+		klog.V(1).Infof("Device %s with %s capabilities found (%s)", vfBdf, cap, healthiness)
 
 		n = n + 1
 		envs := map[string]string{
 			fmt.Sprintf("%s%d", envVarPrefix, n): vfBdf,
 		}
 
-		devinfo := dpapi.NewDeviceInfo(pluginapi.Healthy, dp.getDpdkDeviceSpecs(dpdkDeviceName), dp.getDpdkMounts(dpdkDeviceName), envs, nil)
+		devinfo := dpapi.NewDeviceInfo(healthiness, dp.getDpdkDeviceSpecs(dpdkDeviceName), dp.getDpdkMounts(dpdkDeviceName), envs, nil)
 		devTree.AddDevice(cap, vfBdf, devinfo)
 	}
 
diff --git a/cmd/qat_plugin/dpdkdrv/dpdkdrv_test.go b/cmd/qat_plugin/dpdkdrv/dpdkdrv_test.go
index e868fd10..b60424c2 100644
--- a/cmd/qat_plugin/dpdkdrv/dpdkdrv_test.go
+++ b/cmd/qat_plugin/dpdkdrv/dpdkdrv_test.go
@@ -16,6 +16,7 @@ package dpdkdrv
 
 import (
 	"flag"
+	"fmt"
 	"os"
 	"path"
 	"reflect"
@@ -162,15 +163,16 @@ func TestGetPreferredAllocation(t *testing.T) {
 
 func TestScan(t *testing.T) {
 	tcases := []struct {
-		name            string
-		dpdkDriver      string
-		dirs            []string
-		files           map[string][]byte
-		symlinks        map[string]string
-		kernelVfDrivers []string
-		expectedErr     bool
-		maxDevNum       int
-		expectedDevNum  int
+		name                 string
+		dpdkDriver           string
+		dirs                 []string
+		files                map[string][]byte
+		symlinks             map[string]string
+		kernelVfDrivers      []string
+		expectedErr          bool
+		maxDevNum            int
+		expectedDevNum       int
+		expectedUnhealthyNum int
 	}{
 		{
 			name: "No error returned for uninitialized device plugin",
@@ -519,7 +521,121 @@ func TestScan(t *testing.T) {
 			maxDevNum:      2,
 			expectedDevNum: 2,
 		},
+		{
+			name:            "vfio-pci DPDKdriver with no kernel bound driver and where vfdevID is equal to qatDevId (4941) heartbeat status bad",
+			dpdkDriver:      "vfio-pci",
+			kernelVfDrivers: []string{"4xxxvf"},
+			dirs: []string{
+				"sys/bus/pci/drivers/4xxx",
+				"sys/bus/pci/drivers/vfio-pci",
+				"sys/devices/pci0000:02/0000:02:00.0",
+				"sys/devices/pci0000:02/0000:02:00.0/qat",
+				"sys/kernel/debug/qat_4xxx_0000:02:00.0",
+				"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat",
+				"sys/bus/pci/devices/0000:02:01.0",
+			},
+			files: map[string][]byte{
+				"sys/devices/pci0000:02/0000:02:00.0/device":              []byte("0x4940"),
+				"sys/devices/pci0000:02/0000:02:00.0/qat/state":           []byte("up"),
+				"sys/devices/pci0000:02/0000:02:00.0/qat/cfg_services":    []byte("sym;asym"),
+				"sys/bus/pci/devices/0000:02:01.0/device":                 []byte("0x4941"),
+				"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat/status": []byte("-1"),
+			},
+			symlinks: map[string]string{
+				"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
+				"sys/bus/pci/devices/0000:02:01.0/physfn":      "sys/devices/pci0000:02/0000:02:00.0",
+				"sys/bus/pci/drivers/4xxx/0000:02:00.0":        "sys/devices/pci0000:02/0000:02:00.0",
+				"sys/bus/pci/devices/0000:02:00.0":             "sys/devices/pci0000:02/0000:02:00.0",
+				"sys/devices/pci0000:02/0000:02:00.0/virtfn0":  "sys/bus/pci/devices/0000:02:01.0",
+				"sys/devices/pci0000:02/0000:02:00.0/driver":   "sys/bus/pci/drivers/4xxx",
+			},
+			maxDevNum:            1,
+			expectedDevNum:       1,
+			expectedUnhealthyNum: 1,
+		},
+		{
+			name:            "vfio-pci DPDKdriver with no kernel bound driver and where vfdevID is equal to qatDevId (4941) heartbeat status good",
+			dpdkDriver:      "vfio-pci",
+			kernelVfDrivers: []string{"4xxxvf"},
+			dirs: []string{
+				"sys/bus/pci/drivers/4xxx",
+				"sys/bus/pci/drivers/vfio-pci",
+				"sys/devices/pci0000:02/0000:02:00.0",
+				"sys/devices/pci0000:02/0000:02:00.0/qat",
+				"sys/kernel/debug/qat_4xxx_0000:02:00.0",
+				"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat",
+				"sys/bus/pci/devices/0000:02:01.0",
+			},
+			files: map[string][]byte{
+				"sys/devices/pci0000:02/0000:02:00.0/device":              []byte("0x4940"),
+				"sys/devices/pci0000:02/0000:02:00.0/qat/state":           []byte("up"),
+				"sys/devices/pci0000:02/0000:02:00.0/qat/cfg_services":    []byte("sym;asym"),
+				"sys/bus/pci/devices/0000:02:01.0/device":                 []byte("0x4941"),
+				"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat/status": []byte("0"),
+			},
+			symlinks: map[string]string{
+				"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
+				"sys/bus/pci/devices/0000:02:01.0/physfn":      "sys/devices/pci0000:02/0000:02:00.0",
+				"sys/bus/pci/drivers/4xxx/0000:02:00.0":        "sys/devices/pci0000:02/0000:02:00.0",
+				"sys/bus/pci/devices/0000:02:00.0":             "sys/devices/pci0000:02/0000:02:00.0",
+				"sys/devices/pci0000:02/0000:02:00.0/virtfn0":  "sys/bus/pci/devices/0000:02:01.0",
+				"sys/devices/pci0000:02/0000:02:00.0/driver":   "sys/bus/pci/drivers/4xxx",
+			},
+			maxDevNum:            1,
+			expectedDevNum:       1,
+			expectedUnhealthyNum: 0,
+		},
+		{
+			name:            "vfio-pci DPDKdriver with no kernel bound driver and where vfDevID is equal to qatDevId (37c9) heartbeat status bad",
+			dpdkDriver:      "vfio-pci",
+			kernelVfDrivers: []string{"c6xxvf"},
+			dirs: []string{
+				"sys/bus/pci/drivers/c6xx",
+				"sys/bus/pci/drivers/vfio-pci",
+				"sys/bus/pci/devices/0000:02:01.0",
+				"sys/bus/pci/devices/0000:02:01.1",
+				"sys/devices/pci0000:02/0000:02:00.0",
+				"sys/kernel/debug/qat_c6xx_0000:02:00.0/heartbeat",
+			},
+			files: map[string][]byte{
+				"sys/bus/pci/devices/0000:02:01.0/device":                 []byte("0x37c9"),
+				"sys/bus/pci/devices/0000:02:01.1/device":                 []byte("0x37c9"),
+				"sys/kernel/debug/qat_c6xx_0000:02:00.0/heartbeat/status": []byte("-1\n"),
+			},
+			symlinks: map[string]string{
+				"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
+				"sys/bus/pci/devices/0000:02:01.0/physfn":      "sys/devices/pci0000:02/0000:02:00.0",
+				"sys/bus/pci/devices/0000:02:01.1/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
+				"sys/bus/pci/devices/0000:02:01.1/physfn":      "sys/devices/pci0000:02/0000:02:00.0",
+				"sys/bus/pci/drivers/c6xx/0000:02:00.0":        "sys/devices/pci0000:02/0000:02:00.0",
+				"sys/bus/pci/devices/0000:02:00.0":             "sys/devices/pci0000:02/0000:02:00.0",
+				"sys/devices/pci0000:02/0000:02:00.0/virtfn0":  "sys/bus/pci/devices/0000:02:01.0",
+				"sys/devices/pci0000:02/0000:02:00.0/virtfn1":  "sys/bus/pci/devices/0000:02:01.1",
+				"sys/devices/pci0000:02/0000:02:00.0/driver":   "sys/bus/pci/drivers/c6xx",
+			},
+			maxDevNum:            3,
+			expectedDevNum:       2,
+			expectedUnhealthyNum: 2,
+		},
 	}
+
+	// countUnhealthyDevices counts the devices in tree whose unexported "state"
+	// field, read through reflection, equals pluginapi.Unhealthy.
+	countUnhealthyDevices := func(tree dpapi.DeviceTree) int {
+		unhealthyNum := 0
+
+		for _, v := range tree {
+			for _, vv := range v {
+				field := reflect.ValueOf(vv).FieldByName("state")
+				if fmt.Sprintf("%+v", field) == pluginapi.Unhealthy {
+					unhealthyNum++
+				}
+			}
+		}
+
+		return unhealthyNum
+	}
+
 	for _, tt := range tcases {
 		t.Run(tt.name, func(t *testing.T) {
 			tmpdir, err := os.MkdirTemp("/tmp/", "qatplugin-TestScanPrivate-*")
@@ -560,6 +676,10 @@ func TestScan(t *testing.T) {
 				t.Errorf("expected %d, but got %d devices", tt.expectedDevNum, devNum)
 			}
 
+			if unhealthyNum := countUnhealthyDevices(fN.tree); unhealthyNum != tt.expectedUnhealthyNum {
+				t.Errorf("expected %d, but got %d unhealthy devices", tt.expectedUnhealthyNum, unhealthyNum)
+			}
+
 			if err = os.RemoveAll(tmpdir); err != nil {
 				t.Fatal(err)
 			}