fpga_plugin: indicate unhealthy devices

When the device's firmware crashes, the OPAE driver reports all properties
of the device as a stream of binary ones. This effectively makes the
interface and AFU IDs look like "ffffffffffffffffffffffffffffffff".

Mark such devices as Unhealthy.

closes #77
This commit is contained in:
Dmitry Rozhkov 2018-08-13 11:47:47 +03:00
parent 1f236d5b23
commit 92f72e4196
2 changed files with 49 additions and 4 deletions

View File

@ -52,6 +52,10 @@ const (
deviceRE = `^intel-fpga-dev.[0-9]+$`
portRE = `^intel-fpga-port.[0-9]+$`
fmeRE = `^intel-fpga-fme.[0-9]+$`
// When the device's firmware crashes, the driver reports these all-ones values as the IDs
unhealthyAfuID = "ffffffffffffffffffffffffffffffff"
unhealthyInterfaceID = "ffffffffffffffffffffffffffffffff"
)
type getDevTreeFunc func(devices []device) dpapi.DeviceTree
@ -62,6 +66,10 @@ func getRegionDevelTree(devices []device) dpapi.DeviceTree {
for _, dev := range devices {
for _, region := range dev.regions {
health := pluginapi.Healthy
if region.interfaceID == unhealthyInterfaceID {
health = pluginapi.Unhealthy
}
devType := fmt.Sprintf("%s-%s", regionMode, region.interfaceID)
devNodes := make([]string, len(region.afus)+1)
for num, afu := range region.afus {
@ -69,7 +77,7 @@ func getRegionDevelTree(devices []device) dpapi.DeviceTree {
}
devNodes[len(region.afus)] = region.devNode
regionTree.AddDevice(devType, region.id, dpapi.DeviceInfo{
State: pluginapi.Healthy,
State: health,
Nodes: devNodes,
})
}
@ -84,13 +92,17 @@ func getRegionTree(devices []device) dpapi.DeviceTree {
for _, dev := range devices {
for _, region := range dev.regions {
health := pluginapi.Healthy
if region.interfaceID == unhealthyInterfaceID {
health = pluginapi.Unhealthy
}
devType := fmt.Sprintf("%s-%s", regionMode, region.interfaceID)
devNodes := make([]string, len(region.afus))
for num, afu := range region.afus {
devNodes[num] = afu.devNode
}
regionTree.AddDevice(devType, region.id, dpapi.DeviceInfo{
State: pluginapi.Healthy,
State: health,
Nodes: devNodes,
})
}
@ -106,9 +118,13 @@ func getAfuTree(devices []device) dpapi.DeviceTree {
for _, dev := range devices {
for _, region := range dev.regions {
for _, afu := range region.afus {
health := pluginapi.Healthy
if afu.afuID == unhealthyAfuID {
health = pluginapi.Unhealthy
}
devType := fmt.Sprintf("%s-%s", afMode, afu.afuID)
afuTree.AddDevice(devType, afu.id, dpapi.DeviceInfo{
State: pluginapi.Healthy,
State: health,
Nodes: []string{afu.devNode},
})
}
@ -194,7 +210,7 @@ func (dp *devicePlugin) PostAllocate(response *pluginapi.AllocateResponse) error
return nil
}
// Scan starts scanning of FPGA devices on the host
// Scan starts scanning FPGA devices on the host
func (dp *devicePlugin) Scan(notifier dpapi.Notifier) error {
for {
devTree, err := dp.scanFPGAs()

View File

@ -119,6 +119,23 @@ func getDevices() []device {
},
},
},
{
name: "intel-fpga-dev.2",
regions: []region{
{
id: "intel-fpga-fme.2",
interfaceID: unhealthyInterfaceID,
devNode: "/dev/intel-fpga-fme.2",
afus: []afu{
{
id: "intel-fpga-port.2",
afuID: unhealthyAfuID,
devNode: "/dev/intel-fpga-port.2",
},
},
},
},
},
}
}
@ -132,6 +149,10 @@ func TestGetRegionDevelTree(t *testing.T) {
State: pluginapi.Healthy,
Nodes: []string{"/dev/intel-fpga-port.1", "/dev/intel-fpga-fme.1"},
})
expected.AddDevice(regionMode+"-"+unhealthyInterfaceID, "intel-fpga-fme.2", dpapi.DeviceInfo{
State: pluginapi.Unhealthy,
Nodes: []string{"/dev/intel-fpga-port.2", "/dev/intel-fpga-fme.2"},
})
result := getRegionDevelTree(getDevices())
if !reflect.DeepEqual(result, expected) {
@ -149,6 +170,10 @@ func TestGetRegionTree(t *testing.T) {
State: pluginapi.Healthy,
Nodes: []string{"/dev/intel-fpga-port.1"},
})
expected.AddDevice(regionMode+"-"+unhealthyInterfaceID, "intel-fpga-fme.2", dpapi.DeviceInfo{
State: pluginapi.Unhealthy,
Nodes: []string{"/dev/intel-fpga-port.2"},
})
result := getRegionTree(getDevices())
if !reflect.DeepEqual(result, expected) {
@ -166,6 +191,10 @@ func TestGetAfuTree(t *testing.T) {
State: pluginapi.Healthy,
Nodes: []string{"/dev/intel-fpga-port.1"},
})
expected.AddDevice(afMode+"-"+unhealthyAfuID, "intel-fpga-port.2", dpapi.DeviceInfo{
State: pluginapi.Unhealthy,
Nodes: []string{"/dev/intel-fpga-port.2"},
})
result := getAfuTree(getDevices())
if !reflect.DeepEqual(result, expected) {