mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00
fpga_plugin: indicate unhealthy devices
When the device's firmware crashes, the OPAE driver reports all properties of the device as a stream of binary ones. This effectively makes the interface and AFU IDs look like "ffffffffffffffffffffffffffffffff". Mark such devices as Unhealthy. Closes #77.
This commit is contained in:
parent
1f236d5b23
commit
92f72e4196
@ -52,6 +52,10 @@ const (
|
||||
deviceRE = `^intel-fpga-dev.[0-9]+$`
|
||||
portRE = `^intel-fpga-port.[0-9]+$`
|
||||
fmeRE = `^intel-fpga-fme.[0-9]+$`
|
||||
|
||||
// When the device's firmware crashes, the driver reports these values
|
||||
unhealthyAfuID = "ffffffffffffffffffffffffffffffff"
|
||||
unhealthyInterfaceID = "ffffffffffffffffffffffffffffffff"
|
||||
)
|
||||
|
||||
type getDevTreeFunc func(devices []device) dpapi.DeviceTree
|
||||
@ -62,6 +66,10 @@ func getRegionDevelTree(devices []device) dpapi.DeviceTree {
|
||||
|
||||
for _, dev := range devices {
|
||||
for _, region := range dev.regions {
|
||||
health := pluginapi.Healthy
|
||||
if region.interfaceID == unhealthyInterfaceID {
|
||||
health = pluginapi.Unhealthy
|
||||
}
|
||||
devType := fmt.Sprintf("%s-%s", regionMode, region.interfaceID)
|
||||
devNodes := make([]string, len(region.afus)+1)
|
||||
for num, afu := range region.afus {
|
||||
@ -69,7 +77,7 @@ func getRegionDevelTree(devices []device) dpapi.DeviceTree {
|
||||
}
|
||||
devNodes[len(region.afus)] = region.devNode
|
||||
regionTree.AddDevice(devType, region.id, dpapi.DeviceInfo{
|
||||
State: pluginapi.Healthy,
|
||||
State: health,
|
||||
Nodes: devNodes,
|
||||
})
|
||||
}
|
||||
@ -84,13 +92,17 @@ func getRegionTree(devices []device) dpapi.DeviceTree {
|
||||
|
||||
for _, dev := range devices {
|
||||
for _, region := range dev.regions {
|
||||
health := pluginapi.Healthy
|
||||
if region.interfaceID == unhealthyInterfaceID {
|
||||
health = pluginapi.Unhealthy
|
||||
}
|
||||
devType := fmt.Sprintf("%s-%s", regionMode, region.interfaceID)
|
||||
devNodes := make([]string, len(region.afus))
|
||||
for num, afu := range region.afus {
|
||||
devNodes[num] = afu.devNode
|
||||
}
|
||||
regionTree.AddDevice(devType, region.id, dpapi.DeviceInfo{
|
||||
State: pluginapi.Healthy,
|
||||
State: health,
|
||||
Nodes: devNodes,
|
||||
})
|
||||
}
|
||||
@ -106,9 +118,13 @@ func getAfuTree(devices []device) dpapi.DeviceTree {
|
||||
for _, dev := range devices {
|
||||
for _, region := range dev.regions {
|
||||
for _, afu := range region.afus {
|
||||
health := pluginapi.Healthy
|
||||
if afu.afuID == unhealthyAfuID {
|
||||
health = pluginapi.Unhealthy
|
||||
}
|
||||
devType := fmt.Sprintf("%s-%s", afMode, afu.afuID)
|
||||
afuTree.AddDevice(devType, afu.id, dpapi.DeviceInfo{
|
||||
State: pluginapi.Healthy,
|
||||
State: health,
|
||||
Nodes: []string{afu.devNode},
|
||||
})
|
||||
}
|
||||
@ -194,7 +210,7 @@ func (dp *devicePlugin) PostAllocate(response *pluginapi.AllocateResponse) error
|
||||
return nil
|
||||
}
|
||||
|
||||
// Scan starts scanning of FPGA devices on the host
|
||||
// Scan starts scanning FPGA devices on the host
|
||||
func (dp *devicePlugin) Scan(notifier dpapi.Notifier) error {
|
||||
for {
|
||||
devTree, err := dp.scanFPGAs()
|
||||
|
@ -119,6 +119,23 @@ func getDevices() []device {
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "intel-fpga-dev.2",
|
||||
regions: []region{
|
||||
{
|
||||
id: "intel-fpga-fme.2",
|
||||
interfaceID: unhealthyInterfaceID,
|
||||
devNode: "/dev/intel-fpga-fme.2",
|
||||
afus: []afu{
|
||||
{
|
||||
id: "intel-fpga-port.2",
|
||||
afuID: unhealthyAfuID,
|
||||
devNode: "/dev/intel-fpga-port.2",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
@ -132,6 +149,10 @@ func TestGetRegionDevelTree(t *testing.T) {
|
||||
State: pluginapi.Healthy,
|
||||
Nodes: []string{"/dev/intel-fpga-port.1", "/dev/intel-fpga-fme.1"},
|
||||
})
|
||||
expected.AddDevice(regionMode+"-"+unhealthyInterfaceID, "intel-fpga-fme.2", dpapi.DeviceInfo{
|
||||
State: pluginapi.Unhealthy,
|
||||
Nodes: []string{"/dev/intel-fpga-port.2", "/dev/intel-fpga-fme.2"},
|
||||
})
|
||||
|
||||
result := getRegionDevelTree(getDevices())
|
||||
if !reflect.DeepEqual(result, expected) {
|
||||
@ -149,6 +170,10 @@ func TestGetRegionTree(t *testing.T) {
|
||||
State: pluginapi.Healthy,
|
||||
Nodes: []string{"/dev/intel-fpga-port.1"},
|
||||
})
|
||||
expected.AddDevice(regionMode+"-"+unhealthyInterfaceID, "intel-fpga-fme.2", dpapi.DeviceInfo{
|
||||
State: pluginapi.Unhealthy,
|
||||
Nodes: []string{"/dev/intel-fpga-port.2"},
|
||||
})
|
||||
|
||||
result := getRegionTree(getDevices())
|
||||
if !reflect.DeepEqual(result, expected) {
|
||||
@ -166,6 +191,10 @@ func TestGetAfuTree(t *testing.T) {
|
||||
State: pluginapi.Healthy,
|
||||
Nodes: []string{"/dev/intel-fpga-port.1"},
|
||||
})
|
||||
expected.AddDevice(afMode+"-"+unhealthyAfuID, "intel-fpga-port.2", dpapi.DeviceInfo{
|
||||
State: pluginapi.Unhealthy,
|
||||
Nodes: []string{"/dev/intel-fpga-port.2"},
|
||||
})
|
||||
|
||||
result := getAfuTree(getDevices())
|
||||
if !reflect.DeepEqual(result, expected) {
|
||||
|
Loading…
Reference in New Issue
Block a user