mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00
fpga_plugin: indicate unhealthy devices
When the device's firmware crashes, the OPAE driver reports all properties of the device as a stream of binary ones. This effectively makes interface and AFU IDs look like "ffffffffffffffffffffffffffffffff". Mark such devices as Unhealthy. Closes #77
This commit is contained in:
parent
1f236d5b23
commit
92f72e4196
@ -52,6 +52,10 @@ const (
|
|||||||
deviceRE = `^intel-fpga-dev.[0-9]+$`
|
deviceRE = `^intel-fpga-dev.[0-9]+$`
|
||||||
portRE = `^intel-fpga-port.[0-9]+$`
|
portRE = `^intel-fpga-port.[0-9]+$`
|
||||||
fmeRE = `^intel-fpga-fme.[0-9]+$`
|
fmeRE = `^intel-fpga-fme.[0-9]+$`
|
||||||
|
|
||||||
|
// When the device's firmware crashes the driver reports these values
|
||||||
|
unhealthyAfuID = "ffffffffffffffffffffffffffffffff"
|
||||||
|
unhealthyInterfaceID = "ffffffffffffffffffffffffffffffff"
|
||||||
)
|
)
|
||||||
|
|
||||||
type getDevTreeFunc func(devices []device) dpapi.DeviceTree
|
type getDevTreeFunc func(devices []device) dpapi.DeviceTree
|
||||||
@ -62,6 +66,10 @@ func getRegionDevelTree(devices []device) dpapi.DeviceTree {
|
|||||||
|
|
||||||
for _, dev := range devices {
|
for _, dev := range devices {
|
||||||
for _, region := range dev.regions {
|
for _, region := range dev.regions {
|
||||||
|
health := pluginapi.Healthy
|
||||||
|
if region.interfaceID == unhealthyInterfaceID {
|
||||||
|
health = pluginapi.Unhealthy
|
||||||
|
}
|
||||||
devType := fmt.Sprintf("%s-%s", regionMode, region.interfaceID)
|
devType := fmt.Sprintf("%s-%s", regionMode, region.interfaceID)
|
||||||
devNodes := make([]string, len(region.afus)+1)
|
devNodes := make([]string, len(region.afus)+1)
|
||||||
for num, afu := range region.afus {
|
for num, afu := range region.afus {
|
||||||
@ -69,7 +77,7 @@ func getRegionDevelTree(devices []device) dpapi.DeviceTree {
|
|||||||
}
|
}
|
||||||
devNodes[len(region.afus)] = region.devNode
|
devNodes[len(region.afus)] = region.devNode
|
||||||
regionTree.AddDevice(devType, region.id, dpapi.DeviceInfo{
|
regionTree.AddDevice(devType, region.id, dpapi.DeviceInfo{
|
||||||
State: pluginapi.Healthy,
|
State: health,
|
||||||
Nodes: devNodes,
|
Nodes: devNodes,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@ -84,13 +92,17 @@ func getRegionTree(devices []device) dpapi.DeviceTree {
|
|||||||
|
|
||||||
for _, dev := range devices {
|
for _, dev := range devices {
|
||||||
for _, region := range dev.regions {
|
for _, region := range dev.regions {
|
||||||
|
health := pluginapi.Healthy
|
||||||
|
if region.interfaceID == unhealthyInterfaceID {
|
||||||
|
health = pluginapi.Unhealthy
|
||||||
|
}
|
||||||
devType := fmt.Sprintf("%s-%s", regionMode, region.interfaceID)
|
devType := fmt.Sprintf("%s-%s", regionMode, region.interfaceID)
|
||||||
devNodes := make([]string, len(region.afus))
|
devNodes := make([]string, len(region.afus))
|
||||||
for num, afu := range region.afus {
|
for num, afu := range region.afus {
|
||||||
devNodes[num] = afu.devNode
|
devNodes[num] = afu.devNode
|
||||||
}
|
}
|
||||||
regionTree.AddDevice(devType, region.id, dpapi.DeviceInfo{
|
regionTree.AddDevice(devType, region.id, dpapi.DeviceInfo{
|
||||||
State: pluginapi.Healthy,
|
State: health,
|
||||||
Nodes: devNodes,
|
Nodes: devNodes,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@ -106,9 +118,13 @@ func getAfuTree(devices []device) dpapi.DeviceTree {
|
|||||||
for _, dev := range devices {
|
for _, dev := range devices {
|
||||||
for _, region := range dev.regions {
|
for _, region := range dev.regions {
|
||||||
for _, afu := range region.afus {
|
for _, afu := range region.afus {
|
||||||
|
health := pluginapi.Healthy
|
||||||
|
if afu.afuID == unhealthyAfuID {
|
||||||
|
health = pluginapi.Unhealthy
|
||||||
|
}
|
||||||
devType := fmt.Sprintf("%s-%s", afMode, afu.afuID)
|
devType := fmt.Sprintf("%s-%s", afMode, afu.afuID)
|
||||||
afuTree.AddDevice(devType, afu.id, dpapi.DeviceInfo{
|
afuTree.AddDevice(devType, afu.id, dpapi.DeviceInfo{
|
||||||
State: pluginapi.Healthy,
|
State: health,
|
||||||
Nodes: []string{afu.devNode},
|
Nodes: []string{afu.devNode},
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@ -194,7 +210,7 @@ func (dp *devicePlugin) PostAllocate(response *pluginapi.AllocateResponse) error
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Scan starts scanning of FPGA devices on the host
|
// Scan starts scanning FPGA devices on the host
|
||||||
func (dp *devicePlugin) Scan(notifier dpapi.Notifier) error {
|
func (dp *devicePlugin) Scan(notifier dpapi.Notifier) error {
|
||||||
for {
|
for {
|
||||||
devTree, err := dp.scanFPGAs()
|
devTree, err := dp.scanFPGAs()
|
||||||
|
@ -119,6 +119,23 @@ func getDevices() []device {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "intel-fpga-dev.2",
|
||||||
|
regions: []region{
|
||||||
|
{
|
||||||
|
id: "intel-fpga-fme.2",
|
||||||
|
interfaceID: unhealthyInterfaceID,
|
||||||
|
devNode: "/dev/intel-fpga-fme.2",
|
||||||
|
afus: []afu{
|
||||||
|
{
|
||||||
|
id: "intel-fpga-port.2",
|
||||||
|
afuID: unhealthyAfuID,
|
||||||
|
devNode: "/dev/intel-fpga-port.2",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -132,6 +149,10 @@ func TestGetRegionDevelTree(t *testing.T) {
|
|||||||
State: pluginapi.Healthy,
|
State: pluginapi.Healthy,
|
||||||
Nodes: []string{"/dev/intel-fpga-port.1", "/dev/intel-fpga-fme.1"},
|
Nodes: []string{"/dev/intel-fpga-port.1", "/dev/intel-fpga-fme.1"},
|
||||||
})
|
})
|
||||||
|
expected.AddDevice(regionMode+"-"+unhealthyInterfaceID, "intel-fpga-fme.2", dpapi.DeviceInfo{
|
||||||
|
State: pluginapi.Unhealthy,
|
||||||
|
Nodes: []string{"/dev/intel-fpga-port.2", "/dev/intel-fpga-fme.2"},
|
||||||
|
})
|
||||||
|
|
||||||
result := getRegionDevelTree(getDevices())
|
result := getRegionDevelTree(getDevices())
|
||||||
if !reflect.DeepEqual(result, expected) {
|
if !reflect.DeepEqual(result, expected) {
|
||||||
@ -149,6 +170,10 @@ func TestGetRegionTree(t *testing.T) {
|
|||||||
State: pluginapi.Healthy,
|
State: pluginapi.Healthy,
|
||||||
Nodes: []string{"/dev/intel-fpga-port.1"},
|
Nodes: []string{"/dev/intel-fpga-port.1"},
|
||||||
})
|
})
|
||||||
|
expected.AddDevice(regionMode+"-"+unhealthyInterfaceID, "intel-fpga-fme.2", dpapi.DeviceInfo{
|
||||||
|
State: pluginapi.Unhealthy,
|
||||||
|
Nodes: []string{"/dev/intel-fpga-port.2"},
|
||||||
|
})
|
||||||
|
|
||||||
result := getRegionTree(getDevices())
|
result := getRegionTree(getDevices())
|
||||||
if !reflect.DeepEqual(result, expected) {
|
if !reflect.DeepEqual(result, expected) {
|
||||||
@ -166,6 +191,10 @@ func TestGetAfuTree(t *testing.T) {
|
|||||||
State: pluginapi.Healthy,
|
State: pluginapi.Healthy,
|
||||||
Nodes: []string{"/dev/intel-fpga-port.1"},
|
Nodes: []string{"/dev/intel-fpga-port.1"},
|
||||||
})
|
})
|
||||||
|
expected.AddDevice(afMode+"-"+unhealthyAfuID, "intel-fpga-port.2", dpapi.DeviceInfo{
|
||||||
|
State: pluginapi.Unhealthy,
|
||||||
|
Nodes: []string{"/dev/intel-fpga-port.2"},
|
||||||
|
})
|
||||||
|
|
||||||
result := getAfuTree(getDevices())
|
result := getAfuTree(getDevices())
|
||||||
if !reflect.DeepEqual(result, expected) {
|
if !reflect.DeepEqual(result, expected) {
|
||||||
|
Loading…
Reference in New Issue
Block a user