gpu: change 'none' allocation policy

With shared-dev-num and multiple i915s in the resource request,
try to find as many individual GPUs as possible to expose to the container.

Previously, with multiple i915 resources, it was typical to
get only one GPU device in the container.

Co-authored-by: Eero Tamminen <eero.t.tamminen@intel.com>
Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
This commit is contained in:
Tuomas Katila 2023-04-28 15:12:03 +03:00 committed by Ukri Niemimuukko
parent e6681fa2c5
commit 4e645d823c
2 changed files with 74 additions and 9 deletions

View File

@ -64,15 +64,58 @@ type cliOptions struct {
type preferredAllocationPolicyFunc func(*pluginapi.ContainerPreferredAllocationRequest) []string
// nonePolicy selects req.AllocationSize device IDs from req.AvailableDeviceIDs
// without any balancing or packing, while trying to spread the selection over
// as many individual GPUs as possible.
//
// The GPU a device ID belongs to is taken to be the part of the ID before the
// first "-" (e.g. "card0" for "card0-4"). The selection is done in two passes:
//
//  1. Walk the available IDs in order and take the first ID seen for each GPU.
//  2. If that did not yield enough IDs, fill up with the not yet selected IDs,
//     again in availability order.
//
// The returned slice is never nil, contains no duplicates, and lists the IDs in
// the order they were picked, so the result is deterministic for a given
// request. A non-positive AllocationSize yields an empty slice, and at most
// len(req.AvailableDeviceIDs) IDs are ever returned.
func nonePolicy(req *pluginapi.ContainerPreferredAllocationRequest) []string {
	klog.V(2).Info("Select nonePolicy for GPU device allocation")

	// Clamp the request to what can actually be served. Without the lower
	// bound, a zero or negative size would never hit the "enough selected"
	// condition and one ID per GPU would be returned instead of none.
	needed := int(req.AllocationSize)
	if needed > len(req.AvailableDeviceIDs) {
		needed = len(req.AvailableDeviceIDs)
	}

	if needed < 0 {
		needed = 0
	}

	deviceIDs := make([]string, 0, needed)
	selected := make(map[string]bool, needed)
	usedGPUs := make(map[string]bool, needed)

	// When shared-dev-num is greater than 1, several IDs map to the same GPU.
	// First try to find as many independent GPUs as possible.
	for _, deviceID := range req.AvailableDeviceIDs {
		if len(deviceIDs) == needed {
			break
		}

		gpu := strings.SplitN(deviceID, "-", 2)[0]
		if usedGPUs[gpu] {
			continue
		}

		usedGPUs[gpu] = true
		selected[deviceID] = true
		deviceIDs = append(deviceIDs, deviceID)
	}

	// If there were not enough independent GPUs, use remaining untaken deviceIDs.
	for _, deviceID := range req.AvailableDeviceIDs {
		if len(deviceIDs) == needed {
			break
		}

		if selected[deviceID] {
			continue
		}

		selected[deviceID] = true
		deviceIDs = append(deviceIDs, deviceID)
	}

	klog.V(2).Infof("Allocate deviceIds: %q", deviceIDs)

	return deviceIDs
}
// balancedPolicy is used for allocating GPU devices in balance.

View File

@ -20,6 +20,7 @@ import (
"path"
"path/filepath"
"reflect"
"sort"
"testing"
"github.com/pkg/errors"
@ -99,12 +100,21 @@ func TestGetPreferredAllocation(t *testing.T) {
rqt := &v1beta1.PreferredAllocationRequest{
ContainerRequests: []*v1beta1.ContainerPreferredAllocationRequest{
{
AvailableDeviceIDs: []string{"card0-4", "card1-1", "card2-3", "card2-4", "card2-1", "card1-0", "card1-4", "card3-4", "card1-2", "card0-1", "card2-0", "card2-2", "card1-3", "card0-2", "card3-0", "card3-3", "card0-3", "card0-0", "card3-1", "card3-2"},
AvailableDeviceIDs: []string{"card0-4", "card0-2", "card1-1", "card2-3", "card2-4", "card2-1", "card1-0", "card1-4", "card3-4", "card1-2", "card0-1", "card2-0", "card2-2", "card1-3", "card3-0", "card3-3", "card0-3", "card0-0", "card3-1", "card3-2"},
AllocationSize: 4,
},
},
}
rqtNotEnough := &v1beta1.PreferredAllocationRequest{
ContainerRequests: []*v1beta1.ContainerPreferredAllocationRequest{
{
AvailableDeviceIDs: []string{"card0-1", "card0-2", "card0-3", "card1-1"},
AllocationSize: 3,
},
},
}
rqtErr := &v1beta1.PreferredAllocationRequest{
ContainerRequests: []*v1beta1.ContainerPreferredAllocationRequest{
{
@ -117,22 +127,24 @@ func TestGetPreferredAllocation(t *testing.T) {
plugin := newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "none"})
response, _ := plugin.GetPreferredAllocation(rqt)
if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-4", "card1-1", "card2-3", "card2-4"}) {
t.Error("Unexpected return value for none preferred allocation")
sort.Strings(response.ContainerResponses[0].DeviceIDs)
if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-4", "card1-1", "card2-3", "card3-4"}) {
t.Error("Unexpected return value for none preferred allocation", response.ContainerResponses[0].DeviceIDs)
}
plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "balanced"})
response, _ = plugin.GetPreferredAllocation(rqt)
if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-0", "card1-0", "card2-0", "card3-0"}) {
t.Error("Unexpected return value for balanced preferred allocation")
t.Error("Unexpected return value for balanced preferred allocation", response.ContainerResponses[0].DeviceIDs)
}
plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "packed"})
response, _ = plugin.GetPreferredAllocation(rqt)
if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-0", "card0-1", "card0-2", "card0-3"}) {
t.Error("Unexpected return value for packed preferred allocation")
t.Error("Unexpected return value for packed preferred allocation", response.ContainerResponses[0].DeviceIDs)
}
plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "none"})
@ -141,6 +153,16 @@ func TestGetPreferredAllocation(t *testing.T) {
if response != nil {
t.Error("Fail to handle the input error that req.AllocationSize is greater than len(req.AvailableDeviceIDs).")
}
plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "none"})
response, _ = plugin.GetPreferredAllocation(rqtNotEnough)
sort.Strings(response.ContainerResponses[0].DeviceIDs)
if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-1", "card0-2", "card1-1"}) {
t.Error("Unexpected return value for none preferred allocation with too few separate devices",
response.ContainerResponses[0].DeviceIDs)
}
}
func TestAllocate(t *testing.T) {