From 4e645d823ca0e87742b45f1939bbd522316a6c2c Mon Sep 17 00:00:00 2001 From: Tuomas Katila Date: Fri, 28 Apr 2023 15:12:03 +0300 Subject: [PATCH] gpu: change 'none' allocation policy With shared-dev-num and multiple i915s in the resource request, try to find as many individual GPUs as possible to expose to the container. Previously, with multiple i915 resources, it was typical to get only one GPU device in the container. Co-authored-by: Eero Tamminen Signed-off-by: Tuomas Katila --- cmd/gpu_plugin/gpu_plugin.go | 51 ++++++++++++++++++++++++++++--- cmd/gpu_plugin/gpu_plugin_test.go | 32 ++++++++++++++++--- 2 files changed, 74 insertions(+), 9 deletions(-) diff --git a/cmd/gpu_plugin/gpu_plugin.go b/cmd/gpu_plugin/gpu_plugin.go index d820974b..23a497a9 100644 --- a/cmd/gpu_plugin/gpu_plugin.go +++ b/cmd/gpu_plugin/gpu_plugin.go @@ -64,15 +64,58 @@ type cliOptions struct { type preferredAllocationPolicyFunc func(*pluginapi.ContainerPreferredAllocationRequest) []string -// nonePolicy is used for allocating GPU devices randomly. +// nonePolicy is used for allocating GPU devices randomly, while trying +// to select as many individual GPU devices as requested. func nonePolicy(req *pluginapi.ContainerPreferredAllocationRequest) []string { klog.V(2).Info("Select nonePolicy for GPU device allocation") - deviceIds := req.AvailableDeviceIDs[0:req.AllocationSize] + devices := make(map[string]bool) + selected := make(map[string]bool) + neededCount := req.AllocationSize - klog.V(2).Infof("Allocate deviceIds: %q", deviceIds) + // When shared-dev-num is greater than 1, try to find as + // many independent GPUs as possible, to satisfy the request. 
- return deviceIds + for _, deviceID := range req.AvailableDeviceIDs { + device := strings.Split(deviceID, "-")[0] + + if _, found := devices[device]; !found { + devices[device] = true + selected[deviceID] = true + neededCount-- + + if neededCount == 0 { + break + } + } + } + + // If there were not enough independent GPUs, use remaining untaken deviceIDs. + + if neededCount > 0 { + for _, deviceID := range req.AvailableDeviceIDs { + if _, found := selected[deviceID]; !found { + selected[deviceID] = true + neededCount-- + + if neededCount == 0 { + break + } + } + } + } + + // Convert selected map into an array + + deviceIDs := []string{} + + for deviceID := range selected { + deviceIDs = append(deviceIDs, deviceID) + } + + klog.V(2).Infof("Allocate deviceIds: %q", deviceIDs) + + return deviceIDs } // balancedPolicy is used for allocating GPU devices in balance. diff --git a/cmd/gpu_plugin/gpu_plugin_test.go b/cmd/gpu_plugin/gpu_plugin_test.go index 62536093..a5b59519 100644 --- a/cmd/gpu_plugin/gpu_plugin_test.go +++ b/cmd/gpu_plugin/gpu_plugin_test.go @@ -20,6 +20,7 @@ import ( "path" "path/filepath" "reflect" + "sort" "testing" "github.com/pkg/errors" @@ -99,12 +100,21 @@ func TestGetPreferredAllocation(t *testing.T) { rqt := &v1beta1.PreferredAllocationRequest{ ContainerRequests: []*v1beta1.ContainerPreferredAllocationRequest{ { - AvailableDeviceIDs: []string{"card0-4", "card1-1", "card2-3", "card2-4", "card2-1", "card1-0", "card1-4", "card3-4", "card1-2", "card0-1", "card2-0", "card2-2", "card1-3", "card0-2", "card3-0", "card3-3", "card0-3", "card0-0", "card3-1", "card3-2"}, + AvailableDeviceIDs: []string{"card0-4", "card0-2", "card1-1", "card2-3", "card2-4", "card2-1", "card1-0", "card1-4", "card3-4", "card1-2", "card0-1", "card2-0", "card2-2", "card1-3", "card3-0", "card3-3", "card0-3", "card0-0", "card3-1", "card3-2"}, AllocationSize: 4, }, }, } + rqtNotEnough := &v1beta1.PreferredAllocationRequest{ + ContainerRequests: 
[]*v1beta1.ContainerPreferredAllocationRequest{ + { + AvailableDeviceIDs: []string{"card0-1", "card0-2", "card0-3", "card1-1"}, + AllocationSize: 3, + }, + }, + } + rqtErr := &v1beta1.PreferredAllocationRequest{ ContainerRequests: []*v1beta1.ContainerPreferredAllocationRequest{ { @@ -117,22 +127,24 @@ func TestGetPreferredAllocation(t *testing.T) { plugin := newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "none"}) response, _ := plugin.GetPreferredAllocation(rqt) - if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-4", "card1-1", "card2-3", "card2-4"}) { - t.Error("Unexpected return value for none preferred allocation") + sort.Strings(response.ContainerResponses[0].DeviceIDs) + + if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-4", "card1-1", "card2-3", "card3-4"}) { + t.Error("Unexpected return value for none preferred allocation", response.ContainerResponses[0].DeviceIDs) } plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "balanced"}) response, _ = plugin.GetPreferredAllocation(rqt) if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-0", "card1-0", "card2-0", "card3-0"}) { - t.Error("Unexpected return value for balanced preferred allocation") + t.Error("Unexpected return value for balanced preferred allocation", response.ContainerResponses[0].DeviceIDs) } plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "packed"}) response, _ = plugin.GetPreferredAllocation(rqt) if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-0", "card0-1", "card0-2", "card0-3"}) { - t.Error("Unexpected return value for packed preferred allocation") + t.Error("Unexpected return value for packed preferred allocation", response.ContainerResponses[0].DeviceIDs) } plugin = newDevicePlugin("", "", 
cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "none"}) @@ -141,6 +153,16 @@ func TestGetPreferredAllocation(t *testing.T) { if response != nil { t.Error("Fail to handle the input error that req.AllocationSize is greater than len(req.AvailableDeviceIDs).") } + + plugin = newDevicePlugin("", "", cliOptions{sharedDevNum: 5, resourceManagement: false, preferredAllocationPolicy: "none"}) + response, _ = plugin.GetPreferredAllocation(rqtNotEnough) + + sort.Strings(response.ContainerResponses[0].DeviceIDs) + + if !reflect.DeepEqual(response.ContainerResponses[0].DeviceIDs, []string{"card0-1", "card0-2", "card1-1"}) { + t.Error("Unexpected return value for none preferred allocation with too few separate devices", + response.ContainerResponses[0].DeviceIDs) + } } func TestAllocate(t *testing.T) {