diff --git a/cmd/gpu_plugin/gpu_plugin.go b/cmd/gpu_plugin/gpu_plugin.go index c078480e..2e118739 100644 --- a/cmd/gpu_plugin/gpu_plugin.go +++ b/cmd/gpu_plugin/gpu_plugin.go @@ -197,6 +197,10 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi // Implement the PreferredAllocator interface. func (dp *devicePlugin) GetPreferredAllocation(rqt *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) { + if dp.resMan != nil { + return dp.resMan.GetPreferredFractionalAllocation(rqt) + } + response := &pluginapi.PreferredAllocationResponse{} for _, req := range rqt.ContainerRequests { @@ -356,7 +360,7 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) { func (dp *devicePlugin) Allocate(request *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { if dp.resMan != nil { - return dp.resMan.ReallocateWithFractionalResources(request) + return dp.resMan.CreateFractionalResourceResponse(request) } return nil, &dpapi.UseDefaultMethodError{} diff --git a/cmd/gpu_plugin/gpu_plugin_test.go b/cmd/gpu_plugin/gpu_plugin_test.go index bdf41627..a4c30460 100644 --- a/cmd/gpu_plugin/gpu_plugin_test.go +++ b/cmd/gpu_plugin/gpu_plugin_test.go @@ -48,11 +48,15 @@ func (n *mockNotifier) Notify(newDeviceTree dpapi.DeviceTree) { type mockResourceManager struct{} -func (m *mockResourceManager) ReallocateWithFractionalResources(*v1beta1.AllocateRequest) (*v1beta1.AllocateResponse, error) { +func (m *mockResourceManager) CreateFractionalResourceResponse(*v1beta1.AllocateRequest) (*v1beta1.AllocateResponse, error) { return &v1beta1.AllocateResponse{}, &dpapi.UseDefaultMethodError{} } func (m *mockResourceManager) SetDevInfos(rm.DeviceInfoMap) {} +func (m *mockResourceManager) GetPreferredFractionalAllocation(*v1beta1.PreferredAllocationRequest) (*v1beta1.PreferredAllocationResponse, error) { + return &v1beta1.PreferredAllocationResponse{}, &dpapi.UseDefaultMethodError{} +} + func createTestFiles(root string, devfsdirs, sysfsdirs []string, sysfsfiles map[string][]byte) (string, string, error) { sysfs := path.Join(root, "sys") devfs := path.Join(root, "dev") diff --git a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go b/cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go index 59628384..fcf2ae92 100644 --- a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go +++ b/cmd/gpu_plugin/rm/gpu_plugin_resource_manager.go @@ -47,8 +47,6 @@ const ( grpcAddress = "unix:///var/lib/kubelet/pod-resources/kubelet.sock" grpcBufferSize = 4 * 1024 * 1024 grpcTimeout = 5 * time.Second - - retryTimeout = 1 * time.Second ) // Errors. @@ -63,12 +61,15 @@ func (e *zeroPendingErr) Error() string { } type podCandidate struct { - pod *v1.Pod - name string - allocatedContainers int - allocationTargetNum int + pod *v1.Pod + name string + allocatedContainerCount int + allocationTargetNum int } +// DeviceInfo is a subset of deviceplugin.DeviceInfo +// It's a lighter version of the full DeviceInfo as it is used +// to store fractional devices. type DeviceInfo struct { envs map[string]string nodes []pluginapi.DeviceSpec @@ -79,18 +80,32 @@ type getClientFunc func(string, time.Duration, int) (podresourcesv1.PodResources // ResourceManager interface for the fractional resource handling. type ResourceManager interface { - ReallocateWithFractionalResources(*pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) + CreateFractionalResourceResponse(*pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) + GetPreferredFractionalAllocation(*pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) SetDevInfos(DeviceInfoMap) } +type ContainerAssignments struct { + deviceIds map[string]bool + tileEnv string +} + +type PodAssignmentDetails struct { + containers []ContainerAssignments +} + type resourceManager struct { - deviceInfos DeviceInfoMap - nodeName string clientset kubernetes.Interface + deviceInfos DeviceInfoMap prGetClientFunc getClientFunc + assignments map[string]PodAssignmentDetails // pod name -> assignment details + nodeName string skipID string fullResourceName string + retryTimeout time.Duration + cleanupInterval time.Duration mutex sync.RWMutex // for devTree updates during scan + cleanupMutex sync.RWMutex // for assignment details during cleanup } // NewDeviceInfo creates a new DeviceInfo. @@ -124,32 +139,104 @@ func NewResourceManager(skipID, fullResourceName string) (ResourceManager, error skipID: skipID, fullResourceName: fullResourceName, prGetClientFunc: podresources.GetV1Client, + assignments: make(map[string]PodAssignmentDetails), + retryTimeout: 1 * time.Second, + cleanupInterval: 2 * time.Minute, } klog.Info("GPU device plugin resource manager enabled") + go func() { + ticker := time.NewTicker(rm.cleanupInterval) + + for range ticker.C { + klog.V(4).Info("Running cleanup") + + // Gather both running and pending pods. It might happen that + // cleanup is triggered between GetPreferredAllocation and Allocate + // and it would remove the assignment data for the soon-to-be allocated pod + running := rm.listPodsOnNodeWithState(string(v1.PodRunning)) + for podName, podItem := range rm.listPodsOnNodeWithState(string(v1.PodPending)) { + running[podName] = podItem + } + + func() { + rm.cleanupMutex.Lock() + defer rm.cleanupMutex.Unlock() + + for podName := range rm.assignments { + if _, found := running[podName]; !found { + klog.V(4).Info("Removing from assignments: ", podName) + delete(rm.assignments, podName) + } + } + }() + + klog.V(4).Info("Cleanup done") + } + }() + return &rm, nil } -// ReallocateWithFractionalResources runs the fractional resource logic. +// Generate a unique key for Pod. +func getPodKey(pod *v1.Pod) string { + return pod.Namespace + "&" + pod.Name +} + +// Generate a unique key for PodResources. +func getPodResourceKey(res *podresourcesv1.PodResources) string { + return res.Namespace + "&" + res.Name +} + +func (rm *resourceManager) listPodsOnNodeWithState(state string) map[string]*v1.Pod { + pods := make(map[string]*v1.Pod) + + selector, err := fields.ParseSelector("spec.nodeName=" + rm.nodeName + + ",status.phase=" + state) + + if err != nil { + return pods + } + + podList, err := rm.clientset.CoreV1().Pods(v1.NamespaceAll).List(context.Background(), metav1.ListOptions{ + FieldSelector: selector.String(), + }) + + if err != nil { + return pods + } + + for i := range podList.Items { + key := getPodKey(&podList.Items[i]) + pods[key] = &podList.Items[i] + } + + return pods +} + +// CreateFractionalResourceResponse returns allocate response with the details +// assigned in GetPreferredFractionalAllocation // This intentionally only logs errors and returns with the UseDefaultMethodError, // in case any errors are hit. This is to avoid clusters filling up with unexpected admission errors. -func (rm *resourceManager) ReallocateWithFractionalResources(request *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { - if !isInputOk(request, rm.skipID) { +func (rm *resourceManager) CreateFractionalResourceResponse(request *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) { + if !isAllocateRequestOk(request, rm.skipID) { // it is better to leave allocated gpu devices as is and return return nil, &dpapi.UseDefaultMethodError{} } + klog.V(4).Info("Proposed device ids: ", request.ContainerRequests[0].DevicesIDs) + podCandidate, err := rm.findAllocationPodCandidate() - if _, ok := err.(*retryErr); ok { + if errors.Is(err, &retryErr{}) { klog.Warning("retrying POD resolving after sleeping") - time.Sleep(retryTimeout) + time.Sleep(rm.retryTimeout) podCandidate, err = rm.findAllocationPodCandidate() } if err != nil { - if _, ok := err.(*zeroPendingErr); !ok { + if !errors.Is(err, &zeroPendingErr{}) { klog.Error("allocation candidate not found, perhaps the GPU scheduler extender is not called, err:", err) } // it is better to leave allocated gpu devices as is and return @@ -157,13 +244,170 @@ func (rm *resourceManager) ReallocateWithFractionalResources(request *pluginapi. } pod := podCandidate.pod - cards := containerCards(pod, podCandidate.allocatedContainers) - affinityMask := containerTileAffinityMask(pod, podCandidate.allocatedContainers) - return rm.createAllocateResponse(cards, affinityMask) + rm.cleanupMutex.Lock() + + assignment, found := rm.assignments[getPodKey(pod)] + if !found { + rm.cleanupMutex.Unlock() + klog.Error("couldn't find allocation info from assignments:", getPodKey(pod)) + + return nil, &dpapi.UseDefaultMethodError{} + } + + containerIndex := podCandidate.allocatedContainerCount + + affinityMask := assignment.containers[containerIndex].tileEnv + getPrefDevices := assignment.containers[containerIndex].deviceIds + + rm.cleanupMutex.Unlock() + + devIds := request.ContainerRequests[0].DevicesIDs + + // Check if all the preferred devices were also used + if len(devIds) != len(getPrefDevices) { + klog.Warningf("Allocate called with odd number of device IDs: %d vs %d", len(devIds), len(getPrefDevices)) + } + + for _, devID := range devIds { + if _, found := getPrefDevices[devID]; !found { + klog.Warningf("Not preferred device used in Allocate: %s (%v)", devID, getPrefDevices) + } + } + + klog.V(4).Info("Allocate affinity mask: ", affinityMask) + klog.V(4).Info("Allocate device ids: ", devIds) + + return rm.createAllocateResponse(devIds, affinityMask) } -func isInputOk(rqt *pluginapi.AllocateRequest, skipID string) bool { +func (rm *resourceManager) GetPreferredFractionalAllocation(request *pluginapi.PreferredAllocationRequest) ( + *pluginapi.PreferredAllocationResponse, error) { + if !isPreferredAllocationRequestOk(request, rm.skipID) { + // it is better to leave allocated gpu devices as is and return + return &pluginapi.PreferredAllocationResponse{}, nil + } + + klog.V(4).Info("GetPreferredAllocation request: ", request) + + podCandidate, err := rm.findAllocationPodCandidate() + if errors.Is(err, &retryErr{}) { + klog.Warning("retrying POD resolving after sleeping") + time.Sleep(rm.retryTimeout) + + podCandidate, err = rm.findAllocationPodCandidate() + } + + if err != nil { + if !errors.Is(err, &zeroPendingErr{}) { + klog.Error("allocation candidate not found, perhaps the GPU scheduler extender is not called, err:", err) + } + + // Return empty response as returning an error causes + // the pod to be labeled as UnexpectedAdmissionError + return &pluginapi.PreferredAllocationResponse{}, nil + } + + pod := podCandidate.pod + containerIndex := podCandidate.allocatedContainerCount + cards := containerCards(pod, containerIndex) + affinityMask := containerTileAffinityMask(pod, containerIndex) + podKey := getPodKey(pod) + + creq := request.ContainerRequests[0] + + klog.V(4).Info("Get preferred fractional allocation: ", + podKey, creq.AllocationSize, creq.MustIncludeDeviceIDs, creq.AvailableDeviceIDs) + + deviceIds := selectDeviceIDsForContainer( + int(creq.AllocationSize), cards, creq.AvailableDeviceIDs, creq.MustIncludeDeviceIDs) + + // Map container assignment details per pod name + + rm.cleanupMutex.Lock() + + assignments, found := rm.assignments[podKey] + + if !found { + assignments.containers = make([]ContainerAssignments, podCandidate.allocationTargetNum) + } + + assignments.containers[containerIndex].tileEnv = affinityMask + // Store device ids so we can double check the ones in Allocate + assignments.containers[containerIndex].deviceIds = make(map[string]bool) + for _, devID := range deviceIds { + assignments.containers[containerIndex].deviceIds[devID] = true + } + + rm.assignments[podKey] = assignments + + rm.cleanupMutex.Unlock() + + klog.V(4).Info("Selected devices for container: ", deviceIds) + + response := pluginapi.PreferredAllocationResponse{ + ContainerResponses: []*pluginapi.ContainerPreferredAllocationResponse{ + {DeviceIDs: deviceIds}, + }, + } + + return &response, nil +} + +// selectDeviceIDsForContainer selects suitable device ids from deviceIds and mustHaveDeviceIds +// the selection is guided by the cards list. +func selectDeviceIDsForContainer(requestedCount int, cards, deviceIds, mustHaveDeviceIds []string) []string { + getBaseCard := func(deviceId string) string { + return strings.Split(deviceId, "-")[0] + } + + if requestedCount < len(cards) { + klog.Warningf("Requested count is less than card count: %d vs %d.", requestedCount, len(cards)) + cards = cards[0:requestedCount] + } + + if requestedCount > len(cards) { + klog.Warningf("Requested count is higher than card count: %d vs %d.", requestedCount, len(cards)) + } + + // map of cardX -> device id list + available := map[string][]string{} + // Keep the last used index so we can pick the next one + availableIndex := map[string]int{} + + // Place must have IDs first so they get used + for _, devID := range mustHaveDeviceIds { + baseCard := getBaseCard(devID) + available[baseCard] = append(available[baseCard], devID) + } + + for _, devID := range deviceIds { + baseCard := getBaseCard(devID) + available[baseCard] = append(available[baseCard], devID) + } + + selected := []string{} + + for _, card := range cards { + indexNow := availableIndex[card] + + availableDevices, found := available[card] + if !found { + klog.Warningf("card %s is not found from known devices: %v", card, available) + continue + } + + if indexNow < len(availableDevices) { + selected = append(selected, availableDevices[indexNow]) + indexNow++ + availableIndex[card] = indexNow + } + } + + return selected +} + +func isAllocateRequestOk(rqt *pluginapi.AllocateRequest, skipID string) bool { // so far kubelet calls allocate for each container separately. If that changes, we need to refine our logic. if len(rqt.ContainerRequests) != 1 { klog.Warning("multi-container allocation request not supported") @@ -180,6 +424,23 @@ func isInputOk(rqt *pluginapi.AllocateRequest, skipID string) bool { return true } +func isPreferredAllocationRequestOk(rqt *pluginapi.PreferredAllocationRequest, skipID string) bool { + // so far kubelet calls allocate for each container separately. If that changes, we need to refine our logic. + if len(rqt.ContainerRequests) != 1 { + klog.Warning("multi-container allocation request not supported") + return false + } + + crqt := rqt.ContainerRequests[0] + for _, id := range crqt.AvailableDeviceIDs { + if id == skipID { + return false // intentionally not printing anything, this request is skipped + } + } + + return true +} + // findAllocationPodCandidate tries to find the best allocation candidate pod, which must be: // -pending for this node // -using GPU resources in its spec @@ -230,6 +491,7 @@ func (rm *resourceManager) findAllocationPodCandidate() (*podCandidate, error) { } } + // .name here refers to a namespace+name combination sort.Slice(timestampedCandidates, func(i, j int) bool { return pendingPods[timestampedCandidates[i].name].Annotations[gasTSAnnotation] < @@ -249,29 +511,11 @@ func (rm *resourceManager) findAllocationPodCandidate() (*podCandidate, error) { // getNodePendingGPUPods returns a map of pod names -> pods that are pending and use the gpu. func (rm *resourceManager) getNodePendingGPUPods() (map[string]*v1.Pod, error) { - selector, err := fields.ParseSelector("spec.nodeName=" + rm.nodeName + - ",status.phase=" + string(v1.PodPending)) + pendingPods := rm.listPodsOnNodeWithState(string(v1.PodPending)) - if err != nil { - return nil, errors.Wrap(err, "unable to parse selector") - } - - pendingPodList, err := rm.clientset.CoreV1().Pods(v1.NamespaceAll).List(context.Background(), metav1.ListOptions{ - FieldSelector: selector.String(), - }) - - if err != nil { - return nil, errors.Wrap(err, "unable to list pods") - } - - // make a map ouf of the list, accept only GPU-using pods - pendingPods := make(map[string]*v1.Pod) - - for i := range pendingPodList.Items { - pod := &pendingPodList.Items[i] - - if numGPUUsingContainers(pod, rm.fullResourceName) > 0 { - pendingPods[pod.Name] = pod + for podName, pod := range pendingPods { + if numGPUUsingContainers(pod, rm.fullResourceName) == 0 { + delete(pendingPods, podName) } } @@ -312,14 +556,16 @@ func (rm *resourceManager) findAllocationPodCandidates(pendingPods map[string]*v } } - if pod, pending := pendingPods[podRes.Name]; pending { + key := getPodResourceKey(podRes) + + if pod, pending := pendingPods[key]; pending { allocationTargetNum := numGPUUsingContainers(pod, rm.fullResourceName) if numContainersAllocated < allocationTargetNum { candidate := podCandidate{ - pod: pod, - name: podRes.Name, - allocatedContainers: numContainersAllocated, - allocationTargetNum: allocationTargetNum, + pod: pod, + name: key, + allocatedContainerCount: numContainersAllocated, + allocationTargetNum: allocationTargetNum, } candidates = append(candidates, candidate) } @@ -335,19 +581,17 @@ func (rm *resourceManager) SetDevInfos(deviceInfos DeviceInfoMap) { rm.deviceInfos = deviceInfos } -func (rm *resourceManager) createAllocateResponse(cards []string, tileAffinityMask string) (*pluginapi.AllocateResponse, error) { +func (rm *resourceManager) createAllocateResponse(deviceIds []string, tileAffinityMask string) (*pluginapi.AllocateResponse, error) { rm.mutex.Lock() defer rm.mutex.Unlock() allocateResponse := pluginapi.AllocateResponse{} cresp := pluginapi.ContainerAllocateResponse{} - for _, card := range cards { - newDeviceID := card + "-0" - - dev, ok := rm.deviceInfos[newDeviceID] + for _, devID := range deviceIds { + dev, ok := rm.deviceInfos[devID] if !ok { - klog.Warningf("No device info for %q, using default allocation method devices", newDeviceID) + klog.Warningf("No device info for %q, using default allocation method devices", devID) return nil, &dpapi.UseDefaultMethodError{} } @@ -376,6 +620,7 @@ func (rm *resourceManager) createAllocateResponse(cards []string, tileAffinityMa if cresp.Envs == nil { cresp.Envs = make(map[string]string) } + cresp.Envs[levelZeroAffinityMaskEnvVar] = tileAffinityMask } @@ -416,7 +661,7 @@ func containerCards(pod *v1.Pod, gpuUsingContainerIndex int) []string { cards := strings.Split(cardList, ",") if len(cards) > 0 && len(cardList) > 0 { if gpuUsingContainerIndex == i { - klog.V(3).Infof("Cards for container nr %v in pod %v are %v", gpuUsingContainerIndex, pod.Name, cards) + klog.V(3).Infof("Cards for container nr %v in pod %v are %v", gpuUsingContainerIndex, getPodKey(pod), cards) return cards } i++ @@ -429,40 +674,42 @@ func containerCards(pod *v1.Pod, gpuUsingContainerIndex int) []string { } func convertTileInfoToEnvMask(tileInfo string) string { - cardTileList := strings.Split(tileInfo, ",") + cards := strings.Split(tileInfo, ",") - var tileIndices []string + tileIndices := make([]string, len(cards)) - for i, cardList := range cardTileList { - cards := strings.Split(cardList, ",") + for i, cardTileCombos := range cards { + cardTileSplit := strings.Split(cardTileCombos, ":") + if len(cardTileSplit) != 2 { + klog.Warningf("invalid card tile combo string (%v)", cardTileCombos) + return "" + } - for _, cardTileCombos := range cards { - cardTileSplit := strings.Split(cardTileCombos, ":") - if len(cardTileSplit) < 2 { - klog.Warningf("invalid card tile combo string (%v)", cardTileCombos) + tiles := strings.Split(cardTileSplit[1], "+") + + var combos []string + + for _, tile := range tiles { + if !strings.HasPrefix(tile, "gt") { + klog.Warningf("invalid tile syntax (%v)", tile) return "" } - tiles := strings.Split(cardTileSplit[1], "+") + tileNoStr := strings.TrimPrefix(tile, "gt") + tileNo, err := strconv.ParseInt(tileNoStr, 10, 16) - var combos []string - - for _, tile := range tiles { - tileNoStr := strings.TrimPrefix(tile, "gt") - tileNo, err := strconv.ParseInt(tileNoStr, 10, 16) - - if err != nil { - continue - } - - levelZeroCardTileCombo := - strconv.FormatInt(int64(i), 10) + "." + - strconv.FormatInt(int64(tileNo), 10) - combos = append(combos, levelZeroCardTileCombo) + if err != nil { + klog.Warningf("invalid tile syntax (%v)", tile) + return "" } - tileIndices = append(tileIndices, strings.Join(combos, ",")) + levelZeroCardTileCombo := + strconv.FormatInt(int64(i), 10) + "." + + strconv.FormatInt(tileNo, 10) + combos = append(combos, levelZeroCardTileCombo) } + + tileIndices[i] = strings.Join(combos, ",") } return strings.Join(tileIndices, ",") @@ -472,11 +719,12 @@ func convertTileInfoToEnvMask(tileInfo string) string { // Indices should be passed to level zero env variable to guide execution // gpuUsingContainerIndex 0 == first gpu-using container in the pod. // annotation example: -// gas-container-tiles=card0:gt0+gt1,card1:gt0|card2:gt1+gt2||card0:gt3 +// gas-container-tiles=card0:gt0+gt1,card1:gt0|card2:gt1+gt2||card0:gt3. func containerTileAffinityMask(pod *v1.Pod, gpuUsingContainerIndex int) string { fullAnnotation := pod.Annotations[gasTileAnnotation] + onlyDividers := strings.Count(fullAnnotation, "|") == len(fullAnnotation) - if fullAnnotation == "" { + if fullAnnotation == "" || onlyDividers { return "" } @@ -484,6 +732,7 @@ func containerTileAffinityMask(pod *v1.Pod, gpuUsingContainerIndex int) string { klog.Infof("%s:%v", fullAnnotation, tileLists) i := 0 + for _, containerTileInfo := range tileLists { if len(containerTileInfo) == 0 { continue @@ -497,6 +746,7 @@ func containerTileAffinityMask(pod *v1.Pod, gpuUsingContainerIndex int) string { } klog.Warningf("couldn't find tile info for gpu using container index %v", gpuUsingContainerIndex) + return "" } diff --git a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go b/cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go index 2ded853d..4bb3b733 100644 --- a/cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go +++ b/cmd/gpu_plugin/rm/gpu_plugin_resource_manager_test.go @@ -72,7 +72,7 @@ func (w *mockPodResources) List(ctx context.Context, resp := podresourcesv1.ListPodResourcesResponse{} for _, pod := range w.pods { resp.PodResources = append(resp.PodResources, &podresourcesv1.PodResources{ - Name: pod.ObjectMeta.Name, Containers: []*podresourcesv1.ContainerResources{{}}, + Name: pod.ObjectMeta.Name, Namespace: pod.ObjectMeta.Namespace, Containers: []*podresourcesv1.ContainerResources{{}}, }) } @@ -101,6 +101,8 @@ func newMockResourceManager(pods []v1.Pod) ResourceManager { }, skipID: "all", fullResourceName: "gpu.intel.com/i915", + assignments: make(map[string]PodAssignmentDetails), + retryTimeout: 1 * time.Millisecond, } deviceInfoMap := NewDeviceInfoMap() @@ -120,11 +122,21 @@ func newMockResourceManager(pods []v1.Pod) ResourceManager { return &rm } +type preferredTestCase struct { + name string + pods []v1.Pod + containerRequests []*v1beta1.ContainerPreferredAllocationRequest + expectDevices []string + expectedContainerLen int +} + type testCase struct { - name string - pods []v1.Pod - containerRequests []*v1beta1.ContainerAllocateRequest - expectErr bool + name string + pods []v1.Pod + prefContainerRequests []*v1beta1.ContainerPreferredAllocationRequest + containerRequests []*v1beta1.ContainerAllocateRequest + prefExpectErr bool + expectErr bool } func TestNewResourceManager(t *testing.T) { @@ -136,7 +148,166 @@ func TestNewResourceManager(t *testing.T) { } } -func TestReallocateWithFractionalResources(t *testing.T) { +func TestGetPreferredFractionalAllocation(t *testing.T) { + properTestPod := v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{gasCardAnnotation: "card0"}, + Name: "TestPod", + Namespace: "neimspeis", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "gpu.intel.com/i915": resource.MustParse("1"), + }, + }, + }, + }, + }, + } + + gpuLessTestPod := v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "TestPodLessGpu", + Namespace: "neimspeis", + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{ + { + Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "gpu.less.com/i915": resource.MustParse("1"), + }, + }, + }, + }, + }, + } + + properTestPodMultiGpu := *properTestPod.DeepCopy() + properTestPodMultiGpu.ObjectMeta.Annotations[gasCardAnnotation] = "card0,card1" + + properTestPodMultiGpu2 := *properTestPod.DeepCopy() + properTestPodMultiGpu2.ObjectMeta.Annotations[gasCardAnnotation] = "card0,card1,card0" + + monitoringPod := *properTestPod.DeepCopy() + delete(monitoringPod.Spec.Containers[0].Resources.Requests, "gpu.intel.com/i915") + monitoringPod.Spec.Containers[0].Resources.Requests["gpu.intel.com/i915_monitoring"] = resource.MustParse("1") + + allContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ + {AvailableDeviceIDs: []string{"all"}, + AllocationSize: 1}, + } + + properPrefContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ + {AvailableDeviceIDs: []string{"card0-0", "card0-1", "card1-0", "card1-1"}, + AllocationSize: 1}, + } + + outofRangePrefContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ + {AvailableDeviceIDs: []string{"card6-0", "card5-1"}, + AllocationSize: 1}, + } + + mustHaveContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ + {AvailableDeviceIDs: []string{"card0-0", "card0-1", "card1-0", "card1-1"}, + MustIncludeDeviceIDs: []string{"card0-2"}, + AllocationSize: 2}, + } + + properPrefContainerRequests3 := []*v1beta1.ContainerPreferredAllocationRequest{ + {AvailableDeviceIDs: []string{"card0-0", "card0-1", "card1-0", "card1-1"}, + AllocationSize: 3}, + } + + testCases := []preferredTestCase{ + { + name: "Wrong number of container requests should result in empty response", + pods: []v1.Pod{properTestPod}, + containerRequests: nil, + expectedContainerLen: 0, + }, + { + name: "Proper number of containers with good devices", + pods: []v1.Pod{properTestPod}, + containerRequests: properPrefContainerRequests, + expectDevices: []string{"card0-0"}, + expectedContainerLen: 1, + }, + { + name: "Inconsistent devices vs. gas' annotated ones", + pods: []v1.Pod{properTestPod}, + containerRequests: outofRangePrefContainerRequests, + expectDevices: []string{}, + expectedContainerLen: 1, + }, + { + name: "Preferred allocation is with must have device ids", + pods: []v1.Pod{properTestPodMultiGpu}, + containerRequests: mustHaveContainerRequests, + expectDevices: []string{"card0-2", "card1-0"}, + expectedContainerLen: 1, + }, + { + name: "Duplicate card requesting pod", + pods: []v1.Pod{properTestPodMultiGpu2}, + containerRequests: properPrefContainerRequests3, + expectDevices: []string{"card0-0", "card1-0", "card0-1"}, + expectedContainerLen: 1, + }, + { + name: "Allocation size is larger than cards assigned", + pods: []v1.Pod{properTestPodMultiGpu}, + containerRequests: properPrefContainerRequests3, + expectDevices: []string{"card0-0", "card1-0"}, + expectedContainerLen: 1, + }, + { + name: "Monitoring pod is being allocated", + pods: []v1.Pod{monitoringPod}, + containerRequests: allContainerRequests, + expectDevices: []string{}, + expectedContainerLen: 0, + }, + { + name: "Two pods with one without GPU", + pods: []v1.Pod{properTestPod, gpuLessTestPod}, + containerRequests: properPrefContainerRequests, + expectDevices: []string{"card0-0"}, + expectedContainerLen: 1, + }, + } + + for _, tCase := range testCases { + rm := newMockResourceManager(tCase.pods) + resp, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ + ContainerRequests: tCase.containerRequests, + }) + + if perr != nil { + t.Errorf("test %v unexpected failure, err:%v", tCase.name, perr) + } + + if perr == nil { + // check response + expectTruef(len(resp.ContainerResponses) == tCase.expectedContainerLen, t, tCase.name, "wrong number of container responses, expected 1") + + if len(tCase.expectDevices) > 0 { + expectTruef(len(resp.ContainerResponses[0].DeviceIDs) == len(tCase.expectDevices), t, tCase.name, + "wrong number of device ids: %d (%v)", len(resp.ContainerResponses[0].DeviceIDs), resp.ContainerResponses[0].DeviceIDs) + + for i, expecteDevice := range tCase.expectDevices { + expectTruef(resp.ContainerResponses[0].DeviceIDs[i] == expecteDevice, t, tCase.name, + "wrong device id selected: %s", resp.ContainerResponses[0].DeviceIDs[i]) + } + } + } + } +} + +func TestCreateFractionalResourceResponse(t *testing.T) { properTestPod := v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Annotations: map[string]string{gasCardAnnotation: "card0"}, @@ -167,54 +338,81 @@ func TestReallocateWithFractionalResources(t *testing.T) { properContainerRequests := []*v1beta1.ContainerAllocateRequest{{DevicesIDs: []string{"card0-0"}}} + properPrefContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ + {AvailableDeviceIDs: []string{"card0-0", "card0-1", "card1-0", "card1-1"}, + AllocationSize: 1}, + } + testCases := []testCase{ { - name: "Wrong number of container requests should fail", - pods: []v1.Pod{properTestPod}, - containerRequests: []*v1beta1.ContainerAllocateRequest{}, - expectErr: true, + name: "Wrong number of container requests should fail", + pods: []v1.Pod{properTestPod}, + prefContainerRequests: properPrefContainerRequests, + prefExpectErr: false, + containerRequests: []*v1beta1.ContainerAllocateRequest{}, + expectErr: true, }, { - name: "Request for monitor resource should fail", - pods: []v1.Pod{properTestPod}, - containerRequests: []*v1beta1.ContainerAllocateRequest{{DevicesIDs: []string{"all"}}}, - expectErr: true, + name: "Request for monitor resource should fail", + pods: []v1.Pod{properTestPod}, + prefContainerRequests: properPrefContainerRequests, + prefExpectErr: true, + containerRequests: []*v1beta1.ContainerAllocateRequest{{DevicesIDs: []string{"all"}}}, + expectErr: true, }, { - name: "Zero pending pods should fail", - pods: []v1.Pod{}, - containerRequests: properContainerRequests, - expectErr: true, + name: "Zero pending pods should fail", + pods: []v1.Pod{}, + prefContainerRequests: properPrefContainerRequests, + prefExpectErr: true, + containerRequests: properContainerRequests, + expectErr: true, }, { - name: "Single pending pod without annotations should fail", - pods: []v1.Pod{unAnnotatedTestPod}, - containerRequests: properContainerRequests, - expectErr: true, + name: "Single pending pod without annotations should fail", + pods: []v1.Pod{unAnnotatedTestPod}, + prefContainerRequests: properPrefContainerRequests, + prefExpectErr: true, + containerRequests: properContainerRequests, + expectErr: true, }, { - name: "Single pending pod should succeed", - pods: []v1.Pod{properTestPod}, - containerRequests: properContainerRequests, - expectErr: false, + name: "Single pending pod should succeed", + pods: []v1.Pod{properTestPod}, + prefContainerRequests: properPrefContainerRequests, + prefExpectErr: true, + containerRequests: properContainerRequests, + expectErr: false, }, { - name: "Two pending pods without timestamps should fail", - pods: []v1.Pod{properTestPod, properTestPod2}, - containerRequests: properContainerRequests, - expectErr: true, + name: "Two pending pods without timestamps should fail", + pods: []v1.Pod{properTestPod, properTestPod2}, + prefContainerRequests: properPrefContainerRequests, + prefExpectErr: true, + containerRequests: properContainerRequests, + expectErr: true, }, { - name: "Two pending pods with timestamps should reduce to one candidate and succeed", - pods: []v1.Pod{timeStampedProperTestPod, timeStampedProperTestPod2}, - containerRequests: properContainerRequests, - expectErr: false, + name: "Two pending pods with timestamps should reduce to one candidate and succeed", + pods: []v1.Pod{timeStampedProperTestPod, timeStampedProperTestPod2}, + prefContainerRequests: properPrefContainerRequests, + prefExpectErr: true, + containerRequests: properContainerRequests, + expectErr: false, }, } for _, tCase := range testCases { rm := newMockResourceManager(tCase.pods) - resp, err := rm.ReallocateWithFractionalResources(&v1beta1.AllocateRequest{ + _, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ + ContainerRequests: tCase.prefContainerRequests, + }) + + if (perr != nil) && !tCase.prefExpectErr { + t.Errorf("test %v unexpected failure, err:%v", tCase.name, perr) + } + + resp, err := rm.CreateFractionalResourceResponse(&v1beta1.AllocateRequest{ ContainerRequests: tCase.containerRequests, }) @@ -239,7 +437,7 @@ func TestReallocateWithFractionalResources(t *testing.T) { } } -func TestReallocateWithFractionalResourcesWithOneCardTwoTiles(t *testing.T) { +func TestCreateFractionalResourceResponseWithOneCardTwoTiles(t *testing.T) { properTestPod := v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Annotations: map[string]string{ @@ -260,38 +458,49 @@ func TestReallocateWithFractionalResourcesWithOneCardTwoTiles(t *testing.T) { }, } + properPrefContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ + {AvailableDeviceIDs: []string{"card0-0", "card0-1", "card1-0", "card1-1"}, + AllocationSize: 1}, + } + properContainerRequests := []*v1beta1.ContainerAllocateRequest{{DevicesIDs: []string{"card0-0"}}} tCase := testCase{ - name: "Single pending pod with two tiles should succeed", - pods: []v1.Pod{properTestPod}, - containerRequests: properContainerRequests, - expectErr: false, + name: "Single pending pod with two tiles should succeed", + pods: []v1.Pod{properTestPod}, + prefContainerRequests: properPrefContainerRequests, + prefExpectErr: false, + containerRequests: properContainerRequests, + expectErr: false, } rm := newMockResourceManager(tCase.pods) - resp, err := rm.ReallocateWithFractionalResources(&v1beta1.AllocateRequest{ + + _, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ + ContainerRequests: tCase.prefContainerRequests, + }) + + if (perr != nil) && !tCase.prefExpectErr { + t.Errorf("test %v unexpected failure, err:%v", tCase.name, perr) + } + + resp, err := rm.CreateFractionalResourceResponse(&v1beta1.AllocateRequest{ ContainerRequests: tCase.containerRequests, }) if (err != nil) && !tCase.expectErr { t.Errorf("test %v unexpected failure, err:%v", tCase.name, err) } - if err == nil { - if tCase.expectErr { - t.Errorf("test %v unexpected success", tCase.name) - } else { - // check response - expectTruef(len(resp.ContainerResponses) == 1, t, tCase.name, "wrong number of container responses, expected 1") - expectTruef(len(resp.ContainerResponses[0].Envs) == 2, t, tCase.name, "wrong number of env variables in container response, expected 2") - expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set") - expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] == "0.0,0.1", t, tCase.name, "l0 affinity mask is incorrect") - expectTruef(len(resp.ContainerResponses[0].Devices) == 1, t, tCase.name, "wrong number of devices, expected 1") - } - } + + // check response + expectTruef(len(resp.ContainerResponses) == 1, t, tCase.name, "wrong number of container responses, expected 1") + expectTruef(len(resp.ContainerResponses[0].Envs) == 2, t, tCase.name, "wrong number of env variables in container response, expected 2") + expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] != "", t, tCase.name, "l0 tile mask not set") + expectTruef(resp.ContainerResponses[0].Envs[levelZeroAffinityMaskEnvVar] == "0.0,0.1", t, tCase.name, "l0 affinity mask is incorrect") + expectTruef(len(resp.ContainerResponses[0].Devices) == 1, t, tCase.name, "wrong number of devices, expected 1") } -func TestReallocateWithFractionalResourcesWithTwoCardsOneTile(t *testing.T) { +func TestCreateFractionalResourceResponseWithTwoCardsOneTile(t *testing.T) { properTestPod := v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Annotations: map[string]string{ @@ -312,23 +521,40 @@ func TestReallocateWithFractionalResourcesWithTwoCardsOneTile(t *testing.T) { }, } + properPrefContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ + {AvailableDeviceIDs: []string{"card0-0", "card0-1", "card1-0", "card1-1"}, + AllocationSize: 1}, + } + properContainerRequests := []*v1beta1.ContainerAllocateRequest{{DevicesIDs: []string{"card1-0", "card2-0"}}} tCase := testCase{ - name: "Single pending pod with two cards and one tile each should succeed", - pods: []v1.Pod{properTestPod}, - containerRequests: properContainerRequests, - expectErr: false, + name: "Single pending pod with two cards and one tile each should succeed", + pods: []v1.Pod{properTestPod}, + prefContainerRequests: properPrefContainerRequests, + prefExpectErr: false, + containerRequests: properContainerRequests, + expectErr: false, } rm := newMockResourceManager(tCase.pods) - resp, err := rm.ReallocateWithFractionalResources(&v1beta1.AllocateRequest{ + + _, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ + ContainerRequests: tCase.prefContainerRequests, + }) + + if (perr != nil) && !tCase.prefExpectErr { + t.Errorf("test %v unexpected failure, err:%v", tCase.name, perr) + } + + resp, err := rm.CreateFractionalResourceResponse(&v1beta1.AllocateRequest{ ContainerRequests: tCase.containerRequests, }) if (err != nil) && !tCase.expectErr { t.Errorf("test %v unexpected failure, err:%v", tCase.name, err) } + if err == nil { if tCase.expectErr { t.Errorf("test %v unexpected success", tCase.name) @@ -342,7 +568,7 @@ func TestReallocateWithFractionalResourcesWithTwoCardsOneTile(t *testing.T) { } } -func TestReallocateWithFractionalResourcesWithThreeCardsTwoTiles(t *testing.T) { +func TestCreateFractionalResourceResponseWithThreeCardsTwoTiles(t *testing.T) { properTestPod := v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Annotations: map[string]string{ @@ -363,23 +589,40 @@ func TestReallocateWithFractionalResourcesWithThreeCardsTwoTiles(t *testing.T) { }, } + properPrefContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ + {AvailableDeviceIDs: []string{"card0-0", "card0-1", "card1-0", "card1-1", "card2-0", "card2-1"}, + AllocationSize: 1}, + } + properContainerRequests := []*v1beta1.ContainerAllocateRequest{{DevicesIDs: []string{"card0-0", "card1-0", "card2-0"}}} tCase := testCase{ - name: "Single pending pod with three cards and two tiles each should succeed", - pods: []v1.Pod{properTestPod}, - containerRequests: properContainerRequests, - expectErr: false, + name: "Single pending pod with three cards and two tiles each should succeed", + pods: []v1.Pod{properTestPod}, + prefContainerRequests: properPrefContainerRequests, + prefExpectErr: false, + containerRequests: properContainerRequests, + expectErr: false, } rm := newMockResourceManager(tCase.pods) - resp, err := rm.ReallocateWithFractionalResources(&v1beta1.AllocateRequest{ + + _, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ + ContainerRequests: tCase.prefContainerRequests, + }) + + if (perr != nil) && !tCase.prefExpectErr { + t.Errorf("test %v unexpected failure, err:%v", tCase.name, perr) + } + + resp, err := rm.CreateFractionalResourceResponse(&v1beta1.AllocateRequest{ ContainerRequests: tCase.containerRequests, }) if (err != nil) && !tCase.expectErr { t.Errorf("test %v unexpected failure, err:%v", tCase.name, err) } + if err == nil { if tCase.expectErr { t.Errorf("test %v unexpected success", tCase.name) @@ -393,7 +636,7 @@ func TestReallocateWithFractionalResourcesWithThreeCardsTwoTiles(t *testing.T) { } } -func TestReallocateWithFractionalResourcesWithMultipleContainersTileEach(t *testing.T) { +func TestCreateFractionalResourceResponseWithMultipleContainersTileEach(t *testing.T) { properTestPod := v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Annotations: map[string]string{ @@ -421,23 +664,44 @@ func TestReallocateWithFractionalResourcesWithMultipleContainersTileEach(t *test }, } - properContainerRequests := []*v1beta1.ContainerAllocateRequest{{DevicesIDs: []string{"card1-0"}}, {DevicesIDs: []string{"card2-0"}}} + properPrefContainerRequests := []*v1beta1.ContainerPreferredAllocationRequest{ + {AvailableDeviceIDs: []string{"card0-0", "card0-1", "card1-0", "card1-1", "card2-0", "card2-1"}, + AllocationSize: 1}, + } + _ = properPrefContainerRequests + + properContainerRequests := []*v1beta1.ContainerAllocateRequest{ + {DevicesIDs: []string{"card1-0"}}, + {DevicesIDs: []string{"card2-0"}}, + } tCase := testCase{ - name: "Single pending pod with two containers with one tile each should fail", - pods: []v1.Pod{properTestPod}, - containerRequests: properContainerRequests, - expectErr: true, + name: "Single pending pod with two containers with one tile each should FAIL", + pods: []v1.Pod{properTestPod}, + prefContainerRequests: properPrefContainerRequests, + prefExpectErr: false, + containerRequests: properContainerRequests, + expectErr: true, } rm := newMockResourceManager(tCase.pods) - _, err := rm.ReallocateWithFractionalResources(&v1beta1.AllocateRequest{ + + _, perr := rm.GetPreferredFractionalAllocation(&v1beta1.PreferredAllocationRequest{ + ContainerRequests: properPrefContainerRequests, + }) + + if (perr != nil) && !tCase.prefExpectErr { + t.Errorf("test %v unexpected failure, err:%v", tCase.name, perr) + } + + _, err := rm.CreateFractionalResourceResponse(&v1beta1.AllocateRequest{ ContainerRequests: tCase.containerRequests, }) if (err != nil) && !tCase.expectErr { t.Errorf("test %v unexpected failure, err:%v", tCase.name, err) } + if err == nil { if tCase.expectErr { t.Errorf("test %v unexpected success", tCase.name) @@ -448,8 +712,8 @@ func TestReallocateWithFractionalResourcesWithMultipleContainersTileEach(t *test func TestTileAnnotationParsing(t *testing.T) { type parseTest struct { line string - index int result string + index int } parseTests := []parseTest{ @@ -488,11 +752,36 @@ func TestTileAnnotationParsing(t *testing.T) { index: 2, result: "0.0", }, + { + line: "||card5:gt0,card6:gt4||", + index: 0, + result: "0.0,1.4", + }, + { + line: "||card5:gt0,card6:gt4||", + index: 1, + result: "", + }, + { + line: "||card5:gt0,card:6:gt4||", + index: 0, + result: "", + }, + { + line: "||card5:gt0,card6:gt+gt+gt||", + index: 0, + result: "", + }, { line: "card1:gtX", index: 0, result: "", }, + { + line: "card1:64X", + index: 0, + result: "", + }, { line: "|", index: 0, @@ -529,6 +818,65 @@ func TestTileAnnotationParsing(t *testing.T) { } } +func TestSelectDeviceIDsForContainerDoubleCards(t *testing.T) { + cards := []string{ + "card0", + "card1", + } + + deviceIds := []string{ + "card0-0", + "card0-1", + "card0-2", + "card1-0", + "card1-1", + "card1-2", + } + + selected := selectDeviceIDsForContainer(2, cards, deviceIds, []string{}) + if len(selected) != 2 { + t.Errorf("Not the correct amount of devices were selected") + } + + correctDevices := map[string]bool{ + "card0-0": false, + "card1-0": false, + } + + for _, selected := range selected { + correctDevices[selected] = true + } + + for dev, used := range correctDevices { + if !used { + t.Errorf("correct device was not used: %s", dev) + } + } +} + +func TestSelectDeviceIDsForContainerSingleCard(t *testing.T) { + cards := []string{ + "card2", + } + + deviceIds := []string{ + "card0-0", + "card0-1", + "card1-0", + "card2-0", + "card2-1", + } + + selected := selectDeviceIDsForContainer(1, cards, deviceIds, []string{}) + if len(selected) != 1 { + t.Errorf("Not the correct amount of devices were selected") + } + + if selected[0] != "card2-0" { + t.Errorf("First selection is wrong: %s vs %s", selected[0], "card2-0") + } +} + func expectTruef(predicate bool, t *testing.T, testName, format string, args ...interface{}) { if !predicate { t.Helper()