Mirror of https://github.com/intel/intel-device-plugins-for-kubernetes.git

commit 8dd5b4aa44 (parent 402fb8d9cd)

e2e: gpu: add tests for different deployments

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
@@ -73,6 +73,8 @@ issues:
    - path: test/e2e/
      linters:
        - wsl
        - gocognit
        - gocyclo
    - path: cmd/gpu_fakedev/
      linters:
        - wsl
@@ -84,7 +84,7 @@ func describe() {
    ginkgo.Context("When PF resources are available [Resource:pf]", func() {
        ginkgo.BeforeEach(func(ctx context.Context) {
            resource := v1.ResourceName("dlb.intel.com/pf")
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resource, 30*time.Second); err != nil {
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resource, 30*time.Second, utils.WaitForPositiveResource); err != nil {
                framework.Failf("unable to wait for nodes to have positive allocatable resource %s: %v", resource, err)
            }
        })
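The only change in this hunk is the extra utils.WaitForPositiveResource argument; the same one-line change repeats in the DLB VF, DSA, runDevicePlugin, IAA, operator, QAT and SGX hunks below. The argument is a wait predicate introduced in the utils hunks at the end of this diff. As a quick reference, here is a minimal, self-contained sketch of the predicate type and the two predicates the commit adds (the demo main is illustrative, not part of the commit):

package main

import "fmt"

// WaitForResourceFunc mirrors the callback type added in the utils hunks below:
// it reports whether the observed allocatable resource count satisfies the wait.
type WaitForResourceFunc func(resourceCount int) bool

// WaitForPositiveResource waits until at least one unit of the resource is allocatable.
func WaitForPositiveResource(resourceCount int) bool { return resourceCount > 0 }

// WaitForZeroResource waits until the resource has disappeared from the node.
func WaitForZeroResource(resourceCount int) bool { return resourceCount == 0 }

func main() {
    // Callers pick the predicate that matches the state they are waiting for.
    fmt.Println(WaitForPositiveResource(2)) // true: resource advertised
    fmt.Println(WaitForZeroResource(2))     // false: still advertised
}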
@@ -101,7 +101,7 @@ func describe() {
    ginkgo.Context("When VF resources are available [Resource:vf]", func() {
        ginkgo.BeforeEach(func(ctx context.Context) {
            resource := v1.ResourceName("dlb.intel.com/vf")
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resource, 30*time.Second); err != nil {
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resource, 30*time.Second, utils.WaitForPositiveResource); err != nil {
                framework.Failf("unable to wait for nodes to have positive allocatable resource %s: %v", resource, err)
            }
        })
@@ -97,7 +97,7 @@ func describe() {
    ginkgo.Context("When DSA resources are available [Resource:dedicated]", func() {
        ginkgo.BeforeEach(func(ctx context.Context) {
            ginkgo.By("checking if the resource is allocatable")
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "dsa.intel.com/wq-user-dedicated", 300*time.Second); err != nil {
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "dsa.intel.com/wq-user-dedicated", 300*time.Second, utils.WaitForPositiveResource); err != nil {
                framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
            }
        })
@@ -129,7 +129,7 @@ func runDevicePlugin(ctx context.Context, fmw *framework.Framework, pluginKustom

    ginkgo.By("checking if the resource is allocatable")

    if err = utils.WaitForNodesWithResource(ctx, fmw.ClientSet, resource, 30*time.Second); err != nil {
    if err = utils.WaitForNodesWithResource(ctx, fmw.ClientSet, resource, 30*time.Second, utils.WaitForPositiveResource); err != nil {
        framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
    }
}
@@ -37,51 +37,112 @@ import (

const (
    kustomizationYaml   = "deployments/gpu_plugin/kustomization.yaml"
    monitoringYaml      = "deployments/gpu_plugin/overlays/monitoring_shared-dev_nfd/kustomization.yaml"
    rmEnabledYaml       = "deployments/gpu_plugin/overlays/fractional_resources//kustomization.yaml"
    nfdRulesYaml        = "deployments/nfd/overlays/node-feature-rules/kustomization.yaml"
    containerName       = "testcontainer"
    tfKustomizationYaml = "deployments/gpu_tensorflow_test/kustomization.yaml"
    tfPodName           = "training-pod"
)

func init() {
    ginkgo.Describe("GPU plugin [Device:gpu]", describe)
    // This needs to be Ordered because only one GPU plugin can function on the node at once.
    ginkgo.Describe("GPU plugin [Device:gpu]", describe, ginkgo.Ordered)
}

func createPluginAndVerifyExistence(f *framework.Framework, ctx context.Context, kustomizationPath, baseResource string) {
    ginkgo.By("deploying GPU plugin")
    e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomizationPath))

    ginkgo.By("waiting for GPU plugin's availability")
    podList, err := e2epod.WaitForPodsWithLabelRunningReady(ctx, f.ClientSet, f.Namespace.Name,
        labels.Set{"app": "intel-gpu-plugin"}.AsSelector(), 1 /* one replica */, 100*time.Second)
    if err != nil {
        e2edebug.DumpAllNamespaceInfo(ctx, f.ClientSet, f.Namespace.Name)
        e2ekubectl.LogFailedContainers(ctx, f.ClientSet, f.Namespace.Name, framework.Logf)
        framework.Failf("unable to wait for all pods to be running and ready: %v", err)
    }

    ginkgo.By("checking GPU plugin's securityContext")
    if err = utils.TestPodsFileSystemInfo(podList.Items); err != nil {
        framework.Failf("container filesystem info checks failed: %v", err)
    }

    ginkgo.By("checking if the resource is allocatable")
    if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, v1.ResourceName(baseResource), 30*time.Second, utils.WaitForPositiveResource); err != nil {
        framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
    }
}
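Together with the Ordered decorator added in init(), this helper gives every spec its own deploy/verify cycle: each It deploys one overlay via createPluginAndVerifyExistence, and the AfterEach in the context below deletes the deployment and waits for the i915 resource to drain to zero before the next overlay is applied. A minimal, self-contained sketch of that lifecycle, with stand-in deploy/delete helpers instead of kubectl and the real resource checks (only the Ginkgo structure mirrors the diff; nothing here talks to a cluster):

// lifecycle_sketch_test.go — illustrative only; deployOverlay/deleteOverlay stand in
// for "kubectl apply -k" / "kubectl delete -k" plus the allocatable-resource waits.
package sketch

import (
    "testing"

    "github.com/onsi/ginkgo/v2"
    "github.com/onsi/gomega"
)

var allocatable int

func deployOverlay(path string) { allocatable = 1 } // stand-in for apply + wait for positive resource
func deleteOverlay(path string) { allocatable = 0 } // stand-in for delete + wait for zero resource

var _ = ginkgo.Describe("GPU plugin lifecycle sketch", ginkgo.Ordered, func() {
    ginkgo.AfterEach(func() {
        // Remove whatever the previous spec deployed so the next overlay starts clean.
        deleteOverlay("deployments/gpu_plugin")
        gomega.Expect(allocatable).To(gomega.BeZero())
    })

    ginkgo.It("runs against the vanilla overlay", func() {
        deployOverlay("deployments/gpu_plugin")
        gomega.Expect(allocatable).To(gomega.BeNumerically(">", 0))
    })

    ginkgo.It("runs against the monitoring overlay", func() {
        deployOverlay("deployments/gpu_plugin/overlays/monitoring_shared-dev_nfd")
        gomega.Expect(allocatable).To(gomega.BeNumerically(">", 0))
    })
})

func TestLifecycleSketch(t *testing.T) {
    gomega.RegisterFailHandler(ginkgo.Fail)
    ginkgo.RunSpecs(t, "GPU plugin lifecycle sketch")
}

Running the specs in order with a cleanup AfterEach is what makes it safe to exercise the vanilla, monitoring and resource-manager overlays against the same node in one suite.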

func describe() {
    f := framework.NewDefaultFramework("gpuplugin")
    f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged

    kustomizationPath, errFailedToLocateRepoFile := utils.LocateRepoFile(kustomizationYaml)
    vanillaPath, errFailedToLocateRepoFile := utils.LocateRepoFile(kustomizationYaml)
    if errFailedToLocateRepoFile != nil {
        framework.Failf("unable to locate %q: %v", kustomizationYaml, errFailedToLocateRepoFile)
    }

    ginkgo.BeforeEach(func(ctx context.Context) {
        ginkgo.By("deploying GPU plugin")
        e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(kustomizationPath))
    monitoringPath, errFailedToLocateRepoFile := utils.LocateRepoFile(monitoringYaml)
    if errFailedToLocateRepoFile != nil {
        framework.Failf("unable to locate %q: %v", monitoringYaml, errFailedToLocateRepoFile)
    }

        ginkgo.By("waiting for GPU plugin's availability")
        podList, err := e2epod.WaitForPodsWithLabelRunningReady(ctx, f.ClientSet, f.Namespace.Name,
            labels.Set{"app": "intel-gpu-plugin"}.AsSelector(), 1 /* one replica */, 100*time.Second)
        if err != nil {
            e2edebug.DumpAllNamespaceInfo(ctx, f.ClientSet, f.Namespace.Name)
            e2ekubectl.LogFailedContainers(ctx, f.ClientSet, f.Namespace.Name, framework.Logf)
            framework.Failf("unable to wait for all pods to be running and ready: %v", err)
        }
    nfdRulesPath, errFailedToLocateRepoFile := utils.LocateRepoFile(nfdRulesYaml)
    if errFailedToLocateRepoFile != nil {
        framework.Failf("unable to locate %q: %v", nfdRulesYaml, errFailedToLocateRepoFile)
    }

        ginkgo.By("checking GPU plugin's securityContext")
        if err = utils.TestPodsFileSystemInfo(podList.Items); err != nil {
            framework.Failf("container filesystem info checks failed: %v", err)
        }
    })
    resourceManagerPath, errFailedToLocateRepoFile := utils.LocateRepoFile(rmEnabledYaml)
    if errFailedToLocateRepoFile != nil {
        framework.Failf("unable to locate %q: %v", rmEnabledYaml, errFailedToLocateRepoFile)
    }

    ginkgo.Context("When GPU resources are available [Resource:i915]", func() {
        ginkgo.BeforeEach(func(ctx context.Context) {
            ginkgo.By("checking if the resource is allocatable")
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/i915", 30*time.Second); err != nil {
                framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
    ginkgo.Context("When GPU plugin is deployed [Resource:i915]", func() {
        ginkgo.AfterEach(func(ctx context.Context) {
            framework.Logf("Removing gpu-plugin manually")

            e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "delete", "-k", filepath.Dir(vanillaPath))

            framework.Logf("Waiting for i915 resources to go to zero")

            // Wait for resources to go to zero
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/i915", 30*time.Second, utils.WaitForZeroResource); err != nil {
                framework.Failf("unable to wait for nodes to have no resources: %v", err)
            }
        })

        ginkgo.It("checks availability of GPU resources [App:busybox]", func(ctx context.Context) {
            createPluginAndVerifyExistence(f, ctx, vanillaPath, "gpu.intel.com/i915")

            podListFunc := framework.ListObjects(f.ClientSet.CoreV1().Pods(f.Namespace.Name).List, metav1.ListOptions{})

            pods, err := podListFunc(ctx)
            if err != nil {
                framework.Failf("Couldn't list pods: %+v", err)
            }

            if len(pods.Items) != 1 {
                framework.Failf("Invalid amount of Pods listed %d", len(pods.Items))
            }

            pluginPod := pods.Items[0]

            ginkgo.By("checking if CDI path is included in volumes")
            found := false
            for _, v := range pluginPod.Spec.Volumes {
                if v.HostPath != nil && v.HostPath.Path == "/var/run/cdi" {
                    framework.Logf("CDI volume found")
                    found = true

                    break
                }
            }

            if !found {
                framework.Fail("Couldn't find CDI volume in GPU plugin deployment")
            }

            ginkgo.By("submitting a pod requesting GPU resources")
            podSpec := &v1.Pod{
                ObjectMeta: metav1.ObjectMeta{Name: "gpuplugin-tester"},
@@ -122,7 +183,41 @@ func describe() {
            framework.Logf("found card and renderD from the log")
        })

        ginkgo.Context("When [Deployment:monitoring] deployment is applied [Resource:i915]", func() {
            ginkgo.It("check if monitoring resource is available", func(ctx context.Context) {
                createPluginAndVerifyExistence(f, ctx, monitoringPath, "gpu.intel.com/i915")

                ginkgo.By("checking if the monitoring resource is allocatable")
                if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/i915_monitoring", 30*time.Second, utils.WaitForPositiveResource); err != nil {
                    framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
                }
            })
        })

        ginkgo.Context("When [Deployment:resourceManager] deployment is applied [Resource:i915]", func() {
            ginkgo.It("check if i915 resources is available", func(ctx context.Context) {
                e2ekubectl.RunKubectlOrDie(f.Namespace.Name, "apply", "-k", filepath.Dir(nfdRulesPath))

                createPluginAndVerifyExistence(f, ctx, resourceManagerPath, "gpu.intel.com/i915")

                // To speed up extended resource detection, let's restart NFD worker
                e2ekubectl.RunKubectlOrDie("node-feature-discovery", "rollout", "restart", "daemonset", "nfd-worker")

                ginkgo.By("checking if the millicores resource is allocatable")
                if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/millicores", 30*time.Second, utils.WaitForPositiveResource); err != nil {
                    framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
                }

                ginkgo.By("checking if the tiles resource is allocatable")
                if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/tiles", 30*time.Second, utils.WaitForPositiveResource); err != nil {
                    framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
                }
            })
        })

        ginkgo.It("run a small workload on the GPU [App:tensorflow]", func(ctx context.Context) {
            createPluginAndVerifyExistence(f, ctx, vanillaPath, "gpu.intel.com/i915")

            kustomYaml, err := utils.LocateRepoFile(tfKustomizationYaml)
            if err != nil {
                framework.Failf("unable to locate %q: %v", tfKustomizationYaml, err)
@@ -146,13 +241,9 @@ func describe() {
        })

    ginkgo.Context("When GPU resources are available [Resource:xe]", func() {
        ginkgo.BeforeEach(func(ctx context.Context) {
            ginkgo.By("checking if the resource is allocatable")
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "gpu.intel.com/xe", 30*time.Second); err != nil {
                framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
            }
        })
        ginkgo.It("checks availability of GPU resources [App:busybox]", func(ctx context.Context) {
            createPluginAndVerifyExistence(f, ctx, vanillaPath, "gpu.intel.com/xe")

            ginkgo.By("submitting a pod requesting GPU resources")
            podSpec := &v1.Pod{
                ObjectMeta: metav1.ObjectMeta{Name: "gpuplugin-tester"},
@@ -97,7 +97,7 @@ func describe() {
    ginkgo.Context("When IAA resources are available [Resource:dedicated]", func() {
        ginkgo.BeforeEach(func(ctx context.Context) {
            ginkgo.By("checking if the resource is allocatable")
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "iaa.intel.com/wq-user-dedicated", 300*time.Second); err != nil {
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "iaa.intel.com/wq-user-dedicated", 300*time.Second, utils.WaitForPositiveResource); err != nil {
                framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
            }
        })
@@ -89,7 +89,7 @@ func testPluginWithOperator(deviceName string, resourceNames []v1.ResourceName,
    }

    for _, resourceName := range resourceNames {
        if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, timeout); err != nil {
        if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, timeout, utils.WaitForPositiveResource); err != nil {
            framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
        }
    }
@@ -98,7 +98,7 @@ func describeQatDpdkPlugin() {
        }

        ginkgo.By("checking if the resource is allocatable")
        if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, 30*time.Second); err != nil {
        if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, resourceName, 30*time.Second, utils.WaitForPositiveResource); err != nil {
            framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
        }
    })
@@ -82,7 +82,7 @@ func describeQatKernelPlugin() {
    ginkgo.Context("When QAT resources are available [Resource:cy1_dc0]", func() {
        ginkgo.BeforeEach(func(ctx context.Context) {
            ginkgo.By("checking if the resource is allocatable")
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "qat.intel.com/cy1_dc0", 30*time.Second); err != nil {
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "qat.intel.com/cy1_dc0", 30*time.Second, utils.WaitForPositiveResource); err != nil {
                framework.Failf("unable to wait for nodes to have positive allocatable resource: %v", err)
            }
        })
@@ -82,13 +82,13 @@ func describe() {
    ginkgo.Context("When SGX resources are available", func() {
        ginkgo.BeforeEach(func(ctx context.Context) {
            ginkgo.By("checking if the resource is allocatable")
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/epc", 150*time.Second); err != nil {
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/epc", 150*time.Second, utils.WaitForPositiveResource); err != nil {
                framework.Failf("unable to wait for nodes to have positive allocatable epc resource: %v", err)
            }
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/enclave", 30*time.Second); err != nil {
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/enclave", 30*time.Second, utils.WaitForPositiveResource); err != nil {
                framework.Failf("unable to wait for nodes to have positive allocatable enclave resource: %v", err)
            }
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/provision", 30*time.Second); err != nil {
            if err := utils.WaitForNodesWithResource(ctx, f.ClientSet, "sgx.intel.com/provision", 30*time.Second, utils.WaitForPositiveResource); err != nil {
                framework.Failf("unable to wait for nodes to have positive allocatable provision resource: %v", err)
            }
        })
@@ -52,9 +52,20 @@ func GetPodLogs(ctx context.Context, f *framework.Framework, podName, containerN
    return fmt.Sprintf("log output of the container %s in the pod %s:%s", containerName, podName, log)
}

// WaitForNodesWithResource waits for nodes to have positive allocatable resource.
func WaitForNodesWithResource(ctx context.Context, c clientset.Interface, res v1.ResourceName, timeout time.Duration) error {
    framework.Logf("Waiting up to %s for any positive allocatable resource %q", timeout, res)
type WaitForResourceFunc func(resourceCount int) bool

func WaitForPositiveResource(resourceCount int) bool {
    return resourceCount > 0
}

func WaitForZeroResource(resourceCount int) bool {
    return resourceCount == 0
}

// WaitForNodesWithResource waits for node's resources to change.
// Depending on the waitOperation, function waits for positive resource count or a zero resource count.
func WaitForNodesWithResource(ctx context.Context, c clientset.Interface, res v1.ResourceName, timeout time.Duration, waitForResourceFunc WaitForResourceFunc) error {
    framework.Logf("Waiting up to %s for allocatable resource %q", timeout, res)

    start := time.Now()
@@ -73,7 +84,8 @@ func WaitForNodesWithResource(ctx context.Context, c clientset.Interface, res v1
            }
        }
        framework.Logf("Found %d of %q. Elapsed: %s", resNum, res, time.Since(start))
        if resNum > 0 {

        if waitForResourceFunc(resNum) {
            return true, nil
        }
    }
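Taken together, the two utils hunks replace the hard-coded resNum > 0 check with whatever predicate the caller passes, which is what lets the new GPU AfterEach wait for the resource to drain to zero. Below is a minimal, self-contained sketch of the resulting polling pattern, assuming a toy counter in place of the node walk done by the real helper (the predicate names match the diff; everything else is illustrative):

package main

import (
    "errors"
    "fmt"
    "sync/atomic"
    "time"
)

// WaitForResourceFunc and the two predicates below mirror the utils hunks above.
type WaitForResourceFunc func(resourceCount int) bool

func WaitForPositiveResource(resourceCount int) bool { return resourceCount > 0 }
func WaitForZeroResource(resourceCount int) bool     { return resourceCount == 0 }

// waitForResource is an illustrative stand-in for WaitForNodesWithResource: it polls a
// count until the caller's predicate is satisfied or the timeout expires. The real
// helper sums the allocatable resource across nodes via a Kubernetes clientset.
func waitForResource(get func() int, timeout time.Duration, done WaitForResourceFunc) error {
    deadline := time.Now().Add(timeout)
    for {
        if done(get()) {
            return nil
        }
        if time.Now().After(deadline) {
            return errors.New("timed out waiting for resource condition")
        }
        time.Sleep(50 * time.Millisecond)
    }
}

func main() {
    var count atomic.Int64

    // Simulate a device plugin advertising a resource, then being removed.
    go func() {
        time.Sleep(200 * time.Millisecond)
        count.Store(1) // plugin deployed: resource becomes allocatable
        time.Sleep(200 * time.Millisecond)
        count.Store(0) // plugin deleted: resource drains to zero
    }()

    get := func() int { return int(count.Load()) }

    if err := waitForResource(get, 2*time.Second, WaitForPositiveResource); err != nil {
        fmt.Println("deploy wait failed:", err)
        return
    }
    fmt.Println("resource became allocatable")

    if err := waitForResource(get, 2*time.Second, WaitForZeroResource); err != nil {
        fmt.Println("cleanup wait failed:", err)
        return
    }
    fmt.Println("resource drained to zero")
}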