From ba9705befa7f57fcb9f013117408982c06941bb3 Mon Sep 17 00:00:00 2001 From: Sabuj Maity Date: Fri, 24 Apr 2026 15:28:53 +0530 Subject: [PATCH] OCPNODE-4108: add E2E tests for upstream dra-example-driver Add hardware-independent DRA regression tests using the upstream dra-example-driver (kubernetes-sigs/dra-example-driver). Tests cover single and multi-device allocation, pod lifecycle cleanup, claim sharing, and ResourceClaimTemplate flows. OpenShift adaptations: - SCC grant for privileged kubelet plugin DaemonSet - PSA labels on driver namespace - SNO tolerations for control-plane scheduling - image.ShellImage() for proper release payload mirroring --- test/extended/include.go | 1 + test/extended/node/dra/example/OWNERS | 17 + test/extended/node/dra/example/README.md | 138 +++++ .../node/dra/example/device_validator.go | 128 +++++ test/extended/node/dra/example/example_dra.go | 510 ++++++++++++++++++ .../dra/example/prerequisites_installer.go | 374 +++++++++++++ .../node/dra/example/resource_builder.go | 172 ++++++ 7 files changed, 1340 insertions(+) create mode 100644 test/extended/node/dra/example/OWNERS create mode 100644 test/extended/node/dra/example/README.md create mode 100644 test/extended/node/dra/example/device_validator.go create mode 100644 test/extended/node/dra/example/example_dra.go create mode 100644 test/extended/node/dra/example/prerequisites_installer.go create mode 100644 test/extended/node/dra/example/resource_builder.go diff --git a/test/extended/include.go b/test/extended/include.go index 12b374dec145..6e7ffafa7f11 100644 --- a/test/extended/include.go +++ b/test/extended/include.go @@ -40,6 +40,7 @@ import ( _ "github.com/openshift/origin/test/extended/machines" _ "github.com/openshift/origin/test/extended/networking" _ "github.com/openshift/origin/test/extended/node" + _ "github.com/openshift/origin/test/extended/node/dra/example" _ "github.com/openshift/origin/test/extended/node/dra/nvidia" _ "github.com/openshift/origin/test/extended/node/node_e2e" _ "github.com/openshift/origin/test/extended/node_tuning" diff --git a/test/extended/node/dra/example/OWNERS b/test/extended/node/dra/example/OWNERS new file mode 100644 index 000000000000..07217dd1abb3 --- /dev/null +++ b/test/extended/node/dra/example/OWNERS @@ -0,0 +1,17 @@ +approvers: + - sairameshv + - harche + - haircommander + - rphillips + - mrunalp + +reviewers: + - sairameshv + - harche + - haircommander + - rphillips + - mrunalp + +labels: + - sig/scheduling + - area/dra diff --git a/test/extended/node/dra/example/README.md b/test/extended/node/dra/example/README.md new file mode 100644 index 000000000000..01bed5d5bb80 --- /dev/null +++ b/test/extended/node/dra/example/README.md @@ -0,0 +1,138 @@ +# DRA Example Driver Extended Tests for OpenShift + +This directory contains extended tests for the upstream [dra-example-driver](https://github.com/kubernetes-sigs/dra-example-driver) on OpenShift clusters. These tests provide **hardware-independent** DRA regression coverage — no GPU or special hardware is required. + +## Overview + +These tests validate: +- DRA example driver installation and lifecycle +- Single device allocation via ResourceClaims +- Multi-device allocation +- Pod lifecycle and resource cleanup +- Claim sharing behavior +- ResourceClaimTemplate-based claim creation and cleanup + +## Prerequisites + +1. **OpenShift 4.21+** cluster (DRA API enabled by default) +2. **Helm 3** installed and available in PATH +3. **git** installed and available in PATH +4. 
**Cluster-admin** access + +The test framework automatically: +- Clones the upstream `dra-example-driver` repository +- Installs the driver via Helm with OpenShift SCC permissions +- Waits for driver components to be ready + +## Quick Start + +```bash +# 1. Build test binary +make WHAT=cmd/openshift-tests + +# 2. Set kubeconfig +export KUBECONFIG=/path/to/kubeconfig + +# 3. Run all DRA example driver tests (local binary) +OPENSHIFT_SKIP_EXTERNAL_TESTS=1 \ + ./openshift-tests run --dry-run all 2>&1 | \ + grep "\[Feature:DRA-Example\]" | \ + OPENSHIFT_SKIP_EXTERNAL_TESTS=1 ./openshift-tests run -f - + +# OR run a specific test +OPENSHIFT_SKIP_EXTERNAL_TESTS=1 ./openshift-tests run-test \ + '[sig-scheduling][Feature:DRA-Example][Suite:openshift/dra-example][Serial] Basic Device Allocation should allocate single device to pod via DRA' + +# OR list all available tests +OPENSHIFT_SKIP_EXTERNAL_TESTS=1 \ + ./openshift-tests run --dry-run all 2>&1 | grep "\[Feature:DRA-Example\]" +``` + +> **Note**: `OPENSHIFT_SKIP_EXTERNAL_TESTS=1` is required when running a locally +> built binary. Without it, the `run` command attempts to extract test binaries +> from the cluster's release payload, which does not contain your local changes. +> This variable is NOT needed in CI where the binary is part of the payload. + +## Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `DRA_EXAMPLE_DRIVER_REF` | `main` | Git ref (branch/tag) of the upstream dra-example-driver to install | + +## Test Scenarios + +### 1. Single Device Allocation +- Creates DeviceClass with CEL selector for `gpu.example.com` driver +- Creates ResourceClaim requesting 1 device +- Schedules pod with ResourceClaim +- Validates device allocation in ResourceClaim status + +### 2. Resource Cleanup +- Creates pod with device ResourceClaim +- Deletes pod +- Verifies ResourceClaim persists after pod deletion but is unreserved + +### 3. Multi-Device Allocation +- Creates ResourceClaim requesting 2 devices +- Schedules pod requiring multiple devices +- Validates all devices are allocated (driver publishes 9 virtual devices per node) + +### 4. Claim Sharing +- Creates a single ResourceClaim +- Creates two pods referencing the same ResourceClaim +- Verifies behavior: both pods run (sharing supported) or second pod stays Pending + +### 5. ResourceClaimTemplate +- Creates a ResourceClaimTemplate +- Creates pod with ResourceClaimTemplate reference +- Validates that ResourceClaim is auto-created from template +- Validates automatic cleanup of template-generated claim when pod is deleted + +## OpenShift-Specific Adaptations + +The upstream `dra-example-driver` Helm chart requires the following OpenShift adaptations (handled automatically by the test framework): + +1. **SCC Grant**: The kubelet plugin DaemonSet runs with `privileged: true` and mounts hostPath volumes. A ClusterRoleBinding grants the `system:openshift:scc:privileged` ClusterRole to the driver ServiceAccount. + +2. **SNO Tolerations**: Control-plane tolerations are added to allow scheduling on single-node OpenShift clusters. + +## Troubleshooting + +### Helm not found + +**Cause**: Helm 3 not installed. + +**Solution**: Install Helm following [official instructions](https://helm.sh/docs/intro/install/). + +### SCC denied — kubelet plugin pod rejected + +**Cause**: ClusterRoleBinding for privileged SCC not created. + +**Solution**: The test framework creates this automatically. 
For manual debugging: + +```bash +oc adm policy add-scc-to-user privileged \ + -n dra-example-driver \ + -z dra-example-driver-service-account +``` + +### ResourceSlices not appearing + +**Cause**: DRA driver DaemonSet not ready. + +**Solution**: + +```bash +# Check DRA driver pods +oc get pods -n dra-example-driver + +# Check DaemonSet logs +oc logs -n dra-example-driver -l app.kubernetes.io/name=dra-example-driver --all-containers +``` + +## References + +- **Upstream repository**: https://github.com/kubernetes-sigs/dra-example-driver +- **Kubernetes DRA docs**: https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/ +- **OpenShift Extended Tests**: https://github.com/openshift/origin/tree/master/test/extended +- **NVIDIA DRA tests (reference)**: `test/extended/node/dra/nvidia/` diff --git a/test/extended/node/dra/example/device_validator.go b/test/extended/node/dra/example/device_validator.go new file mode 100644 index 000000000000..b5badae2c4ac --- /dev/null +++ b/test/extended/node/dra/example/device_validator.go @@ -0,0 +1,128 @@ +package example + +import ( + "context" + "fmt" + + resourceapi "k8s.io/api/resource/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/kubernetes/test/e2e/framework" +) + +// DeviceValidator validates DRA device allocation and ResourceSlice state for the example driver. +type DeviceValidator struct { + client kubernetes.Interface + framework *framework.Framework +} + +// NewDeviceValidator creates a DeviceValidator using the provided test framework. +func NewDeviceValidator(f *framework.Framework) *DeviceValidator { + return &DeviceValidator{ + client: f.ClientSet, + framework: f, + } +} + +// ValidateDeviceAllocation checks that the given ResourceClaim has exactly expectedCount devices allocated. +func (dv *DeviceValidator) ValidateDeviceAllocation(ctx context.Context, namespace, claimName string, expectedCount int) error { + framework.Logf("Validating ResourceClaim allocation for %s/%s (expected %d device(s))", namespace, claimName, expectedCount) + + claim, err := dv.client.ResourceV1().ResourceClaims(namespace).Get(ctx, claimName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get ResourceClaim %s/%s: %w", namespace, claimName, err) + } + + if claim.Status.Allocation == nil { + return fmt.Errorf("ResourceClaim %s/%s is not allocated", namespace, claimName) + } + + deviceCount := len(claim.Status.Allocation.Devices.Results) + if deviceCount != expectedCount { + return fmt.Errorf("ResourceClaim %s/%s expected %d device(s) but got %d", + namespace, claimName, expectedCount, deviceCount) + } + + framework.Logf("ResourceClaim %s/%s has %d device(s) allocated", namespace, claimName, deviceCount) + + for i, result := range claim.Status.Allocation.Devices.Results { + if result.Driver != exampleDriverName { + return fmt.Errorf("device %d has incorrect driver %q, expected %q", i, result.Driver, exampleDriverName) + } + if result.Pool == "" { + return fmt.Errorf("device %d has empty pool field", i) + } + if result.Device == "" { + return fmt.Errorf("device %d has empty device field", i) + } + if result.Request == "" { + return fmt.Errorf("device %d has empty request field", i) + } + + framework.Logf("Device %d validated: driver=%s, pool=%s, device=%s, request=%s", + i, result.Driver, result.Pool, result.Device, result.Request) + } + + return nil +} + +// ValidateResourceSlice finds and validates the ResourceSlice published by the example driver on the given node. 
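+// It lists every ResourceSlice in the cluster, keeps only the slices whose driver and node name
+// match, sums their device counts, and returns the first matching slice that carries at least one
+// device; an error is returned when no matching slice exposes any devices.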
+func (dv *DeviceValidator) ValidateResourceSlice(ctx context.Context, nodeName string) (*resourceapi.ResourceSlice, error) { + framework.Logf("Validating ResourceSlice for node %s", nodeName) + + sliceList, err := dv.client.ResourceV1().ResourceSlices().List(ctx, metav1.ListOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to list ResourceSlices: %w", err) + } + + var nodeSlice *resourceapi.ResourceSlice + totalDevices := 0 + for i := range sliceList.Items { + slice := &sliceList.Items[i] + if slice.Spec.NodeName != nil && *slice.Spec.NodeName == nodeName && + slice.Spec.Driver == exampleDriverName { + totalDevices += len(slice.Spec.Devices) + if nodeSlice == nil && len(slice.Spec.Devices) > 0 { + nodeSlice = slice + } + } + } + + if nodeSlice == nil { + return nil, fmt.Errorf("no ResourceSlice with devices found for driver %s on node %s", exampleDriverName, nodeName) + } + + framework.Logf("Node %s has %d total device(s) across matching ResourceSlices (returning slice %s)", + nodeName, totalDevices, nodeSlice.Name) + return nodeSlice, nil +} + +// GetTotalDeviceCount returns the total number of devices published by the example driver across all nodes. +func (dv *DeviceValidator) GetTotalDeviceCount(ctx context.Context) (int, error) { + framework.Logf("Counting total devices from %s driver via ResourceSlices", exampleDriverName) + + sliceList, err := dv.client.ResourceV1().ResourceSlices().List(ctx, metav1.ListOptions{}) + if err != nil { + return 0, fmt.Errorf("failed to list ResourceSlices: %w", err) + } + + totalDevices := 0 + for _, slice := range sliceList.Items { + if slice.Spec.Driver == exampleDriverName { + totalDevices += len(slice.Spec.Devices) + } + } + + framework.Logf("Found %d total device(s) from %s driver", totalDevices, exampleDriverName) + return totalDevices, nil +} + +// IsDriverPublishingDevices returns true if the example driver has published at least one device. 
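+// Errors while counting devices are logged and treated as "not publishing" rather than returned.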
+func (dv *DeviceValidator) IsDriverPublishingDevices(ctx context.Context) bool { + count, err := dv.GetTotalDeviceCount(ctx) + if err != nil { + framework.Logf("Failed to check if %s is publishing devices: %v", exampleDriverName, err) + return false + } + return count > 0 +} diff --git a/test/extended/node/dra/example/example_dra.go b/test/extended/node/dra/example/example_dra.go new file mode 100644 index 000000000000..a936543e4f1e --- /dev/null +++ b/test/extended/node/dra/example/example_dra.go @@ -0,0 +1,510 @@ +package example + +import ( + "context" + "fmt" + "strings" + "sync" + "time" + + "k8s.io/apimachinery/pkg/api/errors" + + g "github.com/onsi/ginkgo/v2" + o "github.com/onsi/gomega" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/dynamic" + "k8s.io/kubernetes/test/e2e/framework" + e2epod "k8s.io/kubernetes/test/e2e/framework/pod" + admissionapi "k8s.io/pod-security-admission/api" + "k8s.io/utils/ptr" + + exutil "github.com/openshift/origin/test/extended/util" +) + +var ( + deviceClassGVR = schema.GroupVersionResource{ + Group: "resource.k8s.io", + Version: "v1", + Resource: "deviceclasses", + } + resourceClaimGVR = schema.GroupVersionResource{ + Group: "resource.k8s.io", + Version: "v1", + Resource: "resourceclaims", + } + resourceClaimTemplateGVR = schema.GroupVersionResource{ + Group: "resource.k8s.io", + Version: "v1", + Resource: "resourceclaimtemplates", + } + + prerequisitesOnce sync.Once + prerequisitesInstalled bool + prerequisitesError error +) + +var _ = g.Describe("[sig-scheduling][Feature:DRA-Example][Suite:openshift/dra-example][Serial][Skipped:Disconnected]", func() { + defer g.GinkgoRecover() + + oc := exutil.NewCLIWithPodSecurityLevel("dra-example", admissionapi.LevelPrivileged) + + var ( + prereqInstaller *PrerequisitesInstaller + validator *DeviceValidator + builder *ResourceBuilder + ) + + g.BeforeEach(func(ctx context.Context) { + isMicroShift, err := exutil.IsMicroShiftCluster(oc.AdminKubeClient()) + o.Expect(err).NotTo(o.HaveOccurred()) + if isMicroShift { + g.Skip("Skipping DRA example driver tests on MicroShift cluster") + } + + validator = NewDeviceValidator(oc.KubeFramework()) + builder = NewResourceBuilder(oc.Namespace()) + prereqInstaller = NewPrerequisitesInstaller(oc.KubeFramework()) + + prerequisitesOnce.Do(func() { + framework.Logf("Checking DRA example driver prerequisites") + + if prereqInstaller.IsDriverInstalled(ctx) && validator.IsDriverPublishingDevices(ctx) { + framework.Logf("DRA example driver already installed and publishing devices") + prerequisitesInstalled = true + return + } + + framework.Logf("Installing DRA example driver...") + if err := prereqInstaller.InstallAll(ctx); err != nil { + prerequisitesError = err + framework.Logf("ERROR: Failed to install DRA example driver: %v", err) + return + } + + prerequisitesInstalled = true + framework.Logf("DRA example driver installation completed successfully") + }) + + if prerequisitesError != nil { + g.Fail(fmt.Sprintf("DRA example driver prerequisites failed: %v", prerequisitesError)) + } + if !prerequisitesInstalled { + g.Fail("DRA example driver prerequisites not installed") + } + }) + + g.Context("Basic Device Allocation", func() { + g.It("should allocate single device to pod via DRA", func(ctx context.Context) { + deviceClassName := "test-example-device-" + 
oc.Namespace() + claimName := "test-device-claim" + podName := "test-device-pod" + + g.By("Creating DeviceClass for example driver") + deviceClass := builder.BuildDeviceClass(deviceClassName) + err := createDeviceClass(ctx, oc.KubeFramework().DynamicClient, deviceClass) + framework.ExpectNoError(err, "Failed to create DeviceClass") + defer func() { + if err := deleteDeviceClass(ctx, oc.KubeFramework().DynamicClient, deviceClassName); err != nil { + framework.Logf("Warning: failed to delete DeviceClass %s: %v", deviceClassName, err) + } + }() + + g.By("Creating ResourceClaim requesting 1 device") + claim := builder.BuildResourceClaim(claimName, deviceClassName, 1) + err = createResourceClaim(ctx, oc.KubeFramework().DynamicClient, oc.Namespace(), claim) + framework.ExpectNoError(err, "Failed to create ResourceClaim") + defer func() { + if err := deleteResourceClaim(ctx, oc.KubeFramework().DynamicClient, oc.Namespace(), claimName); err != nil { + framework.Logf("Warning: failed to delete ResourceClaim %s/%s: %v", oc.Namespace(), claimName, err) + } + }() + + g.By("Creating Pod using the ResourceClaim") + pod := builder.BuildPodWithClaim(podName, claimName, "") + pod, err = oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Create(ctx, pod, metav1.CreateOptions{}) + framework.ExpectNoError(err, "Failed to create pod") + defer func() { + if err := oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Delete(ctx, podName, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) { + framework.Logf("Warning: failed to delete pod %s/%s: %v", oc.Namespace(), podName, err) + } + }() + + g.By("Waiting for pod to be running") + err = e2epod.WaitForPodRunningInNamespace(ctx, oc.KubeFramework().ClientSet, pod) + framework.ExpectNoError(err, "Pod failed to start") + + g.By("Validating device allocation in ResourceClaim") + err = validator.ValidateDeviceAllocation(ctx, oc.Namespace(), claimName, 1) + framework.ExpectNoError(err, "Device allocation validation failed") + }) + + g.It("should handle pod deletion and resource cleanup", func(ctx context.Context) { + deviceClassName := "test-example-cleanup-" + oc.Namespace() + claimName := "test-device-claim-cleanup" + podName := "test-device-pod-cleanup" + + g.By("Creating DeviceClass") + deviceClass := builder.BuildDeviceClass(deviceClassName) + err := createDeviceClass(ctx, oc.KubeFramework().DynamicClient, deviceClass) + framework.ExpectNoError(err) + defer func() { + if err := deleteDeviceClass(ctx, oc.KubeFramework().DynamicClient, deviceClassName); err != nil { + framework.Logf("Warning: failed to delete DeviceClass %s: %v", deviceClassName, err) + } + }() + + g.By("Creating ResourceClaim") + claim := builder.BuildResourceClaim(claimName, deviceClassName, 1) + err = createResourceClaim(ctx, oc.KubeFramework().DynamicClient, oc.Namespace(), claim) + framework.ExpectNoError(err) + defer func() { + if err := deleteResourceClaim(ctx, oc.KubeFramework().DynamicClient, oc.Namespace(), claimName); err != nil { + framework.Logf("Warning: failed to delete ResourceClaim %s/%s: %v", oc.Namespace(), claimName, err) + } + }() + + g.By("Creating and verifying pod with device") + pod := builder.BuildLongRunningPodWithClaim(podName, claimName, "") + pod, err = oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Create(ctx, pod, metav1.CreateOptions{}) + framework.ExpectNoError(err) + + err = e2epod.WaitForPodRunningInNamespace(ctx, oc.KubeFramework().ClientSet, pod) + framework.ExpectNoError(err) + + g.By("Validating device allocation before pod 
deletion") + err = validator.ValidateDeviceAllocation(ctx, oc.Namespace(), claimName, 1) + framework.ExpectNoError(err) + + g.By("Deleting pod") + err = oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Delete(ctx, podName, metav1.DeleteOptions{}) + framework.ExpectNoError(err) + + g.By("Waiting for pod to be deleted") + err = e2epod.WaitForPodNotFoundInNamespace(ctx, oc.KubeFramework().ClientSet, podName, oc.Namespace(), 1*time.Minute) + framework.ExpectNoError(err) + + g.By("Verifying ResourceClaim still exists but is not reserved") + err = wait.PollUntilContextTimeout(ctx, 2*time.Second, 1*time.Minute, true, func(ctx context.Context) (bool, error) { + claimObj, getErr := oc.KubeFramework().DynamicClient.Resource(resourceClaimGVR).Namespace(oc.Namespace()).Get(ctx, claimName, metav1.GetOptions{}) + if getErr != nil { + return false, getErr + } + + reservedFor, found, nestErr := unstructured.NestedSlice(claimObj.Object, "status", "reservedFor") + if nestErr != nil { + return false, nestErr + } + if found && len(reservedFor) > 0 { + framework.Logf("ResourceClaim %s still has %d reservation(s), waiting for DRA controller to clear...", claimName, len(reservedFor)) + return false, nil + } + return true, nil + }) + framework.ExpectNoError(err, "ResourceClaim %s reservation was not released within timeout after pod deletion", claimName) + framework.Logf("ResourceClaim %s successfully cleaned up after pod deletion", claimName) + }) + }) + + g.Context("Multi-Device Allocation", func() { + g.It("should allocate multiple devices to single pod", func(ctx context.Context) { + totalDevices, err := validator.GetTotalDeviceCount(ctx) + if err != nil { + g.Fail(fmt.Sprintf("Failed to count total devices: %v", err)) + } + if totalDevices < 2 { + g.Skip(fmt.Sprintf("Multi-device test requires at least 2 devices, but only %d available", totalDevices)) + } + + deviceClassName := "test-example-multi-" + oc.Namespace() + claimName := "test-multi-device-claim" + podName := "test-multi-device-pod" + + g.By("Creating DeviceClass") + deviceClass := builder.BuildDeviceClass(deviceClassName) + err = createDeviceClass(ctx, oc.KubeFramework().DynamicClient, deviceClass) + framework.ExpectNoError(err) + defer func() { + if err := deleteDeviceClass(ctx, oc.KubeFramework().DynamicClient, deviceClassName); err != nil { + framework.Logf("Warning: failed to delete DeviceClass %s: %v", deviceClassName, err) + } + }() + + g.By("Creating ResourceClaim requesting 2 devices") + claim := builder.BuildResourceClaim(claimName, deviceClassName, 2) + err = createResourceClaim(ctx, oc.KubeFramework().DynamicClient, oc.Namespace(), claim) + framework.ExpectNoError(err) + defer func() { + if err := deleteResourceClaim(ctx, oc.KubeFramework().DynamicClient, oc.Namespace(), claimName); err != nil { + framework.Logf("Warning: failed to delete ResourceClaim %s/%s: %v", oc.Namespace(), claimName, err) + } + }() + + g.By("Creating Pod using the multi-device claim") + pod := builder.BuildPodWithClaim(podName, claimName, "") + pod, err = oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Create(ctx, pod, metav1.CreateOptions{}) + framework.ExpectNoError(err) + defer func() { + if err := oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Delete(ctx, podName, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) { + framework.Logf("Warning: failed to delete pod %s/%s: %v", oc.Namespace(), podName, err) + } + }() + + g.By("Waiting for pod to be running") + err = e2epod.WaitForPodRunningInNamespace(ctx, 
oc.KubeFramework().ClientSet, pod) + framework.ExpectNoError(err, "Pod failed to start") + + g.By("Validating 2 devices allocated") + err = validator.ValidateDeviceAllocation(ctx, oc.Namespace(), claimName, 2) + framework.ExpectNoError(err, "Expected 2 devices to be allocated") + }) + }) + + g.Context("Claim Sharing", func() { + g.It("should allow multiple pods to share the same ResourceClaim", func(ctx context.Context) { + deviceClassName := "test-example-shared-" + oc.Namespace() + claimName := "test-shared-claim" + pod1Name := "test-shared-pod-1" + pod2Name := "test-shared-pod-2" + + g.By("Creating DeviceClass") + deviceClass := builder.BuildDeviceClass(deviceClassName) + err := createDeviceClass(ctx, oc.KubeFramework().DynamicClient, deviceClass) + framework.ExpectNoError(err) + defer func() { + if err := deleteDeviceClass(ctx, oc.KubeFramework().DynamicClient, deviceClassName); err != nil { + framework.Logf("Warning: failed to delete DeviceClass %s: %v", deviceClassName, err) + } + }() + + g.By("Creating shared ResourceClaim") + claim := builder.BuildResourceClaim(claimName, deviceClassName, 1) + err = createResourceClaim(ctx, oc.KubeFramework().DynamicClient, oc.Namespace(), claim) + framework.ExpectNoError(err) + defer func() { + if err := deleteResourceClaim(ctx, oc.KubeFramework().DynamicClient, oc.Namespace(), claimName); err != nil { + framework.Logf("Warning: failed to delete ResourceClaim %s/%s: %v", oc.Namespace(), claimName, err) + } + }() + + g.By("Creating first pod using the shared claim") + pod1 := builder.BuildLongRunningPodWithClaim(pod1Name, claimName, "") + pod1, err = oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Create(ctx, pod1, metav1.CreateOptions{}) + framework.ExpectNoError(err, "Failed to create first pod") + defer func() { + if err := oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Delete(ctx, pod1Name, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) { + framework.Logf("Warning: failed to delete pod %s/%s: %v", oc.Namespace(), pod1Name, err) + } + }() + + g.By("Waiting for first pod to be running") + err = e2epod.WaitForPodRunningInNamespace(ctx, oc.KubeFramework().ClientSet, pod1) + framework.ExpectNoError(err, "First pod failed to start") + + g.By("Creating second pod using the same claim") + pod2 := builder.BuildLongRunningPodWithClaim(pod2Name, claimName, "") + pod2, err = oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Create(ctx, pod2, metav1.CreateOptions{}) + framework.ExpectNoError(err, "Failed to create second pod") + defer func() { + if err := oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Delete(ctx, pod2Name, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) { + framework.Logf("Warning: failed to delete pod %s/%s: %v", oc.Namespace(), pod2Name, err) + } + }() + + g.By("Checking if second pod can share the claim") + const pollInterval = 2 * time.Second + const pollTimeout = 60 * time.Second + var finalPhase corev1.PodPhase + var schedulingFailed bool + + err = wait.PollUntilContextTimeout(ctx, pollInterval, pollTimeout, true, func(ctx context.Context) (bool, error) { + pod2, err = oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Get(ctx, pod2Name, metav1.GetOptions{}) + if err != nil { + return false, err + } + + finalPhase = pod2.Status.Phase + + if pod2.Status.Phase == corev1.PodRunning { + framework.Logf("Second pod is Running — claim sharing supported") + return true, nil + } + + if pod2.Status.Phase == corev1.PodPending { + for _, cond := range 
pod2.Status.Conditions { + if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse && cond.Reason == "Unschedulable" { + msg := strings.ToLower(cond.Message) + if strings.Contains(msg, "claim") || strings.Contains(msg, "allocat") { + framework.Logf("Second pod is Pending due to DRA claim conflict: %s", cond.Message) + schedulingFailed = true + return true, nil + } + framework.Logf("Second pod is Pending with non-DRA Unschedulable reason (likely taints or resources): %s", cond.Message) + } + } + framework.Logf("Second pod is Pending, continuing to poll...") + return false, nil + } + + return false, fmt.Errorf("second pod in unexpected phase: %s", pod2.Status.Phase) + }) + framework.ExpectNoError(err, "Failed to determine second pod state") + + if schedulingFailed { + framework.Logf("Second pod unschedulable — claim sharing not supported by example driver") + g.By("Verifying first pod still has device access") + err = validator.ValidateDeviceAllocation(ctx, oc.Namespace(), claimName, 1) + framework.ExpectNoError(err) + } else if finalPhase == corev1.PodRunning { + framework.Logf("Both pods running — claim sharing is supported") + g.By("Verifying claim is still allocated") + err = validator.ValidateDeviceAllocation(ctx, oc.Namespace(), claimName, 1) + framework.ExpectNoError(err) + } + }) + }) + + g.Context("ResourceClaimTemplate", func() { + g.It("should create claim from template for pod", func(ctx context.Context) { + deviceClassName := "test-example-template-" + oc.Namespace() + templateName := "test-device-template" + podName := "test-template-pod" + + g.By("Creating DeviceClass") + deviceClass := builder.BuildDeviceClass(deviceClassName) + err := createDeviceClass(ctx, oc.KubeFramework().DynamicClient, deviceClass) + framework.ExpectNoError(err) + defer func() { + if err := deleteDeviceClass(ctx, oc.KubeFramework().DynamicClient, deviceClassName); err != nil { + framework.Logf("Warning: failed to delete DeviceClass %s: %v", deviceClassName, err) + } + }() + + g.By("Creating ResourceClaimTemplate") + template := builder.BuildResourceClaimTemplate(templateName, deviceClassName, 1) + err = createResourceClaimTemplate(ctx, oc.KubeFramework().DynamicClient, oc.Namespace(), template) + framework.ExpectNoError(err) + defer func() { + if err := deleteResourceClaimTemplate(ctx, oc.KubeFramework().DynamicClient, oc.Namespace(), templateName); err != nil { + framework.Logf("Warning: failed to delete ResourceClaimTemplate %s/%s: %v", oc.Namespace(), templateName, err) + } + }() + + g.By("Creating Pod with ResourceClaimTemplate reference") + pod := builder.BuildPodWithInlineClaim(podName) + *pod.Spec.ResourceClaims[0].ResourceClaimTemplateName = templateName + pod, err = oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Create(ctx, pod, metav1.CreateOptions{}) + framework.ExpectNoError(err, "Failed to create pod") + + g.By("Waiting for pod to be running") + err = e2epod.WaitForPodRunningInNamespace(ctx, oc.KubeFramework().ClientSet, pod) + framework.ExpectNoError(err, "Pod failed to start") + + g.By("Verifying ResourceClaim was created from template") + claimPrefix := podName + "-device" + + claimList, err := oc.KubeFramework().DynamicClient.Resource(resourceClaimGVR).Namespace(oc.Namespace()).List(ctx, metav1.ListOptions{}) + framework.ExpectNoError(err, "Failed to list ResourceClaims") + + var generatedClaimName string + var claimObj *unstructured.Unstructured + for _, claim := range claimList.Items { + if strings.HasPrefix(claim.GetName(), claimPrefix) { + 
generatedClaimName = claim.GetName() + claimObj = &claim + framework.Logf("Found template-generated ResourceClaim: %s (matches prefix: %s)", generatedClaimName, claimPrefix) + break + } + } + + o.Expect(generatedClaimName).NotTo(o.BeEmpty(), "ResourceClaim with prefix %s should be auto-created from template", claimPrefix) + o.Expect(claimObj).NotTo(o.BeNil()) + framework.Logf("ResourceClaim %s was successfully created from template", generatedClaimName) + + g.By("Deleting pod and verifying claim cleanup") + err = oc.KubeFramework().ClientSet.CoreV1().Pods(oc.Namespace()).Delete(ctx, podName, metav1.DeleteOptions{}) + framework.ExpectNoError(err) + + err = e2epod.WaitForPodNotFoundInNamespace(ctx, oc.KubeFramework().ClientSet, podName, oc.Namespace(), 1*time.Minute) + framework.ExpectNoError(err) + + g.By("Verifying auto-generated claim is deleted") + err = wait.PollUntilContextTimeout(ctx, 1*time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) { + _, getErr := oc.KubeFramework().DynamicClient.Resource(resourceClaimGVR).Namespace(oc.Namespace()).Get(ctx, generatedClaimName, metav1.GetOptions{}) + if getErr != nil { + if errors.IsNotFound(getErr) { + framework.Logf("ResourceClaim %s was deleted as expected", generatedClaimName) + return true, nil + } + return false, getErr + } + framework.Logf("ResourceClaim %s still exists, waiting for cleanup...", generatedClaimName) + return false, nil + }) + if err != nil { + g.Fail(fmt.Sprintf("ResourceClaim %s not deleted within timeout — expected automatic cleanup: %v", generatedClaimName, err)) + } + framework.Logf("ResourceClaim was cleaned up with pod deletion as expected") + }) + }) +}) + +func convertToUnstructured(obj interface{}) (*unstructured.Unstructured, error) { + unstructuredObj := &unstructured.Unstructured{} + content, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj) + if err != nil { + return nil, err + } + unstructuredObj.Object = content + return unstructuredObj, nil +} + +func createDeviceClass(ctx context.Context, client dynamic.Interface, deviceClass interface{}) error { + unstructuredObj, err := convertToUnstructured(deviceClass) + if err != nil { + return err + } + _, err = client.Resource(deviceClassGVR).Create(ctx, unstructuredObj, metav1.CreateOptions{}) + return err +} + +func deleteDeviceClass(ctx context.Context, client dynamic.Interface, name string) error { + return client.Resource(deviceClassGVR).Delete(ctx, name, metav1.DeleteOptions{ + GracePeriodSeconds: ptr.To[int64](0), + }) +} + +func createResourceClaim(ctx context.Context, client dynamic.Interface, namespace string, claim interface{}) error { + unstructuredObj, err := convertToUnstructured(claim) + if err != nil { + return err + } + _, err = client.Resource(resourceClaimGVR).Namespace(namespace).Create(ctx, unstructuredObj, metav1.CreateOptions{}) + return err +} + +func deleteResourceClaim(ctx context.Context, client dynamic.Interface, namespace, name string) error { + return client.Resource(resourceClaimGVR).Namespace(namespace).Delete(ctx, name, metav1.DeleteOptions{ + GracePeriodSeconds: ptr.To[int64](0), + }) +} + +func createResourceClaimTemplate(ctx context.Context, client dynamic.Interface, namespace string, template interface{}) error { + unstructuredObj, err := convertToUnstructured(template) + if err != nil { + return err + } + _, err = client.Resource(resourceClaimTemplateGVR).Namespace(namespace).Create(ctx, unstructuredObj, metav1.CreateOptions{}) + return err +} + +func deleteResourceClaimTemplate(ctx 
context.Context, client dynamic.Interface, namespace, name string) error { + return client.Resource(resourceClaimTemplateGVR).Namespace(namespace).Delete(ctx, name, metav1.DeleteOptions{ + GracePeriodSeconds: ptr.To[int64](0), + }) +} diff --git a/test/extended/node/dra/example/prerequisites_installer.go b/test/extended/node/dra/example/prerequisites_installer.go new file mode 100644 index 000000000000..76068b5ab9c8 --- /dev/null +++ b/test/extended/node/dra/example/prerequisites_installer.go @@ -0,0 +1,374 @@ +package example + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/kubernetes/test/e2e/framework" +) + +const ( + driverNamespace = "dra-example-driver" + driverRelease = "dra-example-driver" + driverServiceAccount = "dra-example-driver-service-account" + driverCRBName = "dra-example-driver-privileged-scc" + + upstreamRepoURL = "https://github.com/kubernetes-sigs/dra-example-driver.git" + helmChartRelPath = "deployments/helm/dra-example-driver" +) + +// PrerequisitesInstaller handles installation and cleanup of the upstream dra-example-driver on OpenShift. +type PrerequisitesInstaller struct { + client kubernetes.Interface + framework *framework.Framework + cloneDir string +} + +// NewPrerequisitesInstaller creates a PrerequisitesInstaller using the provided test framework. +func NewPrerequisitesInstaller(f *framework.Framework) *PrerequisitesInstaller { + return &PrerequisitesInstaller{ + client: f.ClientSet, + framework: f, + } +} + +// InstallAll clones the upstream repo, creates the namespace with PSA labels, grants SCC, and installs via Helm. 
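+// If the driver is already installed, the install steps are skipped and the call only waits for the
+// driver to become ready. Cluster mutations made after the namespace exists are rolled back on a
+// best-effort basis when a later step fails.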
+func (pi *PrerequisitesInstaller) InstallAll(ctx context.Context) error { + framework.Logf("=== Installing DRA Example Driver Prerequisites ===") + + if err := pi.ensureHelm(ctx); err != nil { + return fmt.Errorf("helm not available: %w", err) + } + + if pi.IsDriverInstalled(ctx) { + framework.Logf("DRA example driver already installed, waiting for device publication...") + return pi.WaitForDriver(ctx, 5*time.Minute) + } + + if err := pi.cloneUpstreamRepo(ctx); err != nil { + return fmt.Errorf("failed to clone upstream repo: %w", err) + } + + if err := pi.createNamespace(ctx); err != nil { + return fmt.Errorf("failed to create namespace: %w", err) + } + + if err := pi.grantSCCPermissions(ctx); err != nil { + pi.RollbackMutations(ctx) + return fmt.Errorf("failed to grant SCC permissions: %w", err) + } + + if err := pi.helmInstall(ctx); err != nil { + pi.RollbackMutations(ctx) + return fmt.Errorf("failed to install via Helm: %w", err) + } + + framework.Logf("Waiting for DRA example driver to be ready...") + if err := pi.WaitForDriver(ctx, 5*time.Minute); err != nil { + pi.RollbackMutations(ctx) + return fmt.Errorf("driver failed to become ready: %w", err) + } + + framework.Logf("=== DRA Example Driver installation complete ===") + return nil +} + +func (pi *PrerequisitesInstaller) ensureHelm(ctx context.Context) error { + cmd := exec.CommandContext(ctx, "helm", "version", "--short") + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("helm command not found or failed: %w\nOutput: %s", err, string(output)) + } + framework.Logf("Helm version: %s", strings.TrimSpace(string(output))) + return nil +} + +func (pi *PrerequisitesInstaller) cloneUpstreamRepo(ctx context.Context) error { + tmpDir, err := os.MkdirTemp("", "dra-example-driver-*") + if err != nil { + return fmt.Errorf("failed to create temp dir: %w", err) + } + pi.cloneDir = tmpDir + + framework.Logf("Cloning upstream dra-example-driver to %s", tmpDir) + + ref := os.Getenv("DRA_EXAMPLE_DRIVER_REF") + if ref == "" { + ref = "main" + } + + cmd := exec.CommandContext(ctx, "git", "clone", "--depth=1", "--branch", ref, upstreamRepoURL, tmpDir) + output, err := cmd.CombinedOutput() + if err != nil { + os.RemoveAll(tmpDir) + pi.cloneDir = "" + return fmt.Errorf("failed to clone repo: %w\nOutput: %s", err, string(output)) + } + + framework.Logf("Cloned dra-example-driver (ref: %s)", ref) + return nil +} + +func (pi *PrerequisitesInstaller) createNamespace(ctx context.Context) error { + requiredLabels := map[string]string{ + "pod-security.kubernetes.io/enforce": "privileged", + "pod-security.kubernetes.io/warn": "privileged", + "pod-security.kubernetes.io/audit": "privileged", + } + + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: driverNamespace, + Labels: requiredLabels, + }, + } + _, err := pi.client.CoreV1().Namespaces().Create(ctx, ns, metav1.CreateOptions{}) + if err == nil { + framework.Logf("Created namespace %s with privileged PSA labels", driverNamespace) + return nil + } + if !errors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create namespace %s: %w", driverNamespace, err) + } + + existing, err := pi.client.CoreV1().Namespaces().Get(ctx, driverNamespace, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get existing namespace %s: %w", driverNamespace, err) + } + needsUpdate := false + if existing.Labels == nil { + existing.Labels = make(map[string]string) + } + for k, v := range requiredLabels { + if existing.Labels[k] != v { + existing.Labels[k] = v + 
needsUpdate = true + } + } + if needsUpdate { + _, err = pi.client.CoreV1().Namespaces().Update(ctx, existing, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to update PSA labels on namespace %s: %w", driverNamespace, err) + } + framework.Logf("Updated namespace %s with required PSA labels", driverNamespace) + } else { + framework.Logf("Namespace %s already exists with correct PSA labels", driverNamespace) + } + return nil +} + +func (pi *PrerequisitesInstaller) grantSCCPermissions(ctx context.Context) error { + framework.Logf("Granting privileged SCC to DRA example driver ServiceAccount") + + crb := &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: driverCRBName, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: "rbac.authorization.k8s.io", + Kind: "ClusterRole", + Name: "system:openshift:scc:privileged", + }, + Subjects: []rbacv1.Subject{ + { + Kind: "ServiceAccount", + Name: driverServiceAccount, + Namespace: driverNamespace, + }, + }, + } + + _, err := pi.client.RbacV1().ClusterRoleBindings().Create(ctx, crb, metav1.CreateOptions{}) + if err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create ClusterRoleBinding %s: %w", driverCRBName, err) + } + framework.Logf("SCC permissions granted to %s/%s", driverNamespace, driverServiceAccount) + return nil +} + +func (pi *PrerequisitesInstaller) helmInstall(ctx context.Context) error { + chartPath := filepath.Join(pi.cloneDir, helmChartRelPath) + + framework.Logf("Installing DRA example driver via Helm from %s", chartPath) + + args := []string{ + "install", driverRelease, chartPath, + "--namespace", driverNamespace, + "--set", "kubeletPlugin.tolerations[0].key=node-role.kubernetes.io/master", + "--set", "kubeletPlugin.tolerations[0].operator=Exists", + "--set", "kubeletPlugin.tolerations[0].effect=NoSchedule", + "--set", "kubeletPlugin.tolerations[1].key=node-role.kubernetes.io/control-plane", + "--set", "kubeletPlugin.tolerations[1].operator=Exists", + "--set", "kubeletPlugin.tolerations[1].effect=NoSchedule", + "--wait", + "--timeout", "5m", + } + + cmd := exec.CommandContext(ctx, "helm", args...) + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("helm install failed: %w\nOutput: %s", err, string(output)) + } + + framework.Logf("DRA example driver Helm install succeeded") + return nil +} + +// WaitForDriver blocks until the DaemonSet is ready and ResourceSlices are published. 
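+// The same timeout is applied to each of the two phases: the kubelet plugin DaemonSet must first
+// report all pods ready, then at least one device must appear in the driver's ResourceSlices.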
+func (pi *PrerequisitesInstaller) WaitForDriver(ctx context.Context, timeout time.Duration) error { + framework.Logf("Waiting for DRA example driver DaemonSet to be ready (timeout: %v)", timeout) + + if err := pi.waitForDaemonSet(ctx, timeout); err != nil { + return fmt.Errorf("kubelet plugin DaemonSet not ready: %w", err) + } + + framework.Logf("Waiting for ResourceSlices to be published...") + validator := NewDeviceValidator(pi.framework) + err := wait.PollUntilContextTimeout(ctx, 5*time.Second, timeout, true, func(ctx context.Context) (bool, error) { + count, err := validator.GetTotalDeviceCount(ctx) + if err != nil { + framework.Logf("Error checking device count: %v", err) + return false, nil + } + if count > 0 { + framework.Logf("Found %d published device(s) in ResourceSlices", count) + return true, nil + } + framework.Logf("No devices published yet, waiting...") + return false, nil + }) + if err != nil { + return fmt.Errorf("no published device slices within timeout: %w", err) + } + + framework.Logf("DRA example driver is ready") + return nil +} + +func (pi *PrerequisitesInstaller) waitForDaemonSet(ctx context.Context, timeout time.Duration) error { + return wait.PollUntilContextTimeout(ctx, 5*time.Second, timeout, true, func(ctx context.Context) (bool, error) { + dsList, err := pi.client.AppsV1().DaemonSets(driverNamespace).List(ctx, metav1.ListOptions{}) + if err != nil { + return false, err + } + + for _, ds := range dsList.Items { + if strings.Contains(ds.Name, driverRelease) { + ready := ds.Status.DesiredNumberScheduled > 0 && + ds.Status.NumberReady == ds.Status.DesiredNumberScheduled && + ds.Status.NumberUnavailable == 0 + + if !ready { + framework.Logf("DaemonSet %s/%s not ready: desired=%d, ready=%d, unavailable=%d", + driverNamespace, ds.Name, ds.Status.DesiredNumberScheduled, ds.Status.NumberReady, ds.Status.NumberUnavailable) + return false, nil + } + + framework.Logf("DaemonSet %s/%s is ready", driverNamespace, ds.Name) + return true, nil + } + } + + framework.Logf("DaemonSet for %s not found yet in %s", driverRelease, driverNamespace) + return false, nil + }) +} + +// IsDriverInstalled returns true if the driver namespace exists and has at least one fully ready pod. +func (pi *PrerequisitesInstaller) IsDriverInstalled(ctx context.Context) bool { + _, err := pi.client.CoreV1().Namespaces().Get(ctx, driverNamespace, metav1.GetOptions{}) + if err != nil { + return false + } + + pods, err := pi.client.CoreV1().Pods(driverNamespace).List(ctx, metav1.ListOptions{}) + if err != nil || len(pods.Items) == 0 { + return false + } + + for _, pod := range pods.Items { + if pod.Status.Phase != corev1.PodRunning { + continue + } + allReady := len(pod.Status.ContainerStatuses) > 0 + for _, cs := range pod.Status.ContainerStatuses { + if !cs.Ready { + allReady = false + break + } + } + if allReady { + framework.Logf("Found fully ready DRA example driver pod: %s", pod.Name) + return true + } + } + + return false +} + +// UninstallAll removes the Helm release, cluster-scoped resources, namespace, and cloned repo. 
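+// Cleanup is best-effort: individual failures are logged as warnings and the remaining steps still run.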
+func (pi *PrerequisitesInstaller) UninstallAll(ctx context.Context) error { + framework.Logf("=== Cleaning up DRA Example Driver ===") + + cmd := exec.CommandContext(ctx, "helm", "uninstall", driverRelease, + "--namespace", driverNamespace, + "--wait", + "--timeout", "5m") + output, err := cmd.CombinedOutput() + if err != nil && !strings.Contains(string(output), "not found") { + framework.Logf("Warning: helm uninstall failed: %v\nOutput: %s", err, string(output)) + } + + pi.cleanupClusterResources(ctx) + + if err := pi.client.CoreV1().Namespaces().Delete(ctx, driverNamespace, metav1.DeleteOptions{}); err != nil { + if !errors.IsNotFound(err) { + framework.Logf("Warning: failed to delete namespace %s: %v", driverNamespace, err) + } + } + + if pi.cloneDir != "" { + os.RemoveAll(pi.cloneDir) + } + + framework.Logf("=== Cleanup complete ===") + return nil +} + +// RollbackMutations performs best-effort cleanup of cluster-scoped resources after a partial install failure. +func (pi *PrerequisitesInstaller) RollbackMutations(ctx context.Context) { + framework.Logf("Rolling back DRA example driver cluster mutations (best-effort)...") + + pi.cleanupClusterResources(ctx) + + err := pi.client.CoreV1().Namespaces().Delete(ctx, driverNamespace, metav1.DeleteOptions{}) + if err != nil && !errors.IsNotFound(err) { + framework.Logf("Warning: failed to delete namespace %s during rollback: %v", driverNamespace, err) + } + + if pi.cloneDir != "" { + os.RemoveAll(pi.cloneDir) + } + + framework.Logf("Rollback complete") +} + +func (pi *PrerequisitesInstaller) cleanupClusterResources(ctx context.Context) { + err := pi.client.RbacV1().ClusterRoleBindings().Delete(ctx, driverCRBName, metav1.DeleteOptions{}) + if err != nil && !errors.IsNotFound(err) { + framework.Logf("Warning: failed to delete ClusterRoleBinding %s: %v", driverCRBName, err) + } else if err == nil { + framework.Logf("Deleted ClusterRoleBinding %s", driverCRBName) + } +} diff --git a/test/extended/node/dra/example/resource_builder.go b/test/extended/node/dra/example/resource_builder.go new file mode 100644 index 000000000000..bcf8e1216379 --- /dev/null +++ b/test/extended/node/dra/example/resource_builder.go @@ -0,0 +1,172 @@ +package example + +import ( + "fmt" + + corev1 "k8s.io/api/core/v1" + resourceapi "k8s.io/api/resource/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" + + "github.com/openshift/origin/test/extended/util/image" +) + +const ( + exampleDriverName = "gpu.example.com" + defaultDeviceClass = "gpu.example.com" + deviceRequestName = "device" +) + +// ResourceBuilder constructs Kubernetes DRA objects for the dra-example-driver. +type ResourceBuilder struct { + namespace string +} + +// NewResourceBuilder creates a ResourceBuilder scoped to the given namespace. +func NewResourceBuilder(namespace string) *ResourceBuilder { + return &ResourceBuilder{namespace: namespace} +} + +// BuildDeviceClass creates a DeviceClass with a CEL selector for the example driver. +func (rb *ResourceBuilder) BuildDeviceClass(name string) *resourceapi.DeviceClass { + if name == "" { + name = defaultDeviceClass + } + + return &resourceapi.DeviceClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Spec: resourceapi.DeviceClassSpec{ + Selectors: []resourceapi.DeviceSelector{ + { + CEL: &resourceapi.CELDeviceSelector{ + Expression: fmt.Sprintf("device.driver == %q", exampleDriverName), + }, + }, + }, + }, + } +} + +// BuildResourceClaim creates a ResourceClaim requesting the specified number of devices. 
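+// The claim carries a single request named "device"; count must be positive (the builder panics
+// otherwise) and an empty deviceClassName falls back to the default gpu.example.com class.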
+func (rb *ResourceBuilder) BuildResourceClaim(name, deviceClassName string, count int) *resourceapi.ResourceClaim { + if count <= 0 { + panic(fmt.Sprintf("BuildResourceClaim: count must be > 0, got %d", count)) + } + if deviceClassName == "" { + deviceClassName = defaultDeviceClass + } + + return &resourceapi.ResourceClaim{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: rb.namespace, + }, + Spec: resourceapi.ResourceClaimSpec{ + Devices: resourceapi.DeviceClaim{ + Requests: []resourceapi.DeviceRequest{ + { + Name: deviceRequestName, + Exactly: &resourceapi.ExactDeviceRequest{ + DeviceClassName: deviceClassName, + Count: int64(count), + }, + }, + }, + }, + }, + } +} + +// BuildPodWithClaim creates a Pod that references an existing ResourceClaim. +// If img is empty, the OpenShift release payload tools image is used. +func (rb *ResourceBuilder) BuildPodWithClaim(name, claimName, img string) *corev1.Pod { + if img == "" { + img = image.ShellImage() + } + + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: rb.namespace, + }, + Spec: corev1.PodSpec{ + RestartPolicy: corev1.RestartPolicyNever, + Containers: []corev1.Container{ + { + Name: "test-container", + Image: img, + Command: []string{"sh", "-c", "echo DRA device allocated && sleep infinity"}, + Resources: corev1.ResourceRequirements{ + Claims: []corev1.ResourceClaim{ + { + Name: deviceRequestName, + }, + }, + }, + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: ptr.To(false), + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{"ALL"}, + }, + }, + }, + }, + ResourceClaims: []corev1.PodResourceClaim{ + { + Name: deviceRequestName, + ResourceClaimName: &claimName, + }, + }, + }, + } +} + +// BuildLongRunningPodWithClaim creates a Pod with a periodic heartbeat loop instead of sleep infinity. +func (rb *ResourceBuilder) BuildLongRunningPodWithClaim(name, claimName, img string) *corev1.Pod { + pod := rb.BuildPodWithClaim(name, claimName, img) + pod.Spec.Containers[0].Command = []string{"sh", "-c", "while true; do echo DRA device active; sleep 60; done"} + return pod +} + +// BuildPodWithInlineClaim creates a Pod that references a ResourceClaimTemplate for inline allocation. +func (rb *ResourceBuilder) BuildPodWithInlineClaim(name string) *corev1.Pod { + templateName := name + "-template" + pod := rb.BuildPodWithClaim(name, "", "") + pod.Spec.ResourceClaims[0].ResourceClaimName = nil + pod.Spec.ResourceClaims[0].ResourceClaimTemplateName = &templateName + return pod +} + +// BuildResourceClaimTemplate creates a ResourceClaimTemplate for inline claim generation. +func (rb *ResourceBuilder) BuildResourceClaimTemplate(name, deviceClassName string, count int) *resourceapi.ResourceClaimTemplate { + if count <= 0 { + panic(fmt.Sprintf("BuildResourceClaimTemplate: count must be > 0, got %d", count)) + } + if deviceClassName == "" { + deviceClassName = defaultDeviceClass + } + + return &resourceapi.ResourceClaimTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: rb.namespace, + }, + Spec: resourceapi.ResourceClaimTemplateSpec{ + Spec: resourceapi.ResourceClaimSpec{ + Devices: resourceapi.DeviceClaim{ + Requests: []resourceapi.DeviceRequest{ + { + Name: deviceRequestName, + Exactly: &resourceapi.ExactDeviceRequest{ + DeviceClassName: deviceClassName, + Count: int64(count), + }, + }, + }, + }, + }, + }, + } +}