From 4f3f257454ef98d32b82d8e39ba7a1d373060ce1 Mon Sep 17 00:00:00 2001 From: Dominik Pajak Date: Thu, 13 Nov 2025 12:32:08 +0000 Subject: [PATCH 01/10] slice becomes cluster-scoped --- slice/api/v1alpha1/slice_types.go | 1 + .../slice.accelerator.gke.io_slices.yaml | 2 +- slice/internal/controller/indexer.go | 33 +++++++++++++++++-- .../controller/workload_controller.go | 28 +++++++++------- slice/internal/core/constants.go | 5 +++ slice/internal/core/slice.go | 7 ++-- 6 files changed, 59 insertions(+), 17 deletions(-) diff --git a/slice/api/v1alpha1/slice_types.go b/slice/api/v1alpha1/slice_types.go index f014664ca..c201055ec 100644 --- a/slice/api/v1alpha1/slice_types.go +++ b/slice/api/v1alpha1/slice_types.go @@ -53,6 +53,7 @@ type SliceStatus struct { // +kubebuilder:object:root=true // +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Cluster // +kubebuilder:printcolumn:name="Type",type=string,JSONPath=`.spec.type` // +kubebuilder:printcolumn:name="Topology",type=string,JSONPath=`.spec.topology` // +kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.conditions[0].type` diff --git a/slice/config/crd/bases/slice.accelerator.gke.io_slices.yaml b/slice/config/crd/bases/slice.accelerator.gke.io_slices.yaml index ec626d639..77109d26f 100644 --- a/slice/config/crd/bases/slice.accelerator.gke.io_slices.yaml +++ b/slice/config/crd/bases/slice.accelerator.gke.io_slices.yaml @@ -12,7 +12,7 @@ spec: listKind: SliceList plural: slices singular: slice - scope: Namespaced + scope: Cluster versions: - additionalPrinterColumns: - jsonPath: .spec.type diff --git a/slice/internal/controller/indexer.go b/slice/internal/controller/indexer.go index 5f8c58223..879ce6135 100644 --- a/slice/internal/controller/indexer.go +++ b/slice/internal/controller/indexer.go @@ -25,24 +25,53 @@ import ( kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" "tpu-slice-controller/api/v1alpha1" + "tpu-slice-controller/internal/core" "tpu-slice-controller/internal/util/slices" ) const ( + // OwnerReferenceUID is an index key for owner references. OwnerReferenceUID = "metadata.ownerReferences.uid" + // WorkloadNamespaceIndex is an index key for the workload namespace annotation. + WorkloadNamespaceIndex = "workload.namespace" + // WorkloadNameIndex is an index key for the workload name annotation. + WorkloadNameIndex = "workload.name" ) func indexOwnerReferenceUID(obj client.Object) []string { return slices.Map(obj.GetOwnerReferences(), func(o *metav1.OwnerReference) string { return string(o.UID) }) } +func indexSliceByWorkloadNamespace(obj client.Object) []string { + if slice, ok := obj.(*v1alpha1.Slice); ok { + if ns, found := slice.GetAnnotations()[core.OwnerWorkloadNamespaceAnnotation]; found { + return []string{ns} + } + } + return nil +} + +func indexSliceByWorkloadName(obj client.Object) []string { + if slice, ok := obj.(*v1alpha1.Slice); ok { + if name, found := slice.GetAnnotations()[core.OwnerWorkloadNameAnnotation]; found { + return []string{name} + } + } + return nil +} + // SetupIndexer configures the indexer to index specific fields for kueue.Workload and v1alpha1.Slice resources. func SetupIndexer(ctx context.Context, indexer client.FieldIndexer) error { if err := indexer.IndexField(ctx, &kueue.Workload{}, OwnerReferenceUID, indexOwnerReferenceUID); err != nil { return fmt.Errorf("setting index on ownerReferences.uid for Workload: %w", err) } - if err := indexer.IndexField(ctx, &v1alpha1.Slice{}, OwnerReferenceUID, indexOwnerReferenceUID); err != nil { - return fmt.Errorf("setting index on ownerReferences.uid for Slice: %w", err) + // Since Slice is now cluster-scoped, it cannot have a controller owner reference to a namespaced Workload. + // We use annotations for linking Slices to Workloads. + if err := indexer.IndexField(ctx, &v1alpha1.Slice{}, WorkloadNamespaceIndex, indexSliceByWorkloadNamespace); err != nil { + return fmt.Errorf("setting index on workload namespace for Slice: %w", err) + } + if err := indexer.IndexField(ctx, &v1alpha1.Slice{}, WorkloadNameIndex, indexSliceByWorkloadName); err != nil { + return fmt.Errorf("setting index on workload name for Slice: %w", err) } return nil } diff --git a/slice/internal/controller/workload_controller.go b/slice/internal/controller/workload_controller.go index efb902aa0..06c35c416 100644 --- a/slice/internal/controller/workload_controller.go +++ b/slice/internal/controller/workload_controller.go @@ -291,8 +291,10 @@ func (r *WorkloadReconciler) cleanupSlices(ctx context.Context, wl *kueue.Worklo func (r *WorkloadReconciler) findWorkloadSlices(ctx context.Context, wl *kueue.Workload) ([]v1alpha1.Slice, error) { slices := &v1alpha1.SliceList{} opts := []client.ListOption{ - client.InNamespace(wl.Namespace), - client.MatchingFields{OwnerReferenceUID: string(wl.UID)}, + client.MatchingFields{ + WorkloadNamespaceIndex: wl.Namespace, + WorkloadNameIndex: wl.Name, + }, } if err := r.client.List(ctx, slices, opts...); err != nil { return nil, err @@ -525,10 +527,10 @@ func (r *WorkloadReconciler) createSlice(ctx context.Context, wl *kueue.Workload slice := core.SliceWithMetadata(wl, psa.Name) log := ctrl.LoggerFrom(ctx).WithValues("slice", klog.KObj(slice)) log.V(3).Info("Creating Slice") - - if err := controllerutil.SetControllerReference(wl, slice, r.client.Scheme()); err != nil { - return nil, err - } + // Since Slice is a cluster-scoped object and Workload is namespaced, + // we cannot set a controller owner reference. The Workload's namespace and name + // are stored as annotations on the Slice for lookup. + // The garbage collection of Slices will be handled manually during Workload finalization. parseTopologyAssignmentIntoNodeSelector(slice, psa.TopologyAssignment, nodes) ps := podset.FindPodSetByName(wl.Spec.PodSets, psa.Name) @@ -701,18 +703,20 @@ func (h *sliceHandler) handleEvent(ctx context.Context, obj client.Object, q wor log := ctrl.LoggerFrom(ctx) - owner := metav1.GetControllerOf(slice) - if owner == nil { - log.V(3).Info("Owner not found") + workloadNamespace, nsFound := slice.Annotations[core.OwnerWorkloadNamespaceAnnotation] + workloadName, nameFound := slice.Annotations[core.OwnerWorkloadNameAnnotation] + + if !nsFound || !nameFound { + log.V(3).Info("Slice is missing workload owner annotations, skipping event handling", "slice", klog.KObj(slice)) return } - log.V(3).Info("Handle Slice event", "workload", klog.KRef(slice.Namespace, slice.Name)) + log.V(3).Info("Handle Slice event", "workload", klog.KRef(workloadNamespace, workloadName)) req := reconcile.Request{ NamespacedName: types.NamespacedName{ - Name: owner.Name, - Namespace: slice.Namespace, + Name: workloadName, + Namespace: workloadNamespace, }, } diff --git a/slice/internal/core/constants.go b/slice/internal/core/constants.go index 84d49e47b..25a51b372 100644 --- a/slice/internal/core/constants.go +++ b/slice/internal/core/constants.go @@ -27,3 +27,8 @@ const ( AcceleratorTpu7x = "tpu-v7x" ) + +const ( + OwnerWorkloadNamespaceAnnotation = "slice.accelerator.gke.io/owner-workload-namespace" + OwnerWorkloadNameAnnotation = "slice.accelerator.gke.io/owner-workload-name" +) diff --git a/slice/internal/core/slice.go b/slice/internal/core/slice.go index 4cc7a66fb..9ff2c2e8a 100644 --- a/slice/internal/core/slice.go +++ b/slice/internal/core/slice.go @@ -33,8 +33,11 @@ func SliceKeyFromWorkload(wl *kueue.Workload, podSetName kueue.PodSetReference) func SliceWithMetadata(wl *kueue.Workload, podSetName kueue.PodSetReference) *v1alpha1.Slice { return &v1alpha1.Slice{ ObjectMeta: metav1.ObjectMeta{ - Name: SliceName(wl.Name, podSetName), - Namespace: wl.Namespace, + Name: SliceName(wl.Name, podSetName), + Annotations: map[string]string{ + OwnerWorkloadNamespaceAnnotation: wl.Namespace, + OwnerWorkloadNameAnnotation: wl.Name, + }, }, } } From 87d3de6160d6fb3b1b28b949940650577eb133be Mon Sep 17 00:00:00 2001 From: Dominik Pajak Date: Thu, 13 Nov 2025 13:11:35 +0000 Subject: [PATCH 02/10] Remove comment --- slice/internal/controller/workload_controller.go | 1 - 1 file changed, 1 deletion(-) diff --git a/slice/internal/controller/workload_controller.go b/slice/internal/controller/workload_controller.go index 06c35c416..d2bb466e6 100644 --- a/slice/internal/controller/workload_controller.go +++ b/slice/internal/controller/workload_controller.go @@ -530,7 +530,6 @@ func (r *WorkloadReconciler) createSlice(ctx context.Context, wl *kueue.Workload // Since Slice is a cluster-scoped object and Workload is namespaced, // we cannot set a controller owner reference. The Workload's namespace and name // are stored as annotations on the Slice for lookup. - // The garbage collection of Slices will be handled manually during Workload finalization. parseTopologyAssignmentIntoNodeSelector(slice, psa.TopologyAssignment, nodes) ps := podset.FindPodSetByName(wl.Spec.PodSets, psa.Name) From 0ef2211d75c946cdde05f3f15f5eeabba070b8a1 Mon Sep 17 00:00:00 2001 From: Dominik Pajak Date: Thu, 13 Nov 2025 13:12:45 +0000 Subject: [PATCH 03/10] Cleanup --- slice/internal/controller/indexer.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/slice/internal/controller/indexer.go b/slice/internal/controller/indexer.go index 879ce6135..f43fd59c0 100644 --- a/slice/internal/controller/indexer.go +++ b/slice/internal/controller/indexer.go @@ -30,12 +30,9 @@ import ( ) const ( - // OwnerReferenceUID is an index key for owner references. - OwnerReferenceUID = "metadata.ownerReferences.uid" - // WorkloadNamespaceIndex is an index key for the workload namespace annotation. + OwnerReferenceUID = "metadata.ownerReferences.uid" WorkloadNamespaceIndex = "workload.namespace" - // WorkloadNameIndex is an index key for the workload name annotation. - WorkloadNameIndex = "workload.name" + WorkloadNameIndex = "workload.name" ) func indexOwnerReferenceUID(obj client.Object) []string { From 95bf8f2820284cfe77dc397f7170de31933fa360 Mon Sep 17 00:00:00 2001 From: Dominik Pajak Date: Fri, 14 Nov 2025 12:13:00 +0000 Subject: [PATCH 04/10] e2e test for same name jobsets --- slice/test/e2e/jobset_test.go | 208 ++++++++++++++++++++++++++++++++++ 1 file changed, 208 insertions(+) diff --git a/slice/test/e2e/jobset_test.go b/slice/test/e2e/jobset_test.go index 07b70c1f0..82bee96f3 100644 --- a/slice/test/e2e/jobset_test.go +++ b/slice/test/e2e/jobset_test.go @@ -1067,5 +1067,213 @@ var _ = ginkgo.Describe("JobSet", func() { utils.ExpectObjectToBeDeleted(ctx, k8sClient, createdWorkload, false) }) }) + + ginkgo.It("should handle two JobSets with the same name in different namespaces", func() { + // Create two distinct namespaces for this test case + ns1 := testing.MakeNamespaceWithGenerateName("e2e-jobset-ns1-") + ns2 := testing.MakeNamespaceWithGenerateName("e2e-jobset-ns2-") + utils.MustCreate(ctx, k8sClient, ns1) + utils.MustCreate(ctx, k8sClient, ns2) + + // Defer cleanup for these new namespaces + ginkgo.DeferCleanup(func() { + gomega.Expect(utils.DeleteNamespace(ctx, k8sClient, ns1)).To(gomega.Succeed()) + gomega.Expect(utils.DeleteNamespace(ctx, k8sClient, ns2)).To(gomega.Succeed()) + }) + + // Create local queues for each namespace, referencing the shared ClusterQueue + lq1 := testing.MakeLocalQueue("lq-ns1", ns1.Name).ClusterQueue(cq.Name).Obj() + lq2 := testing.MakeLocalQueue("lq-ns2", ns2.Name).ClusterQueue(cq.Name).Obj() + utils.MustCreate(ctx, k8sClient, lq1) + utils.MustCreate(ctx, k8sClient, lq2) + ginkgo.DeferCleanup(func() { + utils.ExpectObjectToBeDeleted(ctx, k8sClient, lq1, true) + utils.ExpectObjectToBeDeleted(ctx, k8sClient, lq2, true) + }) + + const commonJobSetName = "jobset-common" + tpuTopology := "4x4x4" + tpuRequests := "4" + parallelism := int32(16) + replicas := int32(1) + wantSliceSize := int32(16) + + jobSet1 := testingjobsjobset.MakeJobSet(commonJobSetName, ns1.Name). + Queue(lq1.Name). + ReplicatedJobs( + testingjobsjobset.ReplicatedJobRequirements{ + Name: "rj1", + Image: utils.E2eTestAgnHostImage, + Args: utils.BehaviorWaitForDeletion, + Replicas: replicas, + Parallelism: parallelism, + Completions: parallelism, + PodAnnotations: map[string]string{ + "cloud.google.com/gke-tpu-topology": tpuTopology, + }, + NodeSelector: map[string]string{ + "cloud.google.com/gke-tpu-accelerator": "tpu-v7x", + }, + }, + ). + RequestAndLimit("rj1", extraResource, tpuRequests). + Obj() + + jobSet2 := testingjobsjobset.MakeJobSet(commonJobSetName, ns2.Name). + Queue(lq2.Name). + ReplicatedJobs( + testingjobsjobset.ReplicatedJobRequirements{ + Name: "rj1", + Image: utils.E2eTestAgnHostImage, + Args: utils.BehaviorWaitForDeletion, + Replicas: replicas, + Parallelism: parallelism, + Completions: parallelism, + PodAnnotations: map[string]string{ + "cloud.google.com/gke-tpu-topology": tpuTopology, + }, + NodeSelector: map[string]string{ + "cloud.google.com/gke-tpu-accelerator": "tpu-v7x", + }, + }, + ). + RequestAndLimit("rj1", extraResource, tpuRequests). + Obj() + + ginkgo.By("Creating JobSet 1", func() { + utils.MustCreate(ctx, k8sClient, jobSet1) + }) + ginkgo.By("Creating JobSet 2", func() { + utils.MustCreate(ctx, k8sClient, jobSet2) + }) + + createdJobSet1 := &jobset.JobSet{} + createdJobSet2 := &jobset.JobSet{} + + ginkgo.By("Checking that JobSet 1 is created with annotations/selectors", func() { + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(jobSet1), createdJobSet1)).To(gomega.Succeed()) + for _, replicatedJob := range createdJobSet1.Spec.ReplicatedJobs { + annotations := replicatedJob.Template.Spec.Template.Annotations + g.Expect(annotations["kueue.x-k8s.io/podset-required-topology"]).Should(gomega.Equal("cloud.google.com/gce-topology-block")) + g.Expect(annotations["kueue.x-k8s.io/podset-slice-required-topology"]).Should(gomega.Equal("cloud.google.com/gke-tpu-slice-4x4x4-id")) + g.Expect(annotations["kueue.x-k8s.io/podset-slice-size"]).Should(gomega.Equal(fmt.Sprint(wantSliceSize))) + g.Expect(replicatedJob.Template.Spec.Template.Spec.NodeSelector["cloud.google.com/gke-tpu-slice-4x4x4-health"]).Should(gomega.Equal("true")) + } + }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) + }) + + ginkgo.By("Checking that JobSet 2 is created with annotations/selectors", func() { + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(jobSet2), createdJobSet2)).To(gomega.Succeed()) + for _, replicatedJob := range createdJobSet2.Spec.ReplicatedJobs { + annotations := replicatedJob.Template.Spec.Template.Annotations + g.Expect(annotations["kueue.x-k8s.io/podset-required-topology"]).Should(gomega.Equal("cloud.google.com/gce-topology-block")) + g.Expect(annotations["kueue.x-k8s.io/podset-slice-required-topology"]).Should(gomega.Equal("cloud.google.com/gke-tpu-slice-4x4x4-id")) + g.Expect(annotations["kueue.x-k8s.io/podset-slice-size"]).Should(gomega.Equal(fmt.Sprint(wantSliceSize))) + g.Expect(replicatedJob.Template.Spec.Template.Spec.NodeSelector["cloud.google.com/gke-tpu-slice-4x4x4-health"]).Should(gomega.Equal("true")) + } + }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) + }) + + createdWorkload1 := &kueue.Workload{} + wlKey1 := types.NamespacedName{ + Name: jobsetcontroller.GetWorkloadNameForJobSet(jobSet1.Name, jobSet1.UID), + Namespace: ns1.Name, + } + createdWorkload2 := &kueue.Workload{} + wlKey2 := types.NamespacedName{ + Name: jobsetcontroller.GetWorkloadNameForJobSet(jobSet2.Name, jobSet2.UID), + Namespace: ns2.Name, + } + + ginkgo.By("Waiting for Admission of Workload 1", func() { + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, wlKey1, createdWorkload1)).Should(gomega.Succeed()) + g.Expect(createdWorkload1.Status.Admission).ShouldNot(gomega.BeNil()) + }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) + }) + + ginkgo.By("Waiting for Admission of Workload 2", func() { + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, wlKey2, createdWorkload2)).Should(gomega.Succeed()) + g.Expect(createdWorkload2.Status.Admission).ShouldNot(gomega.BeNil()) + }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) + }) + + createdSlice1 := &slice.Slice{} + sliceKey1 := core.SliceKeyFromWorkload(createdWorkload1, "rj1") + createdSlice2 := &slice.Slice{} + sliceKey2 := core.SliceKeyFromWorkload(createdWorkload2, "rj1") + + var slice1UID types.UID + + ginkgo.By("Checking that Slice 1 is created and setting to Ready", func() { + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, sliceKey1, createdSlice1)).To(gomega.Succeed()) + slice1UID = createdSlice1.GetUID() + g.Expect(createdSlice1.Spec.Topology).To(gomega.Equal(tpuTopology)) + g.Expect(createdSlice1.Spec.Type).To(gomega.Equal("tpu-v7x")) + meta.SetStatusCondition(&createdSlice1.Status.Conditions, metav1.Condition{Type: string(slice.Ready), Status: metav1.ConditionTrue, Reason: "TestReady", Message: "Slice is ready"}) + g.Expect(k8sClient.Status().Update(ctx, createdSlice1)).To(gomega.Succeed()) + }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) + }) + + ginkgo.By("Checking that Slice 2 is created and setting to Ready", func() { + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, sliceKey2, createdSlice2)).To(gomega.Succeed()) + g.Expect(createdSlice2.GetUID()).ShouldNot(gomega.Equal(slice1UID)) + g.Expect(createdSlice2.Spec.Topology).To(gomega.Equal(tpuTopology)) + g.Expect(createdSlice2.Spec.Type).To(gomega.Equal("tpu-v7x")) + meta.SetStatusCondition(&createdSlice2.Status.Conditions, metav1.Condition{Type: string(slice.Ready), Status: metav1.ConditionTrue, Reason: "TestReady", Message: "Slice is ready"}) + g.Expect(k8sClient.Status().Update(ctx, createdSlice2)).To(gomega.Succeed()) + }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) + }) + + ginkgo.By("Checking that Workload 1 is admitted and admission check status is ready", func() { + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, wlKey1, createdWorkload1)).Should(gomega.Succeed()) + g.Expect(workload.IsAdmitted(createdWorkload1)).Should(gomega.BeTrue()) + g.Expect(createdWorkload1.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ + Name: kueue.AdmissionCheckReference(ac.Name), + State: kueue.CheckStateReady, + Message: `Slices are in states: 1 Ready`, + }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) + }, utils.LongTimeout, utils.Timeout).Should(gomega.Succeed()) + }) + + ginkgo.By("Checking that Workload 2 is admitted and admission check status is ready", func() { + gomega.Eventually(func(g gomega.Gomega) { + g.Expect(k8sClient.Get(ctx, wlKey2, createdWorkload2)).Should(gomega.Succeed()) + g.Expect(workload.IsAdmitted(createdWorkload2)).Should(gomega.BeTrue()) + g.Expect(createdWorkload2.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ + Name: kueue.AdmissionCheckReference(ac.Name), + State: kueue.CheckStateReady, + Message: `Slices are in states: 1 Ready`, + }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) + }, utils.LongTimeout, utils.Timeout).Should(gomega.Succeed()) + }) + + ginkgo.By("Deleting JobSet 1", func() { + utils.ExpectObjectToBeDeleted(ctx, k8sClient, jobSet1, true) + }) + ginkgo.By("Deleting JobSet 2", func() { + utils.ExpectObjectToBeDeleted(ctx, k8sClient, jobSet2, true) + }) + + ginkgo.By("Checking that Slice 1 is deleted", func() { + utils.ExpectObjectToBeDeleted(ctx, k8sClient, createdSlice1, false) + }) + ginkgo.By("Checking that Slice 2 is deleted", func() { + utils.ExpectObjectToBeDeleted(ctx, k8sClient, createdSlice2, false) + }) + + ginkgo.By("Checking that Workload 1 is deleted", func() { + utils.ExpectObjectToBeDeleted(ctx, k8sClient, createdWorkload1, false) + }) + ginkgo.By("Checking that Workload 2 is deleted", func() { + utils.ExpectObjectToBeDeleted(ctx, k8sClient, createdWorkload2, false) + }) + }) }) }) From 041ecb722bd465883877aa75371517ecba729c02 Mon Sep 17 00:00:00 2001 From: Dominik Pajak Date: Fri, 14 Nov 2025 13:37:14 +0000 Subject: [PATCH 05/10] Fix test after merge --- slice/test/e2e/jobset_test.go | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/slice/test/e2e/jobset_test.go b/slice/test/e2e/jobset_test.go index 521b0456a..b2e1f26c9 100644 --- a/slice/test/e2e/jobset_test.go +++ b/slice/test/e2e/jobset_test.go @@ -1295,9 +1295,12 @@ var _ = ginkgo.Describe("JobSet", func() { gomega.Eventually(func(g gomega.Gomega) { g.Expect(k8sClient.Get(ctx, sliceKey1, createdSlice1)).To(gomega.Succeed()) slice1UID = createdSlice1.GetUID() - g.Expect(createdSlice1.Spec.Topology).To(gomega.Equal(tpuTopology)) - g.Expect(createdSlice1.Spec.Type).To(gomega.Equal("tpu-v7x")) - meta.SetStatusCondition(&createdSlice1.Status.Conditions, metav1.Condition{Type: string(slice.Ready), Status: metav1.ConditionTrue, Reason: "TestReady", Message: "Slice is ready"}) + meta.SetStatusCondition(&createdSlice1.Status.Conditions, metav1.Condition{ + Type: slice.SliceStateConditionType, + Status: metav1.ConditionTrue, + Reason: string(core.MMIGHealthStatusActive), + Message: "Slice is ready", + }) g.Expect(k8sClient.Status().Update(ctx, createdSlice1)).To(gomega.Succeed()) }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) }) @@ -1306,9 +1309,12 @@ var _ = ginkgo.Describe("JobSet", func() { gomega.Eventually(func(g gomega.Gomega) { g.Expect(k8sClient.Get(ctx, sliceKey2, createdSlice2)).To(gomega.Succeed()) g.Expect(createdSlice2.GetUID()).ShouldNot(gomega.Equal(slice1UID)) - g.Expect(createdSlice2.Spec.Topology).To(gomega.Equal(tpuTopology)) - g.Expect(createdSlice2.Spec.Type).To(gomega.Equal("tpu-v7x")) - meta.SetStatusCondition(&createdSlice2.Status.Conditions, metav1.Condition{Type: string(slice.Ready), Status: metav1.ConditionTrue, Reason: "TestReady", Message: "Slice is ready"}) + meta.SetStatusCondition(&createdSlice2.Status.Conditions, metav1.Condition{ + Type: slice.SliceStateConditionType, + Status: metav1.ConditionTrue, + Reason: string(core.MMIGHealthStatusActive), + Message: "Slice is ready", + }) g.Expect(k8sClient.Status().Update(ctx, createdSlice2)).To(gomega.Succeed()) }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) }) @@ -1320,7 +1326,7 @@ var _ = ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload1.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStateReady, - Message: `Slices are in states: 1 Ready`, + Message: `Slices are in states: 1 ACTIVE`, }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.LongTimeout, utils.Timeout).Should(gomega.Succeed()) }) @@ -1332,7 +1338,7 @@ var _ = ginkgo.Describe("JobSet", func() { g.Expect(createdWorkload2.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ Name: kueue.AdmissionCheckReference(ac.Name), State: kueue.CheckStateReady, - Message: `Slices are in states: 1 Ready`, + Message: `Slices are in states: 1 ACTIVE`, }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) }, utils.LongTimeout, utils.Timeout).Should(gomega.Succeed()) }) From 0092bb320807ffe9ef23ee1cca8abebfc89f2ef2 Mon Sep 17 00:00:00 2001 From: Dominik Pajak Date: Tue, 18 Nov 2025 09:52:12 +0000 Subject: [PATCH 06/10] Fix unit tests --- .../controller/workload_controller.go | 6 ++--- .../controller/workload_controller_test.go | 27 ++++++++++--------- slice/internal/core/slice.go | 6 ++--- slice/internal/util/testing/wrappers.go | 14 +++++++--- 4 files changed, 31 insertions(+), 22 deletions(-) diff --git a/slice/internal/controller/workload_controller.go b/slice/internal/controller/workload_controller.go index 3b6141037..3b5813638 100644 --- a/slice/internal/controller/workload_controller.go +++ b/slice/internal/controller/workload_controller.go @@ -465,7 +465,7 @@ func (r *WorkloadReconciler) syncSlices( continue } - sliceName := core.SliceName(wl.Name, psa.Name) + sliceName := core.SliceName(wl.Namespace, wl.Name, psa.Name) if _, exist := slicesByName[sliceName]; exist { // Slice already exists, nothing to do. @@ -523,7 +523,7 @@ func (r *WorkloadReconciler) createSlice(ctx context.Context, wl *kueue.Workload slice.Spec.Topology = core.GetTPUTopology(ps.Template) if err := r.client.Create(ctx, slice); err != nil { - msg := fmt.Sprintf("Error creating Slice %q: %v", client.ObjectKeyFromObject(slice), err) + msg := fmt.Sprintf("Error creating Slice %q: %v", slice.Name, err) log.Error(err, msg) r.record.Event(wl, corev1.EventTypeWarning, FailedCreateSliceEventType, api.TruncateEventMessage(msg)) ac.State = kueue.CheckStatePending @@ -548,7 +548,7 @@ func (r *WorkloadReconciler) updateWorkloadAdmissionCheckStatus(ctx context.Cont func buildCreationEventMessage(slices []v1alpha1.Slice) string { sliceNames := make([]string, len(slices)) for index, slice := range slices { - sliceNames[index] = fmt.Sprintf("%q", client.ObjectKeyFromObject(&slice)) + sliceNames[index] = slice.Name } sort.Strings(sliceNames) return fmt.Sprintf("The Slices %s have been created", strings.Join(sliceNames, ", ")) diff --git a/slice/internal/controller/workload_controller_test.go b/slice/internal/controller/workload_controller_test.go index 9b5db9a3a..58518d26d 100644 --- a/slice/internal/controller/workload_controller_test.go +++ b/slice/internal/controller/workload_controller_test.go @@ -133,14 +133,15 @@ func TestWorkloadReconciler(t *testing.T) { baseWorkloadWrapper := utiltesting.MakeWorkload(baseWorkloadName, corev1.NamespaceDefault). UID(baseWorkloadName). AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, "")) - baseSlice1Wrapper := utiltesting.MakeSliceWrapper(core.SliceName(baseWorkloadName, "ps1"), corev1.NamespaceDefault). + baseSlice1Wrapper := utiltesting.MakeSliceWrapper(core.SliceName(corev1.NamespaceDefault, baseWorkloadName, "ps1")). Type("tpu-v7x"). Topology("4x4x12"). - ControllerReference(workloadGVK, baseWorkloadName, baseWorkloadName). + OwnerWorkloadAnnotations(corev1.NamespaceDefault, baseWorkloadName). PartitionIds("subblock1") - baseSlice2Wrapper := baseSlice1Wrapper.Clone().Name(core.SliceName(baseWorkloadName, "ps2")). + baseSlice2Wrapper := baseSlice1Wrapper.Clone().Name(core.SliceName(corev1.NamespaceDefault, baseWorkloadName, "ps2")). Type("tpu-v7x"). Topology("4x4x12"). + OwnerWorkloadAnnotations(corev1.NamespaceDefault, baseWorkloadName). PartitionIds("subblock2") worker1Node := utiltesting.MakeNode("worker1").Label("cloud.google.com/gke-tpu-slice-4x4x4-id", "subblock1") @@ -738,7 +739,7 @@ func TestWorkloadReconciler(t *testing.T) { }, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.EventTypeNormal, SlicesCreatedEventType, - `The Slices "default/workload-ps1", "default/workload-ps2" have been created`), + `The Slices default-workload-ps1, default-workload-ps2 have been created`), }, }, "should create Slices only for relevant PodSets (invalid pod template)": { @@ -790,7 +791,7 @@ func TestWorkloadReconciler(t *testing.T) { }, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.EventTypeNormal, SlicesCreatedEventType, - `The Slices "default/workload-ps1" have been created`), + `The Slices default-workload-ps1 have been created`), }, }, "should create Slices only for relevant PodSets (invalid assignment)": { @@ -838,7 +839,7 @@ func TestWorkloadReconciler(t *testing.T) { }, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.EventTypeNormal, SlicesCreatedEventType, - `The Slices "default/workload-ps1" have been created`), + `The Slices default-workload-ps1 have been created`), }, }, "should create missed Slices": { @@ -870,7 +871,7 @@ func TestWorkloadReconciler(t *testing.T) { }, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.EventTypeNormal, SlicesCreatedEventType, - `The Slices "default/workload-ps2" have been created`), + `The Slices default-workload-ps2 have been created`), }, }, "parse TAS Assignment to populate PartitionIDs in Slice": { @@ -901,7 +902,7 @@ func TestWorkloadReconciler(t *testing.T) { }, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.EventTypeNormal, SlicesCreatedEventType, - `The Slices "default/workload-ps1", "default/workload-ps2" have been created`), + `The Slices default-workload-ps1, default-workload-ps2 have been created`), }, }, "parse TAS Assignment to populate NodeSelector in Slice (hostname)": { @@ -966,7 +967,7 @@ func TestWorkloadReconciler(t *testing.T) { }, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.EventTypeNormal, SlicesCreatedEventType, - `The Slices "default/workload-ps1", "default/workload-ps2" have been created`), + `The Slices default-workload-ps1, default-workload-ps2 have been created`), }, }, "error on Slice creation": { @@ -994,13 +995,13 @@ func TestWorkloadReconciler(t *testing.T) { ReserveQuota(baseAdmission, now). ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). Finalizers(SliceControllerName). - AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Error creating Slice "default/workload-ps1": test error`)). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Error creating Slice "default-workload-ps1": test error`)). Obj(), }, wantErr: errTest, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.EventTypeWarning, FailedCreateSliceEventType, - `Error creating Slice "default/workload-ps1": test error`), + `Error creating Slice "default-workload-ps1": test error`), }, }, "should update the Workload's AdmissionCheckState": { @@ -1357,8 +1358,8 @@ func TestSliceHandlerHandleEvent(t *testing.T) { obj: utiltesting.MakeWorkload(baseWlName, corev1.NamespaceDefault).Obj(), }, "has a workload that should be handled": { - obj: utiltesting.MakeSliceWrapper(baseSliceName, corev1.NamespaceDefault). - ControllerReference(workloadGVK, baseWlName, baseWlName). + obj: utiltesting.MakeSliceWrapper(baseSliceName). + OwnerWorkloadAnnotations(corev1.NamespaceDefault, baseWlName). Obj(), want: []requestDuration{ { diff --git a/slice/internal/core/slice.go b/slice/internal/core/slice.go index fdf1735f2..7f8777014 100644 --- a/slice/internal/core/slice.go +++ b/slice/internal/core/slice.go @@ -38,7 +38,7 @@ func SliceKeyFromWorkload(wl *kueue.Workload, podSetName kueue.PodSetReference) func SliceWithMetadata(wl *kueue.Workload, podSetName kueue.PodSetReference) *v1alpha1.Slice { return &v1alpha1.Slice{ ObjectMeta: metav1.ObjectMeta{ - Name: SliceName(wl.Name, podSetName), + Name: SliceName(wl.Namespace, wl.Name, podSetName), Annotations: map[string]string{ OwnerWorkloadNamespaceAnnotation: wl.Namespace, OwnerWorkloadNameAnnotation: wl.Name, @@ -47,8 +47,8 @@ func SliceWithMetadata(wl *kueue.Workload, podSetName kueue.PodSetReference) *v1 } } -func SliceName(workloadName string, podSetName kueue.PodSetReference) string { - return fmt.Sprintf("%s-%s", workloadName, podSetName) +func SliceName(ns string, workloadName string, podSetName kueue.PodSetReference) string { + return fmt.Sprintf("%s-%s-%s", ns, workloadName, podSetName) } func isStale(slice *v1alpha1.Slice) bool { diff --git a/slice/internal/util/testing/wrappers.go b/slice/internal/util/testing/wrappers.go index 31aa4c06b..63c35edb1 100644 --- a/slice/internal/util/testing/wrappers.go +++ b/slice/internal/util/testing/wrappers.go @@ -250,12 +250,11 @@ type SliceWrapper struct { v1alpha1.Slice } -func MakeSliceWrapper(name, namespace string) *SliceWrapper { +func MakeSliceWrapper(name string) *SliceWrapper { return &SliceWrapper{ v1alpha1.Slice{ ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, + Name: name, }, }, } @@ -289,6 +288,15 @@ func (s *SliceWrapper) ControllerReference(gvk schema.GroupVersionKind, name, ui return s } +func (s *SliceWrapper) OwnerWorkloadAnnotations(ns, name string) *SliceWrapper { + if s.Annotations == nil { + s.Annotations = make(map[string]string) + } + s.Annotations["slice.accelerator.gke.io/owner-workload-name"] = name + s.Annotations["slice.accelerator.gke.io/owner-workload-namespace"] = ns + return s +} + func (s *SliceWrapper) PartitionIds(ids ...string) *SliceWrapper { s.Spec.PartitionIds = ids return s From 95845a479000287bafcba0521d0c71fcbb5c21e7 Mon Sep 17 00:00:00 2001 From: Dominik Pajak Date: Tue, 18 Nov 2025 11:28:20 +0000 Subject: [PATCH 07/10] Add test with same-name workloads --- .../controller/workload_controller_test.go | 119 +++++++++++++++--- 1 file changed, 103 insertions(+), 16 deletions(-) diff --git a/slice/internal/controller/workload_controller_test.go b/slice/internal/controller/workload_controller_test.go index c4d978362..c8d6f9e05 100644 --- a/slice/internal/controller/workload_controller_test.go +++ b/slice/internal/controller/workload_controller_test.go @@ -59,9 +59,8 @@ var ( ) var ( - jobSetGVK = jobset.SchemeGroupVersion.WithKind("JobSet") - jobGVK = batchv1.SchemeGroupVersion.WithKind("Job") - workloadGVK = kueue.SchemeGroupVersion.WithKind("Workload") + jobSetGVK = jobset.SchemeGroupVersion.WithKind("JobSet") + jobGVK = batchv1.SchemeGroupVersion.WithKind("Job") ) func TestWorkloadReconciler(t *testing.T) { @@ -85,9 +84,9 @@ func TestWorkloadReconciler(t *testing.T) { } } - buildEventRecord := func(eventType, reason, message string) utiltesting.EventRecord { + buildEventRecord := func(namespace, eventType, reason, message string) utiltesting.EventRecord { return utiltesting.EventRecord{ - Key: client.ObjectKey{Namespace: corev1.NamespaceDefault, Name: baseWorkloadName}, + Key: client.ObjectKey{Namespace: namespace, Name: baseWorkloadName}, EventType: eventType, Reason: reason, Message: message, @@ -738,7 +737,7 @@ func TestWorkloadReconciler(t *testing.T) { *baseSlice2Wrapper.DeepCopy(), }, wantEvents: []utiltesting.EventRecord{ - buildEventRecord(corev1.EventTypeNormal, SlicesCreatedEventType, + buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, SlicesCreatedEventType, `The Slices default-workload-ps1, default-workload-ps2 have been created`), }, }, @@ -790,7 +789,7 @@ func TestWorkloadReconciler(t *testing.T) { *baseSlice1Wrapper.DeepCopy(), }, wantEvents: []utiltesting.EventRecord{ - buildEventRecord(corev1.EventTypeNormal, SlicesCreatedEventType, + buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, SlicesCreatedEventType, `The Slices default-workload-ps1 have been created`), }, }, @@ -838,7 +837,7 @@ func TestWorkloadReconciler(t *testing.T) { *baseSlice1Wrapper.DeepCopy(), }, wantEvents: []utiltesting.EventRecord{ - buildEventRecord(corev1.EventTypeNormal, SlicesCreatedEventType, + buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, SlicesCreatedEventType, `The Slices default-workload-ps1 have been created`), }, }, @@ -870,7 +869,7 @@ func TestWorkloadReconciler(t *testing.T) { *baseSlice2Wrapper.DeepCopy(), }, wantEvents: []utiltesting.EventRecord{ - buildEventRecord(corev1.EventTypeNormal, SlicesCreatedEventType, + buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, SlicesCreatedEventType, `The Slices default-workload-ps2 have been created`), }, }, @@ -901,7 +900,7 @@ func TestWorkloadReconciler(t *testing.T) { *baseSlice2Wrapper.DeepCopy(), }, wantEvents: []utiltesting.EventRecord{ - buildEventRecord(corev1.EventTypeNormal, SlicesCreatedEventType, + buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, SlicesCreatedEventType, `The Slices default-workload-ps1, default-workload-ps2 have been created`), }, }, @@ -966,7 +965,7 @@ func TestWorkloadReconciler(t *testing.T) { *baseSlice2Wrapper.DeepCopy(), }, wantEvents: []utiltesting.EventRecord{ - buildEventRecord(corev1.EventTypeNormal, SlicesCreatedEventType, + buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, SlicesCreatedEventType, `The Slices default-workload-ps1, default-workload-ps2 have been created`), }, }, @@ -1000,7 +999,7 @@ func TestWorkloadReconciler(t *testing.T) { }, wantErr: errTest, wantEvents: []utiltesting.EventRecord{ - buildEventRecord(corev1.EventTypeWarning, FailedCreateSliceEventType, + buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeWarning, FailedCreateSliceEventType, `Error creating Slice "default-workload-ps1": test error`), }, }, @@ -1149,7 +1148,7 @@ func TestWorkloadReconciler(t *testing.T) { *baseSlice2Wrapper.Clone().Active().Obj(), }, wantEvents: []utiltesting.EventRecord{ - buildEventRecord(corev1.EventTypeNormal, AdmissionCheckUpdatedEventType, + buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, AdmissionCheckUpdatedEventType, fmt.Sprintf(`Admission check %q updated state from "Pending" to "Ready"`, baseACName)), }, }, @@ -1181,7 +1180,7 @@ func TestWorkloadReconciler(t *testing.T) { *baseSlice1Wrapper.Clone().Active().Obj(), *baseSlice2Wrapper.Clone().Degraded().Obj()}, wantEvents: []utiltesting.EventRecord{ - buildEventRecord(corev1.EventTypeNormal, AdmissionCheckUpdatedEventType, + buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, AdmissionCheckUpdatedEventType, fmt.Sprintf(`Admission check %q updated state from "Pending" to "Ready"`, baseACName)), }, }, @@ -1213,7 +1212,7 @@ func TestWorkloadReconciler(t *testing.T) { *baseSlice1Wrapper.Clone().Active().Obj(), }, wantEvents: []utiltesting.EventRecord{ - buildEventRecord(corev1.EventTypeNormal, AdmissionCheckUpdatedEventType, + buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, AdmissionCheckUpdatedEventType, fmt.Sprintf(`Admission check %q updated state from "Pending" to "Retry"`, baseACName)), }, }, @@ -1275,10 +1274,98 @@ func TestWorkloadReconciler(t *testing.T) { *baseSlice2Wrapper.Clone().Active().Obj(), }, wantEvents: []utiltesting.EventRecord{ - buildEventRecord(corev1.EventTypeNormal, AdmissionCheckUpdatedEventType, + buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, AdmissionCheckUpdatedEventType, fmt.Sprintf(`Admission check %q updated state from "Pending" to "Ready"`, baseACName)), }, }, + "should create a slice for another workload with the same name but in a different namespace": { + request: types.NamespacedName{Name: baseWorkloadName, Namespace: "namespace2"}, + objs: []client.Object{ + worker1Node.DeepCopy(), + worker2Node.DeepCopy(), + baseAdmissionCheckWrapper.DeepCopy(), + utiltesting.MakeWorkload(baseWorkloadName, "namespace1"). + UID(baseWorkloadName+"-ns1"). + PodSets(basePodSets...). + ReserveQuota(baseAdmission, now). + ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). + Finalizers(SliceControllerName). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStateReady, `Slices are in states: 2 ACTIVE`)). + Obj(), + utiltesting.MakeSliceWrapper(core.SliceName("namespace1", baseWorkloadName, "ps1")). + Type(slice.TypeTpu7x). + Topology("4x4x12"). + OwnerWorkloadAnnotations("namespace1", baseWorkloadName). + PartitionIds("subblock1"). + Active(). + Obj(), + utiltesting.MakeSliceWrapper(core.SliceName("namespace1", baseWorkloadName, "ps2")). + Type(slice.TypeTpu7x). + Topology("4x4x12"). + OwnerWorkloadAnnotations("namespace1", baseWorkloadName). + PartitionIds("subblock2"). + Active(). + Obj(), + utiltesting.MakeWorkload(baseWorkloadName, "namespace2"). + UID(baseWorkloadName+"-ns2"). + PodSets(basePodSets...). + ReserveQuota(baseAdmission, now). + ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). + Finalizers(SliceControllerName). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, "")). + Obj(), + }, + wantWorkloads: []kueue.Workload{ + *utiltesting.MakeWorkload(baseWorkloadName, "namespace1"). + UID(baseWorkloadName+"-ns1"). + PodSets(basePodSets...). + ReserveQuota(baseAdmission, now). + ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). + Finalizers(SliceControllerName). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStateReady, `Slices are in states: 2 ACTIVE`)). + Obj(), + *utiltesting.MakeWorkload(baseWorkloadName, "namespace2"). + UID(baseWorkloadName+"-ns2"). + PodSets(basePodSets...). + ReserveQuota(baseAdmission, now). + ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName). + Finalizers(SliceControllerName). + AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, `Slices are in states: 2 CREATED`)). + Obj(), + }, + wantSlices: []slice.Slice{ + *utiltesting.MakeSliceWrapper(core.SliceName("namespace1", baseWorkloadName, "ps1")). + Type(slice.TypeTpu7x). + Topology("4x4x12"). + OwnerWorkloadAnnotations("namespace1", baseWorkloadName). + PartitionIds("subblock1"). + Active(). + Obj(), + *utiltesting.MakeSliceWrapper(core.SliceName("namespace1", baseWorkloadName, "ps2")). + Type(slice.TypeTpu7x). + Topology("4x4x12"). + OwnerWorkloadAnnotations("namespace1", baseWorkloadName). + PartitionIds("subblock2"). + Active(). + Obj(), + *utiltesting.MakeSliceWrapper(core.SliceName("namespace2", baseWorkloadName, "ps1")). + Type(slice.TypeTpu7x). + Topology("4x4x12"). + OwnerWorkloadAnnotations("namespace2", baseWorkloadName). + PartitionIds("subblock1"). + Obj(), + *utiltesting.MakeSliceWrapper(core.SliceName("namespace2", baseWorkloadName, "ps2")). + Type(slice.TypeTpu7x). + Topology("4x4x12"). + OwnerWorkloadAnnotations("namespace2", baseWorkloadName). + PartitionIds("subblock2"). + Obj(), + }, + wantEvents: []utiltesting.EventRecord{ + buildEventRecord("namespace2", corev1.EventTypeNormal, SlicesCreatedEventType, + `The Slices namespace2-workload-ps1, namespace2-workload-ps2 have been created`), + }, + }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { From 2a0efdbade7f24e5cbd153c8387c035c4b72ae76 Mon Sep 17 00:00:00 2001 From: Dominik Pajak Date: Tue, 18 Nov 2025 11:29:53 +0000 Subject: [PATCH 08/10] Remove redundant test --- slice/test/e2e/jobset_test.go | 214 ---------------------------------- 1 file changed, 214 deletions(-) diff --git a/slice/test/e2e/jobset_test.go b/slice/test/e2e/jobset_test.go index 22b575226..e6160eb4c 100644 --- a/slice/test/e2e/jobset_test.go +++ b/slice/test/e2e/jobset_test.go @@ -1150,219 +1150,5 @@ var _ = ginkgo.Describe("JobSet", func() { utils.ExpectObjectToBeDeleted(ctx, k8sClient, createdWorkload, false) }) }) - - ginkgo.It("should handle two JobSets with the same name in different namespaces", func() { - // Create two distinct namespaces for this test case - ns1 := testing.MakeNamespaceWithGenerateName("e2e-jobset-ns1-") - ns2 := testing.MakeNamespaceWithGenerateName("e2e-jobset-ns2-") - utils.MustCreate(ctx, k8sClient, ns1) - utils.MustCreate(ctx, k8sClient, ns2) - - // Defer cleanup for these new namespaces - ginkgo.DeferCleanup(func() { - gomega.Expect(utils.DeleteNamespace(ctx, k8sClient, ns1)).To(gomega.Succeed()) - gomega.Expect(utils.DeleteNamespace(ctx, k8sClient, ns2)).To(gomega.Succeed()) - }) - - // Create local queues for each namespace, referencing the shared ClusterQueue - lq1 := testing.MakeLocalQueue("lq-ns1", ns1.Name).ClusterQueue(cq.Name).Obj() - lq2 := testing.MakeLocalQueue("lq-ns2", ns2.Name).ClusterQueue(cq.Name).Obj() - utils.MustCreate(ctx, k8sClient, lq1) - utils.MustCreate(ctx, k8sClient, lq2) - ginkgo.DeferCleanup(func() { - utils.ExpectObjectToBeDeleted(ctx, k8sClient, lq1, true) - utils.ExpectObjectToBeDeleted(ctx, k8sClient, lq2, true) - }) - - const commonJobSetName = "jobset-common" - tpuTopology := "4x4x4" - tpuRequests := "4" - parallelism := int32(16) - replicas := int32(1) - wantSliceSize := int32(16) - - jobSet1 := testingjobsjobset.MakeJobSet(commonJobSetName, ns1.Name). - Queue(lq1.Name). - ReplicatedJobs( - testingjobsjobset.ReplicatedJobRequirements{ - Name: "rj1", - Image: utils.E2eTestAgnHostImage, - Args: utils.BehaviorWaitForDeletion, - Replicas: replicas, - Parallelism: parallelism, - Completions: parallelism, - PodAnnotations: map[string]string{ - "cloud.google.com/gke-tpu-topology": tpuTopology, - }, - NodeSelector: map[string]string{ - "cloud.google.com/gke-tpu-accelerator": "tpu-v7x", - }, - }, - ). - RequestAndLimit("rj1", extraResource, tpuRequests). - Obj() - - jobSet2 := testingjobsjobset.MakeJobSet(commonJobSetName, ns2.Name). - Queue(lq2.Name). - ReplicatedJobs( - testingjobsjobset.ReplicatedJobRequirements{ - Name: "rj1", - Image: utils.E2eTestAgnHostImage, - Args: utils.BehaviorWaitForDeletion, - Replicas: replicas, - Parallelism: parallelism, - Completions: parallelism, - PodAnnotations: map[string]string{ - "cloud.google.com/gke-tpu-topology": tpuTopology, - }, - NodeSelector: map[string]string{ - "cloud.google.com/gke-tpu-accelerator": "tpu-v7x", - }, - }, - ). - RequestAndLimit("rj1", extraResource, tpuRequests). - Obj() - - ginkgo.By("Creating JobSet 1", func() { - utils.MustCreate(ctx, k8sClient, jobSet1) - }) - ginkgo.By("Creating JobSet 2", func() { - utils.MustCreate(ctx, k8sClient, jobSet2) - }) - - createdJobSet1 := &jobset.JobSet{} - createdJobSet2 := &jobset.JobSet{} - - ginkgo.By("Checking that JobSet 1 is created with annotations/selectors", func() { - gomega.Eventually(func(g gomega.Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(jobSet1), createdJobSet1)).To(gomega.Succeed()) - for _, replicatedJob := range createdJobSet1.Spec.ReplicatedJobs { - annotations := replicatedJob.Template.Spec.Template.Annotations - g.Expect(annotations["kueue.x-k8s.io/podset-required-topology"]).Should(gomega.Equal("cloud.google.com/gce-topology-block")) - g.Expect(annotations["kueue.x-k8s.io/podset-slice-required-topology"]).Should(gomega.Equal("cloud.google.com/gke-tpu-slice-4x4x4-id")) - g.Expect(annotations["kueue.x-k8s.io/podset-slice-size"]).Should(gomega.Equal(fmt.Sprint(wantSliceSize))) - g.Expect(replicatedJob.Template.Spec.Template.Spec.NodeSelector["cloud.google.com/gke-tpu-slice-4x4x4-health"]).Should(gomega.Equal("true")) - } - }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) - }) - - ginkgo.By("Checking that JobSet 2 is created with annotations/selectors", func() { - gomega.Eventually(func(g gomega.Gomega) { - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(jobSet2), createdJobSet2)).To(gomega.Succeed()) - for _, replicatedJob := range createdJobSet2.Spec.ReplicatedJobs { - annotations := replicatedJob.Template.Spec.Template.Annotations - g.Expect(annotations["kueue.x-k8s.io/podset-required-topology"]).Should(gomega.Equal("cloud.google.com/gce-topology-block")) - g.Expect(annotations["kueue.x-k8s.io/podset-slice-required-topology"]).Should(gomega.Equal("cloud.google.com/gke-tpu-slice-4x4x4-id")) - g.Expect(annotations["kueue.x-k8s.io/podset-slice-size"]).Should(gomega.Equal(fmt.Sprint(wantSliceSize))) - g.Expect(replicatedJob.Template.Spec.Template.Spec.NodeSelector["cloud.google.com/gke-tpu-slice-4x4x4-health"]).Should(gomega.Equal("true")) - } - }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) - }) - - createdWorkload1 := &kueue.Workload{} - wlKey1 := types.NamespacedName{ - Name: jobsetcontroller.GetWorkloadNameForJobSet(jobSet1.Name, jobSet1.UID), - Namespace: ns1.Name, - } - createdWorkload2 := &kueue.Workload{} - wlKey2 := types.NamespacedName{ - Name: jobsetcontroller.GetWorkloadNameForJobSet(jobSet2.Name, jobSet2.UID), - Namespace: ns2.Name, - } - - ginkgo.By("Waiting for Admission of Workload 1", func() { - gomega.Eventually(func(g gomega.Gomega) { - g.Expect(k8sClient.Get(ctx, wlKey1, createdWorkload1)).Should(gomega.Succeed()) - g.Expect(createdWorkload1.Status.Admission).ShouldNot(gomega.BeNil()) - }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) - }) - - ginkgo.By("Waiting for Admission of Workload 2", func() { - gomega.Eventually(func(g gomega.Gomega) { - g.Expect(k8sClient.Get(ctx, wlKey2, createdWorkload2)).Should(gomega.Succeed()) - g.Expect(createdWorkload2.Status.Admission).ShouldNot(gomega.BeNil()) - }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) - }) - - createdSlice1 := &slice.Slice{} - sliceKey1 := core.SliceKeyFromWorkload(createdWorkload1, "rj1") - createdSlice2 := &slice.Slice{} - sliceKey2 := core.SliceKeyFromWorkload(createdWorkload2, "rj1") - - var slice1UID types.UID - - ginkgo.By("Checking that Slice 1 is created and setting to Ready", func() { - gomega.Eventually(func(g gomega.Gomega) { - g.Expect(k8sClient.Get(ctx, sliceKey1, createdSlice1)).To(gomega.Succeed()) - slice1UID = createdSlice1.GetUID() - meta.SetStatusCondition(&createdSlice1.Status.Conditions, metav1.Condition{ - Type: slice.SliceStateConditionType, - Status: metav1.ConditionTrue, - Reason: string(core.MMIGHealthStatusActive), - Message: "Slice is ready", - }) - g.Expect(k8sClient.Status().Update(ctx, createdSlice1)).To(gomega.Succeed()) - }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) - }) - - ginkgo.By("Checking that Slice 2 is created and setting to Ready", func() { - gomega.Eventually(func(g gomega.Gomega) { - g.Expect(k8sClient.Get(ctx, sliceKey2, createdSlice2)).To(gomega.Succeed()) - g.Expect(createdSlice2.GetUID()).ShouldNot(gomega.Equal(slice1UID)) - meta.SetStatusCondition(&createdSlice2.Status.Conditions, metav1.Condition{ - Type: slice.SliceStateConditionType, - Status: metav1.ConditionTrue, - Reason: string(core.MMIGHealthStatusActive), - Message: "Slice is ready", - }) - g.Expect(k8sClient.Status().Update(ctx, createdSlice2)).To(gomega.Succeed()) - }, utils.Timeout, utils.Interval).Should(gomega.Succeed()) - }) - - ginkgo.By("Checking that Workload 1 is admitted and admission check status is ready", func() { - gomega.Eventually(func(g gomega.Gomega) { - g.Expect(k8sClient.Get(ctx, wlKey1, createdWorkload1)).Should(gomega.Succeed()) - g.Expect(workload.IsAdmitted(createdWorkload1)).Should(gomega.BeTrue()) - g.Expect(createdWorkload1.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ - Name: kueue.AdmissionCheckReference(ac.Name), - State: kueue.CheckStateReady, - Message: `Slices are in states: 1 ACTIVE`, - }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) - }, utils.LongTimeout, utils.Timeout).Should(gomega.Succeed()) - }) - - ginkgo.By("Checking that Workload 2 is admitted and admission check status is ready", func() { - gomega.Eventually(func(g gomega.Gomega) { - g.Expect(k8sClient.Get(ctx, wlKey2, createdWorkload2)).Should(gomega.Succeed()) - g.Expect(workload.IsAdmitted(createdWorkload2)).Should(gomega.BeTrue()) - g.Expect(createdWorkload2.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{ - Name: kueue.AdmissionCheckReference(ac.Name), - State: kueue.CheckStateReady, - Message: `Slices are in states: 1 ACTIVE`, - }}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates"))) - }, utils.LongTimeout, utils.Timeout).Should(gomega.Succeed()) - }) - - ginkgo.By("Deleting JobSet 1", func() { - utils.ExpectObjectToBeDeleted(ctx, k8sClient, jobSet1, true) - }) - ginkgo.By("Deleting JobSet 2", func() { - utils.ExpectObjectToBeDeleted(ctx, k8sClient, jobSet2, true) - }) - - ginkgo.By("Checking that Slice 1 is deleted", func() { - utils.ExpectObjectToBeDeleted(ctx, k8sClient, createdSlice1, false) - }) - ginkgo.By("Checking that Slice 2 is deleted", func() { - utils.ExpectObjectToBeDeleted(ctx, k8sClient, createdSlice2, false) - }) - - ginkgo.By("Checking that Workload 1 is deleted", func() { - utils.ExpectObjectToBeDeleted(ctx, k8sClient, createdWorkload1, false) - }) - ginkgo.By("Checking that Workload 2 is deleted", func() { - utils.ExpectObjectToBeDeleted(ctx, k8sClient, createdWorkload2, false) - }) - }) }) }) From 29f4200a1e99750d13edb3d1bd8ffd6060c10586 Mon Sep 17 00:00:00 2001 From: Dominik Pajak Date: Tue, 18 Nov 2025 11:39:59 +0000 Subject: [PATCH 09/10] Fix quotes --- slice/internal/controller/workload_controller.go | 2 +- .../controller/workload_controller_test.go | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/slice/internal/controller/workload_controller.go b/slice/internal/controller/workload_controller.go index 3b5813638..91305a8a1 100644 --- a/slice/internal/controller/workload_controller.go +++ b/slice/internal/controller/workload_controller.go @@ -548,7 +548,7 @@ func (r *WorkloadReconciler) updateWorkloadAdmissionCheckStatus(ctx context.Cont func buildCreationEventMessage(slices []v1alpha1.Slice) string { sliceNames := make([]string, len(slices)) for index, slice := range slices { - sliceNames[index] = slice.Name + sliceNames[index] = fmt.Sprintf("%q", slice.Name) } sort.Strings(sliceNames) return fmt.Sprintf("The Slices %s have been created", strings.Join(sliceNames, ", ")) diff --git a/slice/internal/controller/workload_controller_test.go b/slice/internal/controller/workload_controller_test.go index c8d6f9e05..a7624e650 100644 --- a/slice/internal/controller/workload_controller_test.go +++ b/slice/internal/controller/workload_controller_test.go @@ -738,7 +738,7 @@ func TestWorkloadReconciler(t *testing.T) { }, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, SlicesCreatedEventType, - `The Slices default-workload-ps1, default-workload-ps2 have been created`), + `The Slices "default-workload-ps1", "default-workload-ps2" have been created`), }, }, "should create Slices only for relevant PodSets (invalid pod template)": { @@ -790,7 +790,7 @@ func TestWorkloadReconciler(t *testing.T) { }, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, SlicesCreatedEventType, - `The Slices default-workload-ps1 have been created`), + `The Slices "default-workload-ps1" have been created`), }, }, "should create Slices only for relevant PodSets (invalid assignment)": { @@ -838,7 +838,7 @@ func TestWorkloadReconciler(t *testing.T) { }, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, SlicesCreatedEventType, - `The Slices default-workload-ps1 have been created`), + `The Slices "default-workload-ps1" have been created`), }, }, "should create missed Slices": { @@ -870,7 +870,7 @@ func TestWorkloadReconciler(t *testing.T) { }, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, SlicesCreatedEventType, - `The Slices default-workload-ps2 have been created`), + `The Slices "default-workload-ps2" have been created`), }, }, "parse TAS Assignment to populate PartitionIDs in Slice": { @@ -901,7 +901,7 @@ func TestWorkloadReconciler(t *testing.T) { }, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, SlicesCreatedEventType, - `The Slices default-workload-ps1, default-workload-ps2 have been created`), + `The Slices "default-workload-ps1", "default-workload-ps2" have been created`), }, }, "parse TAS Assignment to populate NodeSelector in Slice (hostname)": { @@ -966,7 +966,7 @@ func TestWorkloadReconciler(t *testing.T) { }, wantEvents: []utiltesting.EventRecord{ buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, SlicesCreatedEventType, - `The Slices default-workload-ps1, default-workload-ps2 have been created`), + `The Slices "default-workload-ps1", "default-workload-ps2" have been created`), }, }, "error on Slice creation": { @@ -1363,7 +1363,7 @@ func TestWorkloadReconciler(t *testing.T) { }, wantEvents: []utiltesting.EventRecord{ buildEventRecord("namespace2", corev1.EventTypeNormal, SlicesCreatedEventType, - `The Slices namespace2-workload-ps1, namespace2-workload-ps2 have been created`), + `The Slices "namespace2-workload-ps1", "namespace2-workload-ps2" have been created`), }, }, } From 9b4e0ffde79d0900b41ce87638be5e79ed6f7a82 Mon Sep 17 00:00:00 2001 From: Dominik Pajak Date: Mon, 24 Nov 2025 10:24:17 +0000 Subject: [PATCH 10/10] Replace string with const --- slice/internal/util/testing/wrappers.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/slice/internal/util/testing/wrappers.go b/slice/internal/util/testing/wrappers.go index 93decca72..68ebaa52a 100644 --- a/slice/internal/util/testing/wrappers.go +++ b/slice/internal/util/testing/wrappers.go @@ -292,8 +292,8 @@ func (s *SliceWrapper) OwnerWorkloadAnnotations(ns, name string) *SliceWrapper { if s.Annotations == nil { s.Annotations = make(map[string]string) } - s.Annotations["slice.accelerator.gke.io/owner-workload-name"] = name - s.Annotations["slice.accelerator.gke.io/owner-workload-namespace"] = ns + s.Annotations[core.OwnerWorkloadNameAnnotation] = name + s.Annotations[core.OwnerWorkloadNamespaceAnnotation] = ns return s }