Skip to content

Commit e58527f

Browse files
authored
[slice] Cluster scoped (#827)
* slice becomes cluster-scoped * Remove comment * Cleanup * e2e test for same name jobsets * Fix test after merge * Fix unit tests * Add test with same-name workloads * Remove redundant test * Fix quotes * Replace string with const
1 parent 58a23c2 commit e58527f

File tree

8 files changed

+189
-55
lines changed

8 files changed

+189
-55
lines changed

slice/api/v1alpha1/slice_types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ type SliceStatus struct {
5959

6060
// +kubebuilder:object:root=true
6161
// +kubebuilder:subresource:status
62+
// +kubebuilder:resource:scope=Cluster
6263
// +kubebuilder:printcolumn:name="Type",type=string,JSONPath=`.spec.type`
6364
// +kubebuilder:printcolumn:name="Topology",type=string,JSONPath=`.spec.topology`
6465
// +kubebuilder:printcolumn:name="State",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].reason`

slice/config/crd/bases/slice.accelerator.gke.io_slices.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ spec:
1212
listKind: SliceList
1313
plural: slices
1414
singular: slice
15-
scope: Namespaced
15+
scope: Cluster
1616
versions:
1717
- additionalPrinterColumns:
1818
- jsonPath: .spec.type

slice/internal/controller/indexer.go

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,24 +25,50 @@ import (
2525
kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
2626

2727
"tpu-slice-controller/api/v1alpha1"
28+
"tpu-slice-controller/internal/core"
2829
"tpu-slice-controller/internal/util/slices"
2930
)
3031

3132
const (
32-
OwnerReferenceUID = "metadata.ownerReferences.uid"
33+
OwnerReferenceUID = "metadata.ownerReferences.uid"
34+
WorkloadNamespaceIndex = "workload.namespace"
35+
WorkloadNameIndex = "workload.name"
3336
)
3437

3538
func indexOwnerReferenceUID(obj client.Object) []string {
3639
return slices.Map(obj.GetOwnerReferences(), func(o *metav1.OwnerReference) string { return string(o.UID) })
3740
}
3841

42+
func indexSliceByWorkloadNamespace(obj client.Object) []string {
43+
if slice, ok := obj.(*v1alpha1.Slice); ok {
44+
if ns, found := slice.GetAnnotations()[core.OwnerWorkloadNamespaceAnnotation]; found {
45+
return []string{ns}
46+
}
47+
}
48+
return nil
49+
}
50+
51+
func indexSliceByWorkloadName(obj client.Object) []string {
52+
if slice, ok := obj.(*v1alpha1.Slice); ok {
53+
if name, found := slice.GetAnnotations()[core.OwnerWorkloadNameAnnotation]; found {
54+
return []string{name}
55+
}
56+
}
57+
return nil
58+
}
59+
3960
// SetupIndexer configures the indexer to index specific fields for kueue.Workload and v1alpha1.Slice resources.
4061
func SetupIndexer(ctx context.Context, indexer client.FieldIndexer) error {
4162
if err := indexer.IndexField(ctx, &kueue.Workload{}, OwnerReferenceUID, indexOwnerReferenceUID); err != nil {
4263
return fmt.Errorf("setting index on ownerReferences.uid for Workload: %w", err)
4364
}
44-
if err := indexer.IndexField(ctx, &v1alpha1.Slice{}, OwnerReferenceUID, indexOwnerReferenceUID); err != nil {
45-
return fmt.Errorf("setting index on ownerReferences.uid for Slice: %w", err)
65+
// Since Slice is now cluster-scoped, it cannot have a controller owner reference to a namespaced Workload.
66+
// We use annotations for linking Slices to Workloads.
67+
if err := indexer.IndexField(ctx, &v1alpha1.Slice{}, WorkloadNamespaceIndex, indexSliceByWorkloadNamespace); err != nil {
68+
return fmt.Errorf("setting index on workload namespace for Slice: %w", err)
69+
}
70+
if err := indexer.IndexField(ctx, &v1alpha1.Slice{}, WorkloadNameIndex, indexSliceByWorkloadName); err != nil {
71+
return fmt.Errorf("setting index on workload name for Slice: %w", err)
4672
}
4773
return nil
4874
}

slice/internal/controller/workload_controller.go

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -282,8 +282,10 @@ func (r *WorkloadReconciler) cleanupSlices(ctx context.Context, wl *kueue.Worklo
282282
func (r *WorkloadReconciler) findWorkloadSlices(ctx context.Context, wl *kueue.Workload) ([]v1alpha1.Slice, error) {
283283
slices := &v1alpha1.SliceList{}
284284
opts := []client.ListOption{
285-
client.InNamespace(wl.Namespace),
286-
client.MatchingFields{OwnerReferenceUID: string(wl.UID)},
285+
client.MatchingFields{
286+
WorkloadNamespaceIndex: wl.Namespace,
287+
WorkloadNameIndex: wl.Name,
288+
},
287289
}
288290
if err := r.client.List(ctx, slices, opts...); err != nil {
289291
return nil, err
@@ -463,7 +465,7 @@ func (r *WorkloadReconciler) syncSlices(
463465
continue
464466
}
465467

466-
sliceName := core.SliceName(wl.Name, psa.Name)
468+
sliceName := core.SliceName(wl.Namespace, wl.Name, psa.Name)
467469

468470
if _, exist := slicesByName[sliceName]; exist {
469471
// Slice already exists, nothing to do.
@@ -511,18 +513,17 @@ func (r *WorkloadReconciler) createSlice(ctx context.Context, wl *kueue.Workload
511513
slice := core.SliceWithMetadata(wl, psa.Name)
512514
log := ctrl.LoggerFrom(ctx).WithValues("slice", klog.KObj(slice))
513515
log.V(3).Info("Creating Slice")
514-
515-
if err := controllerutil.SetControllerReference(wl, slice, r.client.Scheme()); err != nil {
516-
return nil, err
517-
}
516+
// Since Slice is a cluster-scoped object and Workload is namespaced,
517+
// we cannot set a controller owner reference. The Workload's namespace and name
518+
// are stored as annotations on the Slice for lookup.
518519
parseTopologyAssignmentIntoPartitionIds(slice, psa.TopologyAssignment, nodes)
519520

520521
ps := podset.FindPodSetByName(wl.Spec.PodSets, psa.Name)
521522
slice.Spec.Type = v1alpha1.Type(core.GetTPUAccelerator(ps.Template))
522523
slice.Spec.Topology = core.GetTPUTopology(ps.Template)
523524

524525
if err := r.client.Create(ctx, slice); err != nil {
525-
msg := fmt.Sprintf("Error creating Slice %q: %v", client.ObjectKeyFromObject(slice), err)
526+
msg := fmt.Sprintf("Error creating Slice %q: %v", slice.Name, err)
526527
log.Error(err, msg)
527528
r.record.Event(wl, corev1.EventTypeWarning, FailedCreateSliceEventType, api.TruncateEventMessage(msg))
528529
ac.State = kueue.CheckStatePending
@@ -547,7 +548,7 @@ func (r *WorkloadReconciler) updateWorkloadAdmissionCheckStatus(ctx context.Cont
547548
func buildCreationEventMessage(slices []v1alpha1.Slice) string {
548549
sliceNames := make([]string, len(slices))
549550
for index, slice := range slices {
550-
sliceNames[index] = fmt.Sprintf("%q", client.ObjectKeyFromObject(&slice))
551+
sliceNames[index] = fmt.Sprintf("%q", slice.Name)
551552
}
552553
sort.Strings(sliceNames)
553554
return fmt.Sprintf("The Slices %s have been created", strings.Join(sliceNames, ", "))
@@ -672,18 +673,20 @@ func (h *sliceHandler) handleEvent(ctx context.Context, obj client.Object, q wor
672673

673674
log := ctrl.LoggerFrom(ctx)
674675

675-
owner := metav1.GetControllerOf(slice)
676-
if owner == nil {
677-
log.V(3).Info("Owner not found")
676+
workloadNamespace, nsFound := slice.Annotations[core.OwnerWorkloadNamespaceAnnotation]
677+
workloadName, nameFound := slice.Annotations[core.OwnerWorkloadNameAnnotation]
678+
679+
if !nsFound || !nameFound {
680+
log.V(3).Info("Slice is missing workload owner annotations, skipping event handling", "slice", klog.KObj(slice))
678681
return
679682
}
680683

681-
log.V(3).Info("Handle Slice event", "workload", klog.KRef(slice.Namespace, slice.Name))
684+
log.V(3).Info("Handle Slice event", "workload", klog.KRef(workloadNamespace, workloadName))
682685

683686
req := reconcile.Request{
684687
NamespacedName: types.NamespacedName{
685-
Name: owner.Name,
686-
Namespace: slice.Namespace,
688+
Name: workloadName,
689+
Namespace: workloadNamespace,
687690
},
688691
}
689692

0 commit comments

Comments
 (0)