1 change: 1 addition & 0 deletions slice/api/v1alpha1/slice_types.go
@@ -53,6 +53,7 @@ type SliceStatus struct {
 
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
+// +kubebuilder:resource:scope=Cluster
 // +kubebuilder:printcolumn:name="Type",type=string,JSONPath=`.spec.type`
 // +kubebuilder:printcolumn:name="Topology",type=string,JSONPath=`.spec.topology`
 // +kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.conditions[0].type`
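A note on what the scope change means for clients (a hypothetical fragment, not part of the diff; c and ctx are an assumed controller-runtime client and context): cluster-scoped objects are keyed by name alone, so a Get takes an empty namespace.

// Hypothetical fragment: fetching a cluster-scoped Slice with a
// controller-runtime client; ObjectKey.Namespace stays empty.
var s v1alpha1.Slice
if err := c.Get(ctx, client.ObjectKey{Name: "my-slice"}, &s); err != nil {
    return err // e.g. check apierrors.IsNotFound(err) first
}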
@@ -12,7 +12,7 @@ spec:
     listKind: SliceList
     plural: slices
     singular: slice
-  scope: Namespaced
+  scope: Cluster
   versions:
   - additionalPrinterColumns:
     - jsonPath: .spec.type
32 changes: 29 additions & 3 deletions slice/internal/controller/indexer.go
@@ -25,24 +25,50 @@ import (
 	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
 
 	"tpu-slice-controller/api/v1alpha1"
+	"tpu-slice-controller/internal/core"
 	"tpu-slice-controller/internal/util/slices"
 )
 
 const (
-	OwnerReferenceUID = "metadata.ownerReferences.uid"
+	OwnerReferenceUID      = "metadata.ownerReferences.uid"
+	WorkloadNamespaceIndex = "workload.namespace"
+	WorkloadNameIndex      = "workload.name"
 )
 
 func indexOwnerReferenceUID(obj client.Object) []string {
 	return slices.Map(obj.GetOwnerReferences(), func(o *metav1.OwnerReference) string { return string(o.UID) })
 }
 
+func indexSliceByWorkloadNamespace(obj client.Object) []string {
+	if slice, ok := obj.(*v1alpha1.Slice); ok {
+		if ns, found := slice.GetAnnotations()[core.OwnerWorkloadNamespaceAnnotation]; found {
+			return []string{ns}
+		}
+	}
+	return nil
+}
+
+func indexSliceByWorkloadName(obj client.Object) []string {
+	if slice, ok := obj.(*v1alpha1.Slice); ok {
+		if name, found := slice.GetAnnotations()[core.OwnerWorkloadNameAnnotation]; found {
+			return []string{name}
+		}
+	}
+	return nil
+}
+
 // SetupIndexer configures the indexer to index specific fields for kueue.Workload and v1alpha1.Slice resources.
 func SetupIndexer(ctx context.Context, indexer client.FieldIndexer) error {
 	if err := indexer.IndexField(ctx, &kueue.Workload{}, OwnerReferenceUID, indexOwnerReferenceUID); err != nil {
 		return fmt.Errorf("setting index on ownerReferences.uid for Workload: %w", err)
 	}
-	if err := indexer.IndexField(ctx, &v1alpha1.Slice{}, OwnerReferenceUID, indexOwnerReferenceUID); err != nil {
-		return fmt.Errorf("setting index on ownerReferences.uid for Slice: %w", err)
+	// Since Slice is now cluster-scoped, it cannot have a controller owner reference to a namespaced Workload.
+	// We use annotations for linking Slices to Workloads.
+	if err := indexer.IndexField(ctx, &v1alpha1.Slice{}, WorkloadNamespaceIndex, indexSliceByWorkloadNamespace); err != nil {
+		return fmt.Errorf("setting index on workload namespace for Slice: %w", err)
 	}
+	if err := indexer.IndexField(ctx, &v1alpha1.Slice{}, WorkloadNameIndex, indexSliceByWorkloadName); err != nil {
+		return fmt.Errorf("setting index on workload name for Slice: %w", err)
+	}
 	return nil
 }
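A hypothetical wiring sketch (assumed names, not from this diff): the indexes have to be registered against the manager's field indexer before the manager starts, so that cache-backed List calls can use them via MatchingFields.

package main

import (
    "context"
    "log"

    ctrl "sigs.k8s.io/controller-runtime"

    "tpu-slice-controller/internal/controller"
)

func main() {
    // Register field indexes before Start so the cache can serve
    // MatchingFields-based List calls against them.
    // (Real code would also add the v1alpha1 and kueue types to the scheme.)
    mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{})
    if err != nil {
        log.Fatal(err)
    }
    if err := controller.SetupIndexer(context.Background(), mgr.GetFieldIndexer()); err != nil {
        log.Fatal(err)
    }
    if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
        log.Fatal(err)
    }
}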
27 changes: 15 additions & 12 deletions slice/internal/controller/workload_controller.go
@@ -291,8 +291,10 @@ func (r *WorkloadReconciler) cleanupSlices(ctx context.Context, wl *kueue.Worklo
 func (r *WorkloadReconciler) findWorkloadSlices(ctx context.Context, wl *kueue.Workload) ([]v1alpha1.Slice, error) {
 	slices := &v1alpha1.SliceList{}
 	opts := []client.ListOption{
-		client.InNamespace(wl.Namespace),
-		client.MatchingFields{OwnerReferenceUID: string(wl.UID)},
+		client.MatchingFields{
+			WorkloadNamespaceIndex: wl.Namespace,
+			WorkloadNameIndex:      wl.Name,
+		},
 	}
 	if err := r.client.List(ctx, slices, opts...); err != nil {
 		return nil, err
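For context, an equivalent spelled-out form of the two-field List above (an illustration of controller-runtime and apimachinery APIs; whether the cache can serve a multi-requirement field selector depends on the controller-runtime version, so treat this as an assumption): MatchingFields is a fields.Set whose requirements must all match, which is why both indexes are registered in SetupIndexer.

// Equivalent, more explicit construction of the same field selector.
// Requires "k8s.io/apimachinery/pkg/fields".
opts := []client.ListOption{
    client.MatchingFieldsSelector{
        Selector: fields.SelectorFromSet(fields.Set{
            WorkloadNamespaceIndex: wl.Namespace,
            WorkloadNameIndex:      wl.Name,
        }),
    },
}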
@@ -525,10 +527,9 @@ func (r *WorkloadReconciler) createSlice(ctx context.Context, wl *kueue.Workload
 	slice := core.SliceWithMetadata(wl, psa.Name)
 	log := ctrl.LoggerFrom(ctx).WithValues("slice", klog.KObj(slice))
 	log.V(3).Info("Creating Slice")
-
-	if err := controllerutil.SetControllerReference(wl, slice, r.client.Scheme()); err != nil {
-		return nil, err
-	}
+	// Since Slice is a cluster-scoped object and Workload is namespaced,
+	// we cannot set a controller owner reference. The Workload's namespace and name
+	// are stored as annotations on the Slice for lookup.
 	parseTopologyAssignmentIntoNodeSelector(slice, psa.TopologyAssignment, nodes)
 
 	ps := podset.FindPodSetByName(wl.Spec.PodSets, psa.Name)
@@ -701,18 +702,20 @@ func (h *sliceHandler) handleEvent(ctx context.Context, obj client.Object, q wor
 
 	log := ctrl.LoggerFrom(ctx)
 
-	owner := metav1.GetControllerOf(slice)
-	if owner == nil {
-		log.V(3).Info("Owner not found")
+	workloadNamespace, nsFound := slice.Annotations[core.OwnerWorkloadNamespaceAnnotation]
+	workloadName, nameFound := slice.Annotations[core.OwnerWorkloadNameAnnotation]
+
+	if !nsFound || !nameFound {
+		log.V(3).Info("Slice is missing workload owner annotations, skipping event handling", "slice", klog.KObj(slice))
 		return
 	}
 
-	log.V(3).Info("Handle Slice event", "workload", klog.KRef(slice.Namespace, slice.Name))
+	log.V(3).Info("Handle Slice event", "workload", klog.KRef(workloadNamespace, workloadName))
 
 	req := reconcile.Request{
 		NamespacedName: types.NamespacedName{
-			Name:      owner.Name,
-			Namespace: slice.Namespace,
+			Name:      workloadName,
+			Namespace: workloadNamespace,
 		},
 	}
 
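How sliceHandler is wired up is not shown in this diff; a hypothetical sketch, assuming it implements controller-runtime's handler.EventHandler and the controller uses the builder API:

// Hypothetical sketch: watch Slices so their events enqueue the owning
// Workload, resolved via the annotations instead of an owner reference.
return ctrl.NewControllerManagedBy(mgr).
    For(&kueue.Workload{}).
    Watches(&v1alpha1.Slice{}, &sliceHandler{}).
    Complete(r)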
5 changes: 5 additions & 0 deletions slice/internal/core/constants.go
@@ -27,3 +27,8 @@
 
 	AcceleratorTpu7x = "tpu-v7x"
 )
+
+const (
+	OwnerWorkloadNamespaceAnnotation = "slice.accelerator.gke.io/owner-workload-namespace"
+	OwnerWorkloadNameAnnotation      = "slice.accelerator.gke.io/owner-workload-name"
+)
7 changes: 5 additions & 2 deletions slice/internal/core/slice.go
@@ -33,8 +33,11 @@ func SliceKeyFromWorkload(wl *kueue.Workload, podSetName kueue.PodSetReference)
 func SliceWithMetadata(wl *kueue.Workload, podSetName kueue.PodSetReference) *v1alpha1.Slice {
 	return &v1alpha1.Slice{
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      SliceName(wl.Name, podSetName),
-			Namespace: wl.Namespace,
+			Name: SliceName(wl.Name, podSetName),
+			Annotations: map[string]string{
+				OwnerWorkloadNamespaceAnnotation: wl.Namespace,
+				OwnerWorkloadNameAnnotation:      wl.Name,
+			},
 		},
 	}
 }

Review thread on the Name line:

Collaborator: What if we have workloads with the same names and PodSet names in different namespaces? Should we include the namespace in the name?

Collaborator: Please also add a test case for this scenario.

Collaborator (author): I added a test, "should handle two JobSets with the same name in different namespaces", which passes without any additional changes. The thing is that each workload gets a different name, because it is built using a hash of the JobSet's UID (https://github.com/kubernetes-sigs/kueue/blob/36e330ef5aa07ec20157cd523805cd173dac488b/pkg/controller/jobframework/workload_names.go#L52). Do you think it's possible to construct a test in which two workloads have the same names?

Collaborator: Yes, what if the user creates a prebuilt workload?

Collaborator (author): Oh, I see. I think you are right. So I prepended the namespace to the slice name to fix the potential name collision. I added a unit test, "should create a slice for another workload with the same name but in a different namespace", that should cover this scenario. Please take a look.
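Following the thread's resolution, the namespace is prepended to the Slice name in a later revision; a hypothetical sketch of that shape (the actual SliceName change is not part of this hunk):

// Hypothetical: namespace-qualified naming so same-named Workloads in
// different namespaces yield distinct cluster-scoped Slice names.
// Real code would also have to respect object-name length limits.
func SliceName(namespace, workloadName string, podSetName kueue.PodSetReference) string {
    return fmt.Sprintf("%s-%s-%s", namespace, workloadName, podSetName)
}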