From efc98e8bf44ef522030cad3a48b2a67b8267d785 Mon Sep 17 00:00:00 2001 From: dberkov Date: Fri, 26 Jun 2026 11:15:24 -0700 Subject: [PATCH 1/2] feat: implement container readiness probes with custom HTTP endpoint validation --- .../internal/controlapi/workflow_pause.go | 1 + .../internal/controlapi/workflow_suspend.go | 1 + .../internal/controlapi/workload_spec.go | 18 ++ .../internal/controlapi/workload_spec_test.go | 44 ++++ .../controllers/actortemplate_controller.go | 27 +- .../actortemplate_controller_test.go | 84 ++++++ cmd/atelet/main.go | 24 +- cmd/atelet/main_test.go | 36 +++ cmd/ateom-gvisor/main.go | 11 + cmd/ateom-microvm/restore.go | 6 + cmd/ateom-microvm/run.go | 6 + demos/counter/counter-microvm.yaml.tmpl | 4 + demos/counter/counter.go | 20 +- demos/counter/counter.yaml.tmpl | 4 + docs/api-guide.md | 27 +- internal/proto/ateletpb/atelet.pb.go | 239 +++++++++++++----- internal/proto/ateletpb/atelet.proto | 15 ++ internal/proto/ateompb/ateom.pb.go | 205 +++++++++++---- internal/proto/ateompb/ateom.proto | 15 ++ internal/readyz/readyz.go | 159 ++++++++++++ internal/readyz/readyz_test.go | 201 +++++++++++++++ .../generated/ate.dev_actortemplates.yaml | 31 +++ pkg/api/v1alpha1/actortemplate_types.go | 36 +++ .../v1alpha1/actortemplate_validation_test.go | 85 +++++++ pkg/api/v1alpha1/zz_generated.deepcopy.go | 40 +++ 25 files changed, 1231 insertions(+), 108 deletions(-) create mode 100644 cmd/atecontroller/internal/controllers/actortemplate_controller_test.go create mode 100644 internal/readyz/readyz.go create mode 100644 internal/readyz/readyz_test.go diff --git a/cmd/ateapi/internal/controlapi/workflow_pause.go b/cmd/ateapi/internal/controlapi/workflow_pause.go index a7aaa41bc..3678c6254 100644 --- a/cmd/ateapi/internal/controlapi/workflow_pause.go +++ b/cmd/ateapi/internal/controlapi/workflow_pause.go @@ -137,6 +137,7 @@ func (s *CallAteletPauseStep) Execute(ctx context.Context, input *PauseInput, st Name: ctr.Name, Image: ctr.Image, Command: ctr.Command, + Readyz: toAteletReadyz(ctr.Readyz), } for _, env := range ctr.Env { var val string diff --git a/cmd/ateapi/internal/controlapi/workflow_suspend.go b/cmd/ateapi/internal/controlapi/workflow_suspend.go index 1c0bcec2d..73d8e34d7 100644 --- a/cmd/ateapi/internal/controlapi/workflow_suspend.go +++ b/cmd/ateapi/internal/controlapi/workflow_suspend.go @@ -139,6 +139,7 @@ func (s *CallAteletSuspendStep) Execute(ctx context.Context, input *SuspendInput Name: ctr.Name, Image: ctr.Image, Command: ctr.Command, + Readyz: toAteletReadyz(ctr.Readyz), } for _, env := range ctr.Env { var val string diff --git a/cmd/ateapi/internal/controlapi/workload_spec.go b/cmd/ateapi/internal/controlapi/workload_spec.go index 018fb444f..6986a46b1 100644 --- a/cmd/ateapi/internal/controlapi/workload_spec.go +++ b/cmd/ateapi/internal/controlapi/workload_spec.go @@ -47,6 +47,7 @@ func workloadSpecFromActorTemplate(ctx context.Context, kubeClient kubernetes.In Name: ctr.Name, Image: ctr.Image, Command: ctr.Command, + Readyz: toAteletReadyz(ctr.Readyz), } for _, env := range ctr.Env { ateletEnv, err := resolver.resolve(ctx, ctr.Name, env) @@ -63,6 +64,23 @@ func workloadSpecFromActorTemplate(ctx context.Context, kubeClient kubernetes.In return workloadSpec, nil } +// toAteletReadyz projects the CRD readyz field onto the ateletpb wire type. +// Returns nil when the source is nil so containers without a probe stay +// unchanged on the wire. +func toAteletReadyz(in *atev1alpha1.ContainerReadyz) *ateletpb.Readyz { + if in == nil { + return nil + } + out := &ateletpb.Readyz{} + if in.HTTPGet != nil { + out.HttpGet = &ateletpb.HTTPGetAction{ + Path: in.HTTPGet.Path, + Port: in.HTTPGet.Port, + } + } + return out +} + type envResolver struct { kubeClient kubernetes.Interface namespace string diff --git a/cmd/ateapi/internal/controlapi/workload_spec_test.go b/cmd/ateapi/internal/controlapi/workload_spec_test.go index 45ccdcbf4..595f80909 100644 --- a/cmd/ateapi/internal/controlapi/workload_spec_test.go +++ b/cmd/ateapi/internal/controlapi/workload_spec_test.go @@ -98,6 +98,50 @@ func TestWorkloadSpecFromActorTemplateResolvesValueFromEnv(t *testing.T) { } } +func TestWorkloadSpecFromActorTemplatePropagatesReadyz(t *testing.T) { + ctx := context.Background() + got, err := workloadSpecFromActorTemplate(ctx, fake.NewSimpleClientset(), nil, &atev1alpha1.ActorTemplate{ + ObjectMeta: metav1.ObjectMeta{Name: "tmpl-readyz", Namespace: "agent-ns"}, + Spec: atev1alpha1.ActorTemplateSpec{ + Containers: []atev1alpha1.Container{ + { + Name: "with-probe", + Image: "main", + Readyz: &atev1alpha1.ContainerReadyz{ + HTTPGet: &atev1alpha1.HTTPGetAction{Path: "/health", Port: 8080}, + }, + }, + { + Name: "without-probe", + Image: "side", + }, + }, + }, + }) + if err != nil { + t.Fatalf("workloadSpecFromActorTemplate failed: %v", err) + } + + want := &ateletpb.WorkloadSpec{ + Containers: []*ateletpb.Container{ + { + Name: "with-probe", + Image: "main", + Readyz: &ateletpb.Readyz{ + HttpGet: &ateletpb.HTTPGetAction{Path: "/health", Port: 8080}, + }, + }, + { + Name: "without-probe", + Image: "side", + }, + }, + } + if diff := cmp.Diff(want, got, protocmp.Transform()); diff != "" { + t.Errorf("WorkloadSpec mismatch (-want +got):\n%s", diff) + } +} + func TestWorkloadSpecFromActorTemplateOptionalSecretKeyRefSkipsMissingSecret(t *testing.T) { optional := true got, err := workloadSpecFromActorTemplate(context.Background(), fake.NewSimpleClientset(), nil, &atev1alpha1.ActorTemplate{ diff --git a/cmd/atecontroller/internal/controllers/actortemplate_controller.go b/cmd/atecontroller/internal/controllers/actortemplate_controller.go index 372890116..f7a22c5a9 100644 --- a/cmd/atecontroller/internal/controllers/actortemplate_controller.go +++ b/cmd/atecontroller/internal/controllers/actortemplate_controller.go @@ -32,6 +32,14 @@ import ( const ( GoldenSnapshotCreationReason = "GoldenSnapshotCreation" + + // goldenSnapshotWarmup is the default wall-clock delay between resuming + // the golden actor and taking its snapshot, used as a coarse "give the + // workload time to finish initializing" fallback for templates without + // a readiness probe. Templates whose containers all declare readyz skip + // this wait — ResumeActor only returns once readyz reports 200, so the + // workload is already initialized by the time we get here. + goldenSnapshotWarmup = 20 * time.Second ) type ActorTemplateReconciler struct { @@ -109,7 +117,7 @@ func (r *ActorTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Reques } at.Status.Phase = atev1alpha1.PhaseWaitGoldenActor - at.Status.TakeGoldenSnapshotAt = metav1.NewTime(time.Now().Add(20 * time.Second)) + at.Status.TakeGoldenSnapshotAt = metav1.NewTime(time.Now().Add(goldenSnapshotWarmupFor(at))) if err := r.Status().Update(ctx, at); err != nil { return ctrl.Result{}, err } @@ -163,3 +171,20 @@ func (r *ActorTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Reques func (r *ActorTemplateReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr).For(&atev1alpha1.ActorTemplate{}).Complete(r) } + +// goldenSnapshotWarmupFor returns 0 when every container in the template has +// a readyz probe (so ResumeActor already blocked until the workload reported +// 200), and the default warmup otherwise. A template with no containers +// keeps the default — there is nothing to gate on. +func goldenSnapshotWarmupFor(at *atev1alpha1.ActorTemplate) time.Duration { + containers := at.Spec.Containers + if len(containers) == 0 { + return goldenSnapshotWarmup + } + for i := range containers { + if containers[i].Readyz == nil { + return goldenSnapshotWarmup + } + } + return 0 +} diff --git a/cmd/atecontroller/internal/controllers/actortemplate_controller_test.go b/cmd/atecontroller/internal/controllers/actortemplate_controller_test.go new file mode 100644 index 000000000..fa70af860 --- /dev/null +++ b/cmd/atecontroller/internal/controllers/actortemplate_controller_test.go @@ -0,0 +1,84 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package controllers + +import ( + "testing" + + atev1alpha1 "github.com/agent-substrate/substrate/pkg/api/v1alpha1" +) + +func TestGoldenSnapshotWarmupFor(t *testing.T) { + probe := &atev1alpha1.ContainerReadyz{ + HTTPGet: &atev1alpha1.HTTPGetAction{Port: 80}, + } + + tests := []struct { + name string + containers []atev1alpha1.Container + wantZero bool + }{ + { + name: "no containers keeps default warmup", + containers: nil, + wantZero: false, + }, + { + name: "all containers have readyz skips warmup", + containers: []atev1alpha1.Container{ + {Name: "a", Readyz: probe}, + {Name: "b", Readyz: probe}, + }, + wantZero: true, + }, + { + name: "single container with readyz skips warmup", + containers: []atev1alpha1.Container{ + {Name: "a", Readyz: probe}, + }, + wantZero: true, + }, + { + name: "mixed containers keep warmup", + containers: []atev1alpha1.Container{ + {Name: "a", Readyz: probe}, + {Name: "b"}, + }, + wantZero: false, + }, + { + name: "no readyz anywhere keeps warmup", + containers: []atev1alpha1.Container{ + {Name: "a"}, + {Name: "b"}, + }, + wantZero: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + at := &atev1alpha1.ActorTemplate{ + Spec: atev1alpha1.ActorTemplateSpec{Containers: tt.containers}, + } + got := goldenSnapshotWarmupFor(at) + if tt.wantZero && got != 0 { + t.Errorf("goldenSnapshotWarmupFor = %v, want 0", got) + } + if !tt.wantZero && got != goldenSnapshotWarmup { + t.Errorf("goldenSnapshotWarmupFor = %v, want %v", got, goldenSnapshotWarmup) + } + }) + } +} diff --git a/cmd/atelet/main.go b/cmd/atelet/main.go index 50802d7d7..9a8f88cda 100644 --- a/cmd/atelet/main.go +++ b/cmd/atelet/main.go @@ -705,11 +705,31 @@ func (s *AteomHerder) dialAteom(ctx context.Context, targetAteomUid string) (ate } // buildAteomWorkloadSpec projects the atelet-facing workload spec onto -// the ateom-facing one — currently just the container names. +// the ateom-facing one. func buildAteomWorkloadSpec(spec *ateletpb.WorkloadSpec) *ateompb.WorkloadSpec { out := &ateompb.WorkloadSpec{} for _, ctr := range spec.GetContainers() { - out.Containers = append(out.Containers, &ateompb.Container{Name: ctr.GetName()}) + out.Containers = append(out.Containers, &ateompb.Container{ + Name: ctr.GetName(), + Readyz: toAteomReadyz(ctr.GetReadyz()), + }) + } + return out +} + +// toAteomReadyz converts an ateletpb readyz probe into the ateompb wire +// type. Returns nil when the source is nil so containers without a probe +// stay unchanged on the wire to ateom. +func toAteomReadyz(in *ateletpb.Readyz) *ateompb.Readyz { + if in == nil { + return nil + } + out := &ateompb.Readyz{} + if hg := in.GetHttpGet(); hg != nil { + out.HttpGet = &ateompb.HTTPGetAction{ + Path: hg.GetPath(), + Port: hg.GetPort(), + } } return out } diff --git a/cmd/atelet/main_test.go b/cmd/atelet/main_test.go index 6be78fa51..cf5316018 100644 --- a/cmd/atelet/main_test.go +++ b/cmd/atelet/main_test.go @@ -28,8 +28,11 @@ import ( "github.com/agent-substrate/substrate/internal/ateompath" "github.com/agent-substrate/substrate/internal/proto/ateletpb" + "github.com/agent-substrate/substrate/internal/proto/ateompb" + "github.com/google/go-cmp/cmp" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" + "google.golang.org/protobuf/testing/protocmp" ) func TestWriteFileAtomic(t *testing.T) { @@ -377,3 +380,36 @@ func TestRPCBoundariesReject(t *testing.T) { wantInvalidArgument(t, "Restore", err) }) } + +func TestBuildAteomWorkloadSpecForwardsReadyz(t *testing.T) { + in := &ateletpb.WorkloadSpec{ + PauseImage: "pause", + Containers: []*ateletpb.Container{ + { + Name: "with-probe", + Image: "main", + Readyz: &ateletpb.Readyz{ + HttpGet: &ateletpb.HTTPGetAction{Path: "/health", Port: 8080}, + }, + }, + { + Name: "without-probe", + }, + }, + } + want := &ateompb.WorkloadSpec{ + Containers: []*ateompb.Container{ + { + Name: "with-probe", + Readyz: &ateompb.Readyz{ + HttpGet: &ateompb.HTTPGetAction{Path: "/health", Port: 8080}, + }, + }, + {Name: "without-probe"}, + }, + } + got := buildAteomWorkloadSpec(in) + if diff := cmp.Diff(want, got, protocmp.Transform()); diff != "" { + t.Errorf("buildAteomWorkloadSpec mismatch (-want +got):\n%s", diff) + } +} diff --git a/cmd/ateom-gvisor/main.go b/cmd/ateom-gvisor/main.go index 7e543c7ac..c9b2bae90 100644 --- a/cmd/ateom-gvisor/main.go +++ b/cmd/ateom-gvisor/main.go @@ -33,6 +33,7 @@ import ( "github.com/agent-substrate/substrate/internal/ateompath" "github.com/agent-substrate/substrate/internal/contextlogging" "github.com/agent-substrate/substrate/internal/proto/ateompb" + "github.com/agent-substrate/substrate/internal/readyz" "github.com/agent-substrate/substrate/internal/serverboot" "github.com/agent-substrate/substrate/internal/version" "github.com/google/nftables" @@ -227,6 +228,11 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload } } + // Block until every readyz-enabled container reports 200. + if err := readyz.WaitAll(ctx, req.GetSpec().GetContainers(), actorVethIP); err != nil { + return nil, fmt.Errorf("while waiting for container readyz: %w", err) + } + s.actorLogger.EmitLifecycleLog("Actor started", req.GetActorId(), req.GetActorTemplateName(), req.GetActorTemplateNamespace()) return &ateompb.RunWorkloadResponse{}, nil @@ -383,6 +389,11 @@ func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.Restore } } + // Block until every readyz-enabled container reports 200. + if err := readyz.WaitAll(ctx, req.GetSpec().GetContainers(), actorVethIP); err != nil { + return nil, fmt.Errorf("while waiting for container readyz: %w", err) + } + s.actorLogger.EmitLifecycleLog("Actor restored", req.GetActorId(), req.GetActorTemplateName(), req.GetActorTemplateNamespace()) return &ateompb.RestoreWorkloadResponse{}, nil diff --git a/cmd/ateom-microvm/restore.go b/cmd/ateom-microvm/restore.go index 1c7c82b84..0e38319c9 100644 --- a/cmd/ateom-microvm/restore.go +++ b/cmd/ateom-microvm/restore.go @@ -30,6 +30,7 @@ import ( "github.com/agent-substrate/substrate/cmd/ateom-microvm/internal/kata" "github.com/agent-substrate/substrate/internal/ateompath" "github.com/agent-substrate/substrate/internal/proto/ateompb" + "github.com/agent-substrate/substrate/internal/readyz" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" ) @@ -184,6 +185,11 @@ func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.Restore return nil, fmt.Errorf("while resuming restored guest: %w", err) } + // Block until every readyz-enabled container reports 200. + if err := readyz.WaitAll(ctx, containers, actorVethIP); err != nil { + return nil, fmt.Errorf("while waiting for container readyz: %w", err) + } + ra := &runningActor{chCmd: chCmd, apiSocket: apiSocket, baseID: srcID, restoreSourceDir: restoreDir} // Re-attach stdout/stderr forwarding: the restored guest's container + kata-agent diff --git a/cmd/ateom-microvm/run.go b/cmd/ateom-microvm/run.go index 2acdd57bd..92d04cffd 100644 --- a/cmd/ateom-microvm/run.go +++ b/cmd/ateom-microvm/run.go @@ -31,6 +31,7 @@ import ( "github.com/agent-substrate/substrate/cmd/ateom-microvm/internal/third_party/kata/agentpb" "github.com/agent-substrate/substrate/internal/ateompath" "github.com/agent-substrate/substrate/internal/proto/ateompb" + "github.com/agent-substrate/substrate/internal/readyz" specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "google.golang.org/grpc/codes" @@ -340,6 +341,11 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload return nil, err } + // Block until every readyz-enabled container reports 200. + if err := readyz.WaitAll(ctx, containers, actorVethIP); err != nil { + return nil, fmt.Errorf("while waiting for container readyz: %w", err) + } + ra := &runningActor{chCmd: chCmd, apiSocket: apiSocket, containerName: containerName, baseID: id, logAgent: ac} s.running[id] = ra diff --git a/demos/counter/counter-microvm.yaml.tmpl b/demos/counter/counter-microvm.yaml.tmpl index e9f4e31e2..2a6575333 100644 --- a/demos/counter/counter-microvm.yaml.tmpl +++ b/demos/counter/counter-microvm.yaml.tmpl @@ -99,6 +99,10 @@ spec: - name: counter image: ko://github.com/agent-substrate/substrate/demos/counter command: ["/ko-app/counter"] + readyz: + httpGet: + path: /readyz + port: 80 workerSelector: matchLabels: workload: counter-microvm diff --git a/demos/counter/counter.go b/demos/counter/counter.go index 6b99d5db2..de85661cd 100644 --- a/demos/counter/counter.go +++ b/demos/counter/counter.go @@ -33,7 +33,10 @@ import ( "github.com/spf13/pflag" ) -var requestCount uint64 +var ( + requestCount uint64 + ready atomic.Bool +) func main() { pflag.Parse() @@ -51,6 +54,18 @@ func main() { w.WriteHeader(http.StatusOK) w.Write([]byte(response)) }) + // /readyz is the endpoint the ateom-gvisor readyz probe polls. It returns + // 200 only once initialization (the random-file write) has completed. + // After a checkpoint+restore the atomic flag is part of the snapshot, so + // the endpoint returns 200 immediately on resume. + defaultMux.HandleFunc("/readyz", func(w http.ResponseWriter, _ *http.Request) { + if !ready.Load() { + http.Error(w, "not ready", http.StatusServiceUnavailable) + return + } + w.WriteHeader(http.StatusOK) + w.Write([]byte("ok\n")) + }) go func() { slog.InfoContext(ctx, "Starting counter server on port 80") @@ -68,6 +83,9 @@ func main() { slog.InfoContext(ctx, "Wrote content to random file", slog.String("fshash", hashRandomFile())) } + ready.Store(true) + slog.InfoContext(ctx, "Readyz now reports OK") + count := 0 slog.InfoContext(ctx, "Count", slog.Int("count", count), slog.String("fshash", hashRandomFile())) count++ diff --git a/demos/counter/counter.yaml.tmpl b/demos/counter/counter.yaml.tmpl index 7ff8ae594..652245b06 100644 --- a/demos/counter/counter.yaml.tmpl +++ b/demos/counter/counter.yaml.tmpl @@ -43,6 +43,10 @@ spec: - name: counter image: ko://github.com/agent-substrate/substrate/demos/counter command: ["/ko-app/counter"] + readyz: + httpGet: + path: /readyz + port: 80 workerSelector: matchLabels: workload: counter diff --git a/docs/api-guide.md b/docs/api-guide.md index b4fbe6fca..30ef4ab0f 100644 --- a/docs/api-guide.md +++ b/docs/api-guide.md @@ -88,7 +88,7 @@ The `ActorTemplate` defines the code, environment, and state-management policies | Field | Type | Description | | :--- | :--- | :--- | -| `containers` | `[]Container` | **Required.** The workload definition (image, command, env, ports). | +| `containers` | `[]Container` | **Required.** The workload definition (image, command, env, ports). Each container may also declare an optional `readyz` HTTP probe — see [Container Readiness Probe](#container-readiness-probe-readyz). | | `sandboxClass` | `string` | Optional. The sandbox runtime family this template's actors require: `gvisor` (default) or `microvm`. Only `WorkerPool`s whose `sandboxClass` matches are eligible. | | `workerSelector` | `*LabelSelector` | Optional. Gates which `WorkerPool`s actors from this template may use, by matching against each pool's labels. If unset, all pools are eligible (subject to the actor's own `worker_selector`). | | `snapshotsConfig` | `SnapshotsConfig` | **Required.** GCS bucket and folder where memory snapshots are stored. | @@ -110,6 +110,25 @@ Substrate bind-mounts a read-only, per-actor identity directory at **`/run/ate`* Read it fresh rather than caching it at process start. It is delivered as a per-actor bind mount, not an environment variable, precisely so it carries the correct ID after a resume from the golden snapshot — an env var (or a file baked into the image) would be frozen at the *golden* actor's ID, since it lives in the checkpointed process memory, and would therefore be identical for every actor of the template. +### Container Readiness Probe (`readyz`) + +Each entry in `containers` may declare an optional **HTTP readiness probe** so the platform only treats the actor as "started" once the workload is actually serving traffic. This mirrors the role of `readinessProbe.httpGet` on a Kubernetes Pod container, but the gate is enforced inside ateom (the in-pod sandbox driver) rather than by the kubelet. + +| Field | Type | Description | +| :--- | :--- | :--- | +| `readyz.httpGet.path` | `string` | Optional. URL path to GET. Defaults to `/readyz`. Must begin with `/` and contain only RFC 3986 path characters (no query string `?` or fragment `#`). | +| `readyz.httpGet.port` | `int32` | **Required.** TCP port on the container to probe (`1..65535`). | + +How it behaves: + +- **Where the probe runs.** ateom (gVisor or microvm) reaches the container at the actor's interior IP (`169.254.17.2` today) — one network hop, no DNS, no router involved. +- **Block-until-ready semantics.** `RunWorkload` (cold start) and `RestoreWorkload` (resume from snapshot) only return successfully after every container with a `readyz` block returns HTTP 200. A failure surfaces as a Run/Restore error and is retried by the control plane; the overall wait is bounded by an internal 30s deadline. +- **Aggressive polling.** The poll loop is tuned for single-millisecond detection latency: a keep-alive HTTP client with a ~500µs interval and 250ms per-request timeout. While the workload is still booting, kernel `RST`s return in microseconds, so the loop spends almost no time blocked; once the listener is up, the next attempt completes on veth-local latency. +- **Golden snapshot warm-up shortcut.** When **every** container in a template declares `readyz`, the actor template controller skips its default ~20s "give the workload time to settle" delay before taking the golden snapshot — `ResumeActor` already blocked until the workload reported 200, so the workload is known to be initialized. Templates that omit `readyz` on any container keep the 20s warm-up as a safety net. +- **Snapshot/restore interaction.** The TCP listener is part of the checkpointed RAM, so on resume `readyz` typically returns 200 on the first attempt, with no observable latency penalty. + +If `readyz` is omitted from a container, the prior "started == ready" behavior is preserved — the platform considers the container ready as soon as `runsc start` / `vm.boot` returns. + ### Example ```yaml @@ -128,6 +147,12 @@ spec: command: ["/app/server"] ports: - containerPort: 80 + # Optional: gate Run/Restore on the agent's HTTP readiness endpoint. + # See "Container Readiness Probe (readyz)" above. + readyz: + httpGet: + path: /readyz + port: 80 # sandboxClass defaults to gvisor; set to microvm to require micro-VM pools. sandboxClass: gvisor workerSelector: diff --git a/internal/proto/ateletpb/atelet.pb.go b/internal/proto/ateletpb/atelet.pb.go index 03b0fe750..005a54444 100644 --- a/internal/proto/ateletpb/atelet.pb.go +++ b/internal/proto/ateletpb/atelet.pb.go @@ -391,6 +391,7 @@ type Container struct { Image string `protobuf:"bytes,2,opt,name=image,proto3" json:"image,omitempty"` Command []string `protobuf:"bytes,3,rep,name=command,proto3" json:"command,omitempty"` Env []*EnvEntry `protobuf:"bytes,4,rep,name=env,proto3" json:"env,omitempty"` + Readyz *Readyz `protobuf:"bytes,5,opt,name=readyz,proto3" json:"readyz,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -453,6 +454,13 @@ func (x *Container) GetEnv() []*EnvEntry { return nil } +func (x *Container) GetReadyz() *Readyz { + if x != nil { + return x.Readyz + } + return nil +} + type EnvEntry struct { state protoimpl.MessageState `protogen:"open.v1"` Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` @@ -505,6 +513,107 @@ func (x *EnvEntry) GetValue() string { return "" } +// Readyz describes how to check that a container is ready to serve. +// Only HTTP is supported today. +type Readyz struct { + state protoimpl.MessageState `protogen:"open.v1"` + HttpGet *HTTPGetAction `protobuf:"bytes,1,opt,name=http_get,json=httpGet,proto3" json:"http_get,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *Readyz) Reset() { + *x = Readyz{} + mi := &file_atelet_proto_msgTypes[7] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *Readyz) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*Readyz) ProtoMessage() {} + +func (x *Readyz) ProtoReflect() protoreflect.Message { + mi := &file_atelet_proto_msgTypes[7] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use Readyz.ProtoReflect.Descriptor instead. +func (*Readyz) Descriptor() ([]byte, []int) { + return file_atelet_proto_rawDescGZIP(), []int{7} +} + +func (x *Readyz) GetHttpGet() *HTTPGetAction { + if x != nil { + return x.HttpGet + } + return nil +} + +// HTTPGetAction performs an HTTP GET against the container. +type HTTPGetAction struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Path to access on the HTTP server. Empty means "/readyz". + Path string `protobuf:"bytes,1,opt,name=path,proto3" json:"path,omitempty"` + // TCP port to connect to (1..65535). + Port int32 `protobuf:"varint,2,opt,name=port,proto3" json:"port,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *HTTPGetAction) Reset() { + *x = HTTPGetAction{} + mi := &file_atelet_proto_msgTypes[8] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *HTTPGetAction) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*HTTPGetAction) ProtoMessage() {} + +func (x *HTTPGetAction) ProtoReflect() protoreflect.Message { + mi := &file_atelet_proto_msgTypes[8] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use HTTPGetAction.ProtoReflect.Descriptor instead. +func (*HTTPGetAction) Descriptor() ([]byte, []int) { + return file_atelet_proto_rawDescGZIP(), []int{8} +} + +func (x *HTTPGetAction) GetPath() string { + if x != nil { + return x.Path + } + return "" +} + +func (x *HTTPGetAction) GetPort() int32 { + if x != nil { + return x.Port + } + return 0 +} + type RunResponse struct { state protoimpl.MessageState `protogen:"open.v1"` unknownFields protoimpl.UnknownFields @@ -513,7 +622,7 @@ type RunResponse struct { func (x *RunResponse) Reset() { *x = RunResponse{} - mi := &file_atelet_proto_msgTypes[7] + mi := &file_atelet_proto_msgTypes[9] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -525,7 +634,7 @@ func (x *RunResponse) String() string { func (*RunResponse) ProtoMessage() {} func (x *RunResponse) ProtoReflect() protoreflect.Message { - mi := &file_atelet_proto_msgTypes[7] + mi := &file_atelet_proto_msgTypes[9] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -538,7 +647,7 @@ func (x *RunResponse) ProtoReflect() protoreflect.Message { // Deprecated: Use RunResponse.ProtoReflect.Descriptor instead. func (*RunResponse) Descriptor() ([]byte, []int) { - return file_atelet_proto_rawDescGZIP(), []int{7} + return file_atelet_proto_rawDescGZIP(), []int{9} } type LocalCheckpointConfiguration struct { @@ -552,7 +661,7 @@ type LocalCheckpointConfiguration struct { func (x *LocalCheckpointConfiguration) Reset() { *x = LocalCheckpointConfiguration{} - mi := &file_atelet_proto_msgTypes[8] + mi := &file_atelet_proto_msgTypes[10] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -564,7 +673,7 @@ func (x *LocalCheckpointConfiguration) String() string { func (*LocalCheckpointConfiguration) ProtoMessage() {} func (x *LocalCheckpointConfiguration) ProtoReflect() protoreflect.Message { - mi := &file_atelet_proto_msgTypes[8] + mi := &file_atelet_proto_msgTypes[10] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -577,7 +686,7 @@ func (x *LocalCheckpointConfiguration) ProtoReflect() protoreflect.Message { // Deprecated: Use LocalCheckpointConfiguration.ProtoReflect.Descriptor instead. func (*LocalCheckpointConfiguration) Descriptor() ([]byte, []int) { - return file_atelet_proto_rawDescGZIP(), []int{8} + return file_atelet_proto_rawDescGZIP(), []int{10} } func (x *LocalCheckpointConfiguration) GetSnapshotPrefix() string { @@ -606,7 +715,7 @@ type ExternalCheckpointConfiguration struct { func (x *ExternalCheckpointConfiguration) Reset() { *x = ExternalCheckpointConfiguration{} - mi := &file_atelet_proto_msgTypes[9] + mi := &file_atelet_proto_msgTypes[11] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -618,7 +727,7 @@ func (x *ExternalCheckpointConfiguration) String() string { func (*ExternalCheckpointConfiguration) ProtoMessage() {} func (x *ExternalCheckpointConfiguration) ProtoReflect() protoreflect.Message { - mi := &file_atelet_proto_msgTypes[9] + mi := &file_atelet_proto_msgTypes[11] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -631,7 +740,7 @@ func (x *ExternalCheckpointConfiguration) ProtoReflect() protoreflect.Message { // Deprecated: Use ExternalCheckpointConfiguration.ProtoReflect.Descriptor instead. func (*ExternalCheckpointConfiguration) Descriptor() ([]byte, []int) { - return file_atelet_proto_rawDescGZIP(), []int{9} + return file_atelet_proto_rawDescGZIP(), []int{11} } func (x *ExternalCheckpointConfiguration) GetSnapshotUriPrefix() string { @@ -662,7 +771,7 @@ type CheckpointRequest struct { func (x *CheckpointRequest) Reset() { *x = CheckpointRequest{} - mi := &file_atelet_proto_msgTypes[10] + mi := &file_atelet_proto_msgTypes[12] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -674,7 +783,7 @@ func (x *CheckpointRequest) String() string { func (*CheckpointRequest) ProtoMessage() {} func (x *CheckpointRequest) ProtoReflect() protoreflect.Message { - mi := &file_atelet_proto_msgTypes[10] + mi := &file_atelet_proto_msgTypes[12] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -687,7 +796,7 @@ func (x *CheckpointRequest) ProtoReflect() protoreflect.Message { // Deprecated: Use CheckpointRequest.ProtoReflect.Descriptor instead. func (*CheckpointRequest) Descriptor() ([]byte, []int) { - return file_atelet_proto_rawDescGZIP(), []int{10} + return file_atelet_proto_rawDescGZIP(), []int{12} } func (x *CheckpointRequest) GetTargetAteomUid() string { @@ -781,7 +890,7 @@ type CheckpointResponse struct { func (x *CheckpointResponse) Reset() { *x = CheckpointResponse{} - mi := &file_atelet_proto_msgTypes[11] + mi := &file_atelet_proto_msgTypes[13] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -793,7 +902,7 @@ func (x *CheckpointResponse) String() string { func (*CheckpointResponse) ProtoMessage() {} func (x *CheckpointResponse) ProtoReflect() protoreflect.Message { - mi := &file_atelet_proto_msgTypes[11] + mi := &file_atelet_proto_msgTypes[13] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -806,7 +915,7 @@ func (x *CheckpointResponse) ProtoReflect() protoreflect.Message { // Deprecated: Use CheckpointResponse.ProtoReflect.Descriptor instead. func (*CheckpointResponse) Descriptor() ([]byte, []int) { - return file_atelet_proto_rawDescGZIP(), []int{11} + return file_atelet_proto_rawDescGZIP(), []int{13} } type RestoreRequest struct { @@ -830,7 +939,7 @@ type RestoreRequest struct { func (x *RestoreRequest) Reset() { *x = RestoreRequest{} - mi := &file_atelet_proto_msgTypes[12] + mi := &file_atelet_proto_msgTypes[14] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -842,7 +951,7 @@ func (x *RestoreRequest) String() string { func (*RestoreRequest) ProtoMessage() {} func (x *RestoreRequest) ProtoReflect() protoreflect.Message { - mi := &file_atelet_proto_msgTypes[12] + mi := &file_atelet_proto_msgTypes[14] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -855,7 +964,7 @@ func (x *RestoreRequest) ProtoReflect() protoreflect.Message { // Deprecated: Use RestoreRequest.ProtoReflect.Descriptor instead. func (*RestoreRequest) Descriptor() ([]byte, []int) { - return file_atelet_proto_rawDescGZIP(), []int{12} + return file_atelet_proto_rawDescGZIP(), []int{14} } func (x *RestoreRequest) GetTargetAteomUid() string { @@ -949,7 +1058,7 @@ type RestoreResponse struct { func (x *RestoreResponse) Reset() { *x = RestoreResponse{} - mi := &file_atelet_proto_msgTypes[13] + mi := &file_atelet_proto_msgTypes[15] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -961,7 +1070,7 @@ func (x *RestoreResponse) String() string { func (*RestoreResponse) ProtoMessage() {} func (x *RestoreResponse) ProtoReflect() protoreflect.Message { - mi := &file_atelet_proto_msgTypes[13] + mi := &file_atelet_proto_msgTypes[15] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -974,7 +1083,7 @@ func (x *RestoreResponse) ProtoReflect() protoreflect.Message { // Deprecated: Use RestoreResponse.ProtoReflect.Descriptor instead. func (*RestoreResponse) Descriptor() ([]byte, []int) { - return file_atelet_proto_rawDescGZIP(), []int{13} + return file_atelet_proto_rawDescGZIP(), []int{15} } var File_atelet_proto protoreflect.FileDescriptor @@ -1011,15 +1120,21 @@ const file_atelet_proto_rawDesc = "" + "containers\x18\x01 \x03(\v2\x11.atelet.ContainerR\n" + "containers\x12\x1f\n" + "\vpause_image\x18\x02 \x01(\tR\n" + - "pauseImage\"s\n" + + "pauseImage\"\x9b\x01\n" + "\tContainer\x12\x12\n" + "\x04name\x18\x01 \x01(\tR\x04name\x12\x14\n" + "\x05image\x18\x02 \x01(\tR\x05image\x12\x18\n" + "\acommand\x18\x03 \x03(\tR\acommand\x12\"\n" + - "\x03env\x18\x04 \x03(\v2\x10.atelet.EnvEntryR\x03env\"4\n" + + "\x03env\x18\x04 \x03(\v2\x10.atelet.EnvEntryR\x03env\x12&\n" + + "\x06readyz\x18\x05 \x01(\v2\x0e.atelet.ReadyzR\x06readyz\"4\n" + "\bEnvEntry\x12\x12\n" + "\x04name\x18\x01 \x01(\tR\x04name\x12\x14\n" + - "\x05value\x18\x02 \x01(\tR\x05value\"\r\n" + + "\x05value\x18\x02 \x01(\tR\x05value\":\n" + + "\x06Readyz\x120\n" + + "\bhttp_get\x18\x01 \x01(\v2\x15.atelet.HTTPGetActionR\ahttpGet\"7\n" + + "\rHTTPGetAction\x12\x12\n" + + "\x04path\x18\x01 \x01(\tR\x04path\x12\x12\n" + + "\x04port\x18\x02 \x01(\x05R\x04port\"\r\n" + "\vRunResponse\"G\n" + "\x1cLocalCheckpointConfiguration\x12'\n" + "\x0fsnapshot_prefix\x18\x01 \x01(\tR\x0esnapshotPrefix\"Q\n" + @@ -1072,7 +1187,7 @@ func file_atelet_proto_rawDescGZIP() []byte { } var file_atelet_proto_enumTypes = make([]protoimpl.EnumInfo, 1) -var file_atelet_proto_msgTypes = make([]protoimpl.MessageInfo, 16) +var file_atelet_proto_msgTypes = make([]protoimpl.MessageInfo, 18) var file_atelet_proto_goTypes = []any{ (CheckpointType)(0), // 0: atelet.CheckpointType (*RunRequest)(nil), // 1: atelet.RunRequest @@ -1082,44 +1197,48 @@ var file_atelet_proto_goTypes = []any{ (*WorkloadSpec)(nil), // 5: atelet.WorkloadSpec (*Container)(nil), // 6: atelet.Container (*EnvEntry)(nil), // 7: atelet.EnvEntry - (*RunResponse)(nil), // 8: atelet.RunResponse - (*LocalCheckpointConfiguration)(nil), // 9: atelet.LocalCheckpointConfiguration - (*ExternalCheckpointConfiguration)(nil), // 10: atelet.ExternalCheckpointConfiguration - (*CheckpointRequest)(nil), // 11: atelet.CheckpointRequest - (*CheckpointResponse)(nil), // 12: atelet.CheckpointResponse - (*RestoreRequest)(nil), // 13: atelet.RestoreRequest - (*RestoreResponse)(nil), // 14: atelet.RestoreResponse - nil, // 15: atelet.ArchAssets.FilesEntry - nil, // 16: atelet.SandboxAssets.AssetsEntry + (*Readyz)(nil), // 8: atelet.Readyz + (*HTTPGetAction)(nil), // 9: atelet.HTTPGetAction + (*RunResponse)(nil), // 10: atelet.RunResponse + (*LocalCheckpointConfiguration)(nil), // 11: atelet.LocalCheckpointConfiguration + (*ExternalCheckpointConfiguration)(nil), // 12: atelet.ExternalCheckpointConfiguration + (*CheckpointRequest)(nil), // 13: atelet.CheckpointRequest + (*CheckpointResponse)(nil), // 14: atelet.CheckpointResponse + (*RestoreRequest)(nil), // 15: atelet.RestoreRequest + (*RestoreResponse)(nil), // 16: atelet.RestoreResponse + nil, // 17: atelet.ArchAssets.FilesEntry + nil, // 18: atelet.SandboxAssets.AssetsEntry } var file_atelet_proto_depIdxs = []int32{ 5, // 0: atelet.RunRequest.spec:type_name -> atelet.WorkloadSpec 4, // 1: atelet.RunRequest.sandbox_assets:type_name -> atelet.SandboxAssets - 15, // 2: atelet.ArchAssets.files:type_name -> atelet.ArchAssets.FilesEntry - 16, // 3: atelet.SandboxAssets.assets:type_name -> atelet.SandboxAssets.AssetsEntry + 17, // 2: atelet.ArchAssets.files:type_name -> atelet.ArchAssets.FilesEntry + 18, // 3: atelet.SandboxAssets.assets:type_name -> atelet.SandboxAssets.AssetsEntry 6, // 4: atelet.WorkloadSpec.containers:type_name -> atelet.Container 7, // 5: atelet.Container.env:type_name -> atelet.EnvEntry - 5, // 6: atelet.CheckpointRequest.spec:type_name -> atelet.WorkloadSpec - 0, // 7: atelet.CheckpointRequest.type:type_name -> atelet.CheckpointType - 9, // 8: atelet.CheckpointRequest.local_config:type_name -> atelet.LocalCheckpointConfiguration - 10, // 9: atelet.CheckpointRequest.external_config:type_name -> atelet.ExternalCheckpointConfiguration - 5, // 10: atelet.RestoreRequest.spec:type_name -> atelet.WorkloadSpec - 0, // 11: atelet.RestoreRequest.type:type_name -> atelet.CheckpointType - 9, // 12: atelet.RestoreRequest.local_config:type_name -> atelet.LocalCheckpointConfiguration - 10, // 13: atelet.RestoreRequest.external_config:type_name -> atelet.ExternalCheckpointConfiguration - 2, // 14: atelet.ArchAssets.FilesEntry.value:type_name -> atelet.AssetFile - 3, // 15: atelet.SandboxAssets.AssetsEntry.value:type_name -> atelet.ArchAssets - 1, // 16: atelet.AteomHerder.Run:input_type -> atelet.RunRequest - 11, // 17: atelet.AteomHerder.Checkpoint:input_type -> atelet.CheckpointRequest - 13, // 18: atelet.AteomHerder.Restore:input_type -> atelet.RestoreRequest - 8, // 19: atelet.AteomHerder.Run:output_type -> atelet.RunResponse - 12, // 20: atelet.AteomHerder.Checkpoint:output_type -> atelet.CheckpointResponse - 14, // 21: atelet.AteomHerder.Restore:output_type -> atelet.RestoreResponse - 19, // [19:22] is the sub-list for method output_type - 16, // [16:19] is the sub-list for method input_type - 16, // [16:16] is the sub-list for extension type_name - 16, // [16:16] is the sub-list for extension extendee - 0, // [0:16] is the sub-list for field type_name + 8, // 6: atelet.Container.readyz:type_name -> atelet.Readyz + 9, // 7: atelet.Readyz.http_get:type_name -> atelet.HTTPGetAction + 5, // 8: atelet.CheckpointRequest.spec:type_name -> atelet.WorkloadSpec + 0, // 9: atelet.CheckpointRequest.type:type_name -> atelet.CheckpointType + 11, // 10: atelet.CheckpointRequest.local_config:type_name -> atelet.LocalCheckpointConfiguration + 12, // 11: atelet.CheckpointRequest.external_config:type_name -> atelet.ExternalCheckpointConfiguration + 5, // 12: atelet.RestoreRequest.spec:type_name -> atelet.WorkloadSpec + 0, // 13: atelet.RestoreRequest.type:type_name -> atelet.CheckpointType + 11, // 14: atelet.RestoreRequest.local_config:type_name -> atelet.LocalCheckpointConfiguration + 12, // 15: atelet.RestoreRequest.external_config:type_name -> atelet.ExternalCheckpointConfiguration + 2, // 16: atelet.ArchAssets.FilesEntry.value:type_name -> atelet.AssetFile + 3, // 17: atelet.SandboxAssets.AssetsEntry.value:type_name -> atelet.ArchAssets + 1, // 18: atelet.AteomHerder.Run:input_type -> atelet.RunRequest + 13, // 19: atelet.AteomHerder.Checkpoint:input_type -> atelet.CheckpointRequest + 15, // 20: atelet.AteomHerder.Restore:input_type -> atelet.RestoreRequest + 10, // 21: atelet.AteomHerder.Run:output_type -> atelet.RunResponse + 14, // 22: atelet.AteomHerder.Checkpoint:output_type -> atelet.CheckpointResponse + 16, // 23: atelet.AteomHerder.Restore:output_type -> atelet.RestoreResponse + 21, // [21:24] is the sub-list for method output_type + 18, // [18:21] is the sub-list for method input_type + 18, // [18:18] is the sub-list for extension type_name + 18, // [18:18] is the sub-list for extension extendee + 0, // [0:18] is the sub-list for field type_name } func init() { file_atelet_proto_init() } @@ -1127,11 +1246,11 @@ func file_atelet_proto_init() { if File_atelet_proto != nil { return } - file_atelet_proto_msgTypes[10].OneofWrappers = []any{ + file_atelet_proto_msgTypes[12].OneofWrappers = []any{ (*CheckpointRequest_LocalConfig)(nil), (*CheckpointRequest_ExternalConfig)(nil), } - file_atelet_proto_msgTypes[12].OneofWrappers = []any{ + file_atelet_proto_msgTypes[14].OneofWrappers = []any{ (*RestoreRequest_LocalConfig)(nil), (*RestoreRequest_ExternalConfig)(nil), } @@ -1141,7 +1260,7 @@ func file_atelet_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_atelet_proto_rawDesc), len(file_atelet_proto_rawDesc)), NumEnums: 1, - NumMessages: 16, + NumMessages: 18, NumExtensions: 0, NumServices: 1, }, diff --git a/internal/proto/ateletpb/atelet.proto b/internal/proto/ateletpb/atelet.proto index d210656d6..6c4046fcb 100644 --- a/internal/proto/ateletpb/atelet.proto +++ b/internal/proto/ateletpb/atelet.proto @@ -84,6 +84,7 @@ message Container { string image = 2; repeated string command = 3; repeated EnvEntry env = 4; + Readyz readyz = 5; } message EnvEntry { @@ -91,6 +92,20 @@ message EnvEntry { string value = 2; } +// Readyz describes how to check that a container is ready to serve. +// Only HTTP is supported today. +message Readyz { + HTTPGetAction http_get = 1; +} + +// HTTPGetAction performs an HTTP GET against the container. +message HTTPGetAction { + // Path to access on the HTTP server. Empty means "/readyz". + string path = 1; + // TCP port to connect to (1..65535). + int32 port = 2; +} + message RunResponse { } diff --git a/internal/proto/ateompb/ateom.pb.go b/internal/proto/ateompb/ateom.pb.go index c0183619c..158f277c9 100644 --- a/internal/proto/ateompb/ateom.pb.go +++ b/internal/proto/ateompb/ateom.pb.go @@ -171,6 +171,7 @@ func (x *WorkloadSpec) GetContainers() []*Container { type Container struct { state protoimpl.MessageState `protogen:"open.v1"` Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + Readyz *Readyz `protobuf:"bytes,2,opt,name=readyz,proto3" json:"readyz,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -212,6 +213,114 @@ func (x *Container) GetName() string { return "" } +func (x *Container) GetReadyz() *Readyz { + if x != nil { + return x.Readyz + } + return nil +} + +// Readyz describes how to check that a container is ready to serve. +// Only HTTP is supported today. +type Readyz struct { + state protoimpl.MessageState `protogen:"open.v1"` + HttpGet *HTTPGetAction `protobuf:"bytes,1,opt,name=http_get,json=httpGet,proto3" json:"http_get,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *Readyz) Reset() { + *x = Readyz{} + mi := &file_ateom_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *Readyz) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*Readyz) ProtoMessage() {} + +func (x *Readyz) ProtoReflect() protoreflect.Message { + mi := &file_ateom_proto_msgTypes[3] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use Readyz.ProtoReflect.Descriptor instead. +func (*Readyz) Descriptor() ([]byte, []int) { + return file_ateom_proto_rawDescGZIP(), []int{3} +} + +func (x *Readyz) GetHttpGet() *HTTPGetAction { + if x != nil { + return x.HttpGet + } + return nil +} + +// HTTPGetAction performs an HTTP GET against the container. +type HTTPGetAction struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Path to access on the HTTP server. Empty means "/readyz". + Path string `protobuf:"bytes,1,opt,name=path,proto3" json:"path,omitempty"` + // TCP port to connect to (1..65535). + Port int32 `protobuf:"varint,2,opt,name=port,proto3" json:"port,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *HTTPGetAction) Reset() { + *x = HTTPGetAction{} + mi := &file_ateom_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *HTTPGetAction) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*HTTPGetAction) ProtoMessage() {} + +func (x *HTTPGetAction) ProtoReflect() protoreflect.Message { + mi := &file_ateom_proto_msgTypes[4] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use HTTPGetAction.ProtoReflect.Descriptor instead. +func (*HTTPGetAction) Descriptor() ([]byte, []int) { + return file_ateom_proto_rawDescGZIP(), []int{4} +} + +func (x *HTTPGetAction) GetPath() string { + if x != nil { + return x.Path + } + return "" +} + +func (x *HTTPGetAction) GetPort() int32 { + if x != nil { + return x.Port + } + return 0 +} + type RunWorkloadResponse struct { state protoimpl.MessageState `protogen:"open.v1"` unknownFields protoimpl.UnknownFields @@ -220,7 +329,7 @@ type RunWorkloadResponse struct { func (x *RunWorkloadResponse) Reset() { *x = RunWorkloadResponse{} - mi := &file_ateom_proto_msgTypes[3] + mi := &file_ateom_proto_msgTypes[5] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -232,7 +341,7 @@ func (x *RunWorkloadResponse) String() string { func (*RunWorkloadResponse) ProtoMessage() {} func (x *RunWorkloadResponse) ProtoReflect() protoreflect.Message { - mi := &file_ateom_proto_msgTypes[3] + mi := &file_ateom_proto_msgTypes[5] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -245,7 +354,7 @@ func (x *RunWorkloadResponse) ProtoReflect() protoreflect.Message { // Deprecated: Use RunWorkloadResponse.ProtoReflect.Descriptor instead. func (*RunWorkloadResponse) Descriptor() ([]byte, []int) { - return file_ateom_proto_rawDescGZIP(), []int{3} + return file_ateom_proto_rawDescGZIP(), []int{5} } type CheckpointWorkloadRequest struct { @@ -273,7 +382,7 @@ type CheckpointWorkloadRequest struct { func (x *CheckpointWorkloadRequest) Reset() { *x = CheckpointWorkloadRequest{} - mi := &file_ateom_proto_msgTypes[4] + mi := &file_ateom_proto_msgTypes[6] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -285,7 +394,7 @@ func (x *CheckpointWorkloadRequest) String() string { func (*CheckpointWorkloadRequest) ProtoMessage() {} func (x *CheckpointWorkloadRequest) ProtoReflect() protoreflect.Message { - mi := &file_ateom_proto_msgTypes[4] + mi := &file_ateom_proto_msgTypes[6] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -298,7 +407,7 @@ func (x *CheckpointWorkloadRequest) ProtoReflect() protoreflect.Message { // Deprecated: Use CheckpointWorkloadRequest.ProtoReflect.Descriptor instead. func (*CheckpointWorkloadRequest) Descriptor() ([]byte, []int) { - return file_ateom_proto_rawDescGZIP(), []int{4} + return file_ateom_proto_rawDescGZIP(), []int{6} } func (x *CheckpointWorkloadRequest) GetActorTemplateNamespace() string { @@ -362,7 +471,7 @@ type CheckpointWorkloadResponse struct { func (x *CheckpointWorkloadResponse) Reset() { *x = CheckpointWorkloadResponse{} - mi := &file_ateom_proto_msgTypes[5] + mi := &file_ateom_proto_msgTypes[7] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -374,7 +483,7 @@ func (x *CheckpointWorkloadResponse) String() string { func (*CheckpointWorkloadResponse) ProtoMessage() {} func (x *CheckpointWorkloadResponse) ProtoReflect() protoreflect.Message { - mi := &file_ateom_proto_msgTypes[5] + mi := &file_ateom_proto_msgTypes[7] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -387,7 +496,7 @@ func (x *CheckpointWorkloadResponse) ProtoReflect() protoreflect.Message { // Deprecated: Use CheckpointWorkloadResponse.ProtoReflect.Descriptor instead. func (*CheckpointWorkloadResponse) Descriptor() ([]byte, []int) { - return file_ateom_proto_rawDescGZIP(), []int{5} + return file_ateom_proto_rawDescGZIP(), []int{7} } func (x *CheckpointWorkloadResponse) GetSnapshotFiles() []string { @@ -415,7 +524,7 @@ type RestoreWorkloadRequest struct { func (x *RestoreWorkloadRequest) Reset() { *x = RestoreWorkloadRequest{} - mi := &file_ateom_proto_msgTypes[6] + mi := &file_ateom_proto_msgTypes[8] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -427,7 +536,7 @@ func (x *RestoreWorkloadRequest) String() string { func (*RestoreWorkloadRequest) ProtoMessage() {} func (x *RestoreWorkloadRequest) ProtoReflect() protoreflect.Message { - mi := &file_ateom_proto_msgTypes[6] + mi := &file_ateom_proto_msgTypes[8] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -440,7 +549,7 @@ func (x *RestoreWorkloadRequest) ProtoReflect() protoreflect.Message { // Deprecated: Use RestoreWorkloadRequest.ProtoReflect.Descriptor instead. func (*RestoreWorkloadRequest) Descriptor() ([]byte, []int) { - return file_ateom_proto_rawDescGZIP(), []int{6} + return file_ateom_proto_rawDescGZIP(), []int{8} } func (x *RestoreWorkloadRequest) GetActorTemplateNamespace() string { @@ -500,7 +609,7 @@ type RestoreWorkloadResponse struct { func (x *RestoreWorkloadResponse) Reset() { *x = RestoreWorkloadResponse{} - mi := &file_ateom_proto_msgTypes[7] + mi := &file_ateom_proto_msgTypes[9] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -512,7 +621,7 @@ func (x *RestoreWorkloadResponse) String() string { func (*RestoreWorkloadResponse) ProtoMessage() {} func (x *RestoreWorkloadResponse) ProtoReflect() protoreflect.Message { - mi := &file_ateom_proto_msgTypes[7] + mi := &file_ateom_proto_msgTypes[9] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -525,7 +634,7 @@ func (x *RestoreWorkloadResponse) ProtoReflect() protoreflect.Message { // Deprecated: Use RestoreWorkloadResponse.ProtoReflect.Descriptor instead. func (*RestoreWorkloadResponse) Descriptor() ([]byte, []int) { - return file_ateom_proto_rawDescGZIP(), []int{7} + return file_ateom_proto_rawDescGZIP(), []int{9} } var File_ateom_proto protoreflect.FileDescriptor @@ -547,9 +656,15 @@ const file_ateom_proto_rawDesc = "" + "\fWorkloadSpec\x120\n" + "\n" + "containers\x18\x01 \x03(\v2\x10.ateom.ContainerR\n" + - "containers\"\x1f\n" + + "containers\"F\n" + "\tContainer\x12\x12\n" + - "\x04name\x18\x01 \x01(\tR\x04name\"\x15\n" + + "\x04name\x18\x01 \x01(\tR\x04name\x12%\n" + + "\x06readyz\x18\x02 \x01(\v2\r.ateom.ReadyzR\x06readyz\"9\n" + + "\x06Readyz\x12/\n" + + "\bhttp_get\x18\x01 \x01(\v2\x14.ateom.HTTPGetActionR\ahttpGet\"7\n" + + "\rHTTPGetAction\x12\x12\n" + + "\x04path\x18\x01 \x01(\tR\x04path\x12\x12\n" + + "\x04port\x18\x02 \x01(\x05R\x04port\"\x15\n" + "\x13RunWorkloadResponse\"\xc7\x03\n" + "\x19CheckpointWorkloadRequest\x128\n" + "\x18actor_template_namespace\x18\x01 \x01(\tR\x16actorTemplateNamespace\x12.\n" + @@ -595,39 +710,43 @@ func file_ateom_proto_rawDescGZIP() []byte { return file_ateom_proto_rawDescData } -var file_ateom_proto_msgTypes = make([]protoimpl.MessageInfo, 11) +var file_ateom_proto_msgTypes = make([]protoimpl.MessageInfo, 13) var file_ateom_proto_goTypes = []any{ (*RunWorkloadRequest)(nil), // 0: ateom.RunWorkloadRequest (*WorkloadSpec)(nil), // 1: ateom.WorkloadSpec (*Container)(nil), // 2: ateom.Container - (*RunWorkloadResponse)(nil), // 3: ateom.RunWorkloadResponse - (*CheckpointWorkloadRequest)(nil), // 4: ateom.CheckpointWorkloadRequest - (*CheckpointWorkloadResponse)(nil), // 5: ateom.CheckpointWorkloadResponse - (*RestoreWorkloadRequest)(nil), // 6: ateom.RestoreWorkloadRequest - (*RestoreWorkloadResponse)(nil), // 7: ateom.RestoreWorkloadResponse - nil, // 8: ateom.RunWorkloadRequest.RuntimeAssetPathsEntry - nil, // 9: ateom.CheckpointWorkloadRequest.RuntimeAssetPathsEntry - nil, // 10: ateom.RestoreWorkloadRequest.RuntimeAssetPathsEntry + (*Readyz)(nil), // 3: ateom.Readyz + (*HTTPGetAction)(nil), // 4: ateom.HTTPGetAction + (*RunWorkloadResponse)(nil), // 5: ateom.RunWorkloadResponse + (*CheckpointWorkloadRequest)(nil), // 6: ateom.CheckpointWorkloadRequest + (*CheckpointWorkloadResponse)(nil), // 7: ateom.CheckpointWorkloadResponse + (*RestoreWorkloadRequest)(nil), // 8: ateom.RestoreWorkloadRequest + (*RestoreWorkloadResponse)(nil), // 9: ateom.RestoreWorkloadResponse + nil, // 10: ateom.RunWorkloadRequest.RuntimeAssetPathsEntry + nil, // 11: ateom.CheckpointWorkloadRequest.RuntimeAssetPathsEntry + nil, // 12: ateom.RestoreWorkloadRequest.RuntimeAssetPathsEntry } var file_ateom_proto_depIdxs = []int32{ 1, // 0: ateom.RunWorkloadRequest.spec:type_name -> ateom.WorkloadSpec - 8, // 1: ateom.RunWorkloadRequest.runtime_asset_paths:type_name -> ateom.RunWorkloadRequest.RuntimeAssetPathsEntry + 10, // 1: ateom.RunWorkloadRequest.runtime_asset_paths:type_name -> ateom.RunWorkloadRequest.RuntimeAssetPathsEntry 2, // 2: ateom.WorkloadSpec.containers:type_name -> ateom.Container - 1, // 3: ateom.CheckpointWorkloadRequest.spec:type_name -> ateom.WorkloadSpec - 9, // 4: ateom.CheckpointWorkloadRequest.runtime_asset_paths:type_name -> ateom.CheckpointWorkloadRequest.RuntimeAssetPathsEntry - 1, // 5: ateom.RestoreWorkloadRequest.spec:type_name -> ateom.WorkloadSpec - 10, // 6: ateom.RestoreWorkloadRequest.runtime_asset_paths:type_name -> ateom.RestoreWorkloadRequest.RuntimeAssetPathsEntry - 0, // 7: ateom.Ateom.RunWorkload:input_type -> ateom.RunWorkloadRequest - 4, // 8: ateom.Ateom.CheckpointWorkload:input_type -> ateom.CheckpointWorkloadRequest - 6, // 9: ateom.Ateom.RestoreWorkload:input_type -> ateom.RestoreWorkloadRequest - 3, // 10: ateom.Ateom.RunWorkload:output_type -> ateom.RunWorkloadResponse - 5, // 11: ateom.Ateom.CheckpointWorkload:output_type -> ateom.CheckpointWorkloadResponse - 7, // 12: ateom.Ateom.RestoreWorkload:output_type -> ateom.RestoreWorkloadResponse - 10, // [10:13] is the sub-list for method output_type - 7, // [7:10] is the sub-list for method input_type - 7, // [7:7] is the sub-list for extension type_name - 7, // [7:7] is the sub-list for extension extendee - 0, // [0:7] is the sub-list for field type_name + 3, // 3: ateom.Container.readyz:type_name -> ateom.Readyz + 4, // 4: ateom.Readyz.http_get:type_name -> ateom.HTTPGetAction + 1, // 5: ateom.CheckpointWorkloadRequest.spec:type_name -> ateom.WorkloadSpec + 11, // 6: ateom.CheckpointWorkloadRequest.runtime_asset_paths:type_name -> ateom.CheckpointWorkloadRequest.RuntimeAssetPathsEntry + 1, // 7: ateom.RestoreWorkloadRequest.spec:type_name -> ateom.WorkloadSpec + 12, // 8: ateom.RestoreWorkloadRequest.runtime_asset_paths:type_name -> ateom.RestoreWorkloadRequest.RuntimeAssetPathsEntry + 0, // 9: ateom.Ateom.RunWorkload:input_type -> ateom.RunWorkloadRequest + 6, // 10: ateom.Ateom.CheckpointWorkload:input_type -> ateom.CheckpointWorkloadRequest + 8, // 11: ateom.Ateom.RestoreWorkload:input_type -> ateom.RestoreWorkloadRequest + 5, // 12: ateom.Ateom.RunWorkload:output_type -> ateom.RunWorkloadResponse + 7, // 13: ateom.Ateom.CheckpointWorkload:output_type -> ateom.CheckpointWorkloadResponse + 9, // 14: ateom.Ateom.RestoreWorkload:output_type -> ateom.RestoreWorkloadResponse + 12, // [12:15] is the sub-list for method output_type + 9, // [9:12] is the sub-list for method input_type + 9, // [9:9] is the sub-list for extension type_name + 9, // [9:9] is the sub-list for extension extendee + 0, // [0:9] is the sub-list for field type_name } func init() { file_ateom_proto_init() } @@ -641,7 +760,7 @@ func file_ateom_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: unsafe.Slice(unsafe.StringData(file_ateom_proto_rawDesc), len(file_ateom_proto_rawDesc)), NumEnums: 0, - NumMessages: 11, + NumMessages: 13, NumExtensions: 0, NumServices: 1, }, diff --git a/internal/proto/ateompb/ateom.proto b/internal/proto/ateompb/ateom.proto index b367bec78..0260975b9 100644 --- a/internal/proto/ateompb/ateom.proto +++ b/internal/proto/ateompb/ateom.proto @@ -70,6 +70,21 @@ message WorkloadSpec { message Container { string name = 1; + Readyz readyz = 2; +} + +// Readyz describes how to check that a container is ready to serve. +// Only HTTP is supported today. +message Readyz { + HTTPGetAction http_get = 1; +} + +// HTTPGetAction performs an HTTP GET against the container. +message HTTPGetAction { + // Path to access on the HTTP server. Empty means "/readyz". + string path = 1; + // TCP port to connect to (1..65535). + int32 port = 2; } message RunWorkloadResponse { diff --git a/internal/readyz/readyz.go b/internal/readyz/readyz.go new file mode 100644 index 000000000..12640c0c1 --- /dev/null +++ b/internal/readyz/readyz.go @@ -0,0 +1,159 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package readyz polls a container's HTTP readiness endpoint from inside an +// ateom (currently ateom-gvisor; ateom-microvm will use the same code path +// once its actor networking lands). The intent is to detect the moment a +// container's HTTP server starts accepting connections with single-millisecond +// latency: while the server is still booting the kernel returns RST in +// microseconds, so a sub-millisecond poll loop spends almost no time blocked, +// and once the listen socket is up the next iteration completes the GET on +// veth-local latency. +package readyz + +import ( + "context" + "fmt" + "io" + "log/slog" + "net" + "net/http" + "strings" + "time" + + "github.com/agent-substrate/substrate/internal/proto/ateompb" + "golang.org/x/sync/errgroup" +) + +// Tuning knobs. Exported as vars so callers (or tests) can shorten the +// overall timeout in pathological setups; the defaults are sized for actor +// cold-start where the server may take a few seconds to bind. +const ( + OverallTimeout = 30 * time.Second + RequestTimeout = 250 * time.Millisecond + PollInterval = 500 * time.Microsecond + DefaultPath = "/readyz" + maxIdleConnsHost = 1 +) + +// HTTPClient builds a keep-alive HTTP client tuned for fast, repeated +// probing of a single endpoint. Exposed as a var so tests can substitute a +// transport that targets a test server's loopback address. +var HTTPClient = func() *http.Client { + tr := &http.Transport{ + DisableCompression: true, + MaxIdleConnsPerHost: maxIdleConnsHost, + DialContext: (&net.Dialer{Timeout: RequestTimeout}).DialContext, + ResponseHeaderTimeout: RequestTimeout, + } + return &http.Client{Transport: tr, Timeout: RequestTimeout} +} + +// WaitAll blocks until every container with a readyz probe set reports 200, +// or returns the first error. Containers without a probe are skipped (their +// absence means "no readiness gate"). +func WaitAll(ctx context.Context, containers []*ateompb.Container, actorIP string) error { + g, gctx := errgroup.WithContext(ctx) + for _, ac := range containers { + if ac.GetReadyz() == nil { + continue + } + ac := ac + g.Go(func() error { + return Wait(gctx, ac.GetName(), ac.GetReadyz(), actorIP) + }) + } + return g.Wait() +} + +// Wait polls the configured HTTP endpoint until it returns 200, the context +// is cancelled, or the overall deadline is exceeded. +func Wait(ctx context.Context, containerName string, probe *ateompb.Readyz, actorIP string) error { + url, err := URL(probe, actorIP) + if err != nil { + return fmt.Errorf("invalid readyz config for %q: %w", containerName, err) + } + + client := HTTPClient() + defer client.CloseIdleConnections() + + start := time.Now() + deadline := start.Add(OverallTimeout) + attempts := 0 + for { + if err := ctx.Err(); err != nil { + return fmt.Errorf("readyz cancelled for %q after %s (%d attempts): %w", + containerName, time.Since(start), attempts, err) + } + if time.Now().After(deadline) { + return fmt.Errorf("readyz for %q never returned 200 within %s (%d attempts)", + containerName, OverallTimeout, attempts) + } + + attempts++ + ok, _ := tryOnce(ctx, client, url) + if ok { + slog.InfoContext(ctx, "Readyz reached 200", + slog.String("container", containerName), + slog.String("url", url), + slog.Duration("elapsed", time.Since(start)), + slog.Int("attempts", attempts)) + return nil + } + + // Sleep instead of busy-loop. The interval bounds the worst-case + // detection delay; pre-readiness, each attempt also blocks for + // tens of µs in the kernel waiting for the RST, so the actual + // per-iteration period is somewhat longer than the sleep alone. + select { + case <-ctx.Done(): + case <-time.After(PollInterval): + } + } +} + +func tryOnce(ctx context.Context, client *http.Client, url string) (bool, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return false, err + } + resp, err := client.Do(req) + if err != nil { + return false, err + } + // Drain so the connection can be reused by the keep-alive pool. + _, _ = io.Copy(io.Discard, resp.Body) + resp.Body.Close() + return resp.StatusCode == http.StatusOK, nil +} + +// URL builds the probe endpoint URL. Exported so callers and tests can +// validate a probe spec before kicking off a Wait. +func URL(probe *ateompb.Readyz, actorIP string) (string, error) { + hg := probe.GetHttpGet() + if hg == nil { + return "", fmt.Errorf("httpGet is required") + } + port := hg.GetPort() + if port < 1 || port > 65535 { + return "", fmt.Errorf("invalid port %d", port) + } + path := hg.GetPath() + if path == "" { + path = DefaultPath + } else if !strings.HasPrefix(path, "/") { + path = "/" + path + } + return fmt.Sprintf("http://%s:%d%s", actorIP, port, path), nil +} diff --git a/internal/readyz/readyz_test.go b/internal/readyz/readyz_test.go new file mode 100644 index 000000000..4bd1fc56f --- /dev/null +++ b/internal/readyz/readyz_test.go @@ -0,0 +1,201 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package readyz + +import ( + "context" + "net" + "net/http" + "net/http/httptest" + "net/url" + "strconv" + "sync/atomic" + "testing" + "time" + + "github.com/agent-substrate/substrate/internal/proto/ateompb" +) + +func TestURL(t *testing.T) { + tests := []struct { + name string + probe *ateompb.Readyz + actorIP string + want string + wantErr bool + }{ + { + name: "default path", + probe: &ateompb.Readyz{HttpGet: &ateompb.HTTPGetAction{Port: 8080}}, + actorIP: "169.254.17.2", + want: "http://169.254.17.2:8080/readyz", + }, + { + name: "explicit path", + probe: &ateompb.Readyz{HttpGet: &ateompb.HTTPGetAction{Path: "/health", Port: 9000}}, + actorIP: "169.254.17.2", + want: "http://169.254.17.2:9000/health", + }, + { + name: "path without leading slash is normalized", + probe: &ateompb.Readyz{HttpGet: &ateompb.HTTPGetAction{Path: "ready", Port: 80}}, + actorIP: "10.0.0.1", + want: "http://10.0.0.1:80/ready", + }, + { + name: "missing httpGet", + probe: &ateompb.Readyz{}, + actorIP: "1.2.3.4", + wantErr: true, + }, + { + name: "port zero", + probe: &ateompb.Readyz{HttpGet: &ateompb.HTTPGetAction{Port: 0}}, + actorIP: "1.2.3.4", + wantErr: true, + }, + { + name: "port too large", + probe: &ateompb.Readyz{HttpGet: &ateompb.HTTPGetAction{Port: 70000}}, + actorIP: "1.2.3.4", + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := URL(tt.probe, tt.actorIP) + if (err != nil) != tt.wantErr { + t.Fatalf("err = %v, wantErr = %v", err, tt.wantErr) + } + if got != tt.want { + t.Errorf("URL = %q, want %q", got, tt.want) + } + }) + } +} + +func TestWait_ReturnsOnFirst200(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/readyz" { + http.Error(w, "not found", http.StatusNotFound) + return + } + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + ip, port := splitHostPort(t, srv.URL) + probe := &ateompb.Readyz{HttpGet: &ateompb.HTTPGetAction{Port: int32(port)}} + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + if err := Wait(ctx, "main", probe, ip); err != nil { + t.Fatalf("Wait returned error: %v", err) + } +} + +func TestWait_WaitsForServerToBecomeReady(t *testing.T) { + var ready atomic.Bool + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + if !ready.Load() { + http.Error(w, "not yet", http.StatusServiceUnavailable) + return + } + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + ip, port := splitHostPort(t, srv.URL) + probe := &ateompb.Readyz{HttpGet: &ateompb.HTTPGetAction{Port: int32(port)}} + + flipAt := time.Now().Add(50 * time.Millisecond) + go func() { + time.Sleep(time.Until(flipAt)) + ready.Store(true) + }() + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + start := time.Now() + if err := Wait(ctx, "main", probe, ip); err != nil { + t.Fatalf("Wait returned error: %v", err) + } + elapsed := time.Since(start) + if elapsed < 40*time.Millisecond { + t.Errorf("Wait returned suspiciously early: %v (server became ready at +50ms)", elapsed) + } + if elapsed > 500*time.Millisecond { + t.Errorf("Wait returned too late: %v (expected ~50ms + poll interval)", elapsed) + } +} + +func TestWait_ContextCancellation(t *testing.T) { + // Bind a port and immediately close to ensure connect-refused, so the + // poll loop is exercised but no server ever returns 200. + port := pickFreePort(t) + probe := &ateompb.Readyz{HttpGet: &ateompb.HTTPGetAction{Port: int32(port)}} + + ctx, cancel := context.WithCancel(context.Background()) + go func() { + time.Sleep(30 * time.Millisecond) + cancel() + }() + + err := Wait(ctx, "main", probe, "127.0.0.1") + if err == nil { + t.Fatalf("Wait returned nil, expected cancellation error") + } +} + +func TestWaitAll_SkipsContainersWithoutProbe(t *testing.T) { + // No server bound, but no probes => should return nil immediately. + containers := []*ateompb.Container{ + {Name: "a"}, + {Name: "b"}, + } + ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond) + defer cancel() + if err := WaitAll(ctx, containers, "127.0.0.1"); err != nil { + t.Fatalf("WaitAll with no probes returned error: %v", err) + } +} + +func splitHostPort(t *testing.T, raw string) (string, int) { + t.Helper() + u, err := url.Parse(raw) + if err != nil { + t.Fatalf("parse url: %v", err) + } + host, portStr, err := net.SplitHostPort(u.Host) + if err != nil { + t.Fatalf("split host:port: %v", err) + } + port, err := strconv.Atoi(portStr) + if err != nil { + t.Fatalf("port atoi: %v", err) + } + return host, port +} + +func pickFreePort(t *testing.T) int { + t.Helper() + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + t.Fatalf("listen: %v", err) + } + port := l.Addr().(*net.TCPAddr).Port + l.Close() + return port +} diff --git a/manifests/ate-install/generated/ate.dev_actortemplates.yaml b/manifests/ate-install/generated/ate.dev_actortemplates.yaml index 02636823a..53207c27c 100644 --- a/manifests/ate-install/generated/ate.dev_actortemplates.yaml +++ b/manifests/ate-install/generated/ate.dev_actortemplates.yaml @@ -152,6 +152,37 @@ spec: x-kubernetes-validations: - message: Name must be a valid DNS label rule: '!format.dns1123Label().validate(self).hasValue()' + readyz: + description: |- + Readyz is an optional HTTP readiness probe. When set, the actor is not + considered ready (and Run/Restore RPCs do not return success) until the + container's HTTP endpoint returns 200. + properties: + httpGet: + description: HTTPGet specifies the HTTP request to perform + against the container. + properties: + path: + description: |- + Path to access on the HTTP server. Defaults to "/readyz" when empty. + If set, it must be a valid URL path starting with "/". Only characters + permitted by RFC 3986 path segments are accepted; query strings ("?") + and fragments ("#") must be omitted. + maxLength: 1024 + pattern: ^/[A-Za-z0-9\-._~!$&'()*+,;=:@/%]*$ + type: string + port: + description: Port to access on the container. + format: int32 + maximum: 65535 + minimum: 1 + type: integer + required: + - port + type: object + required: + - httpGet + type: object required: - image - name diff --git a/pkg/api/v1alpha1/actortemplate_types.go b/pkg/api/v1alpha1/actortemplate_types.go index 519015fdd..e1c3e5fb3 100644 --- a/pkg/api/v1alpha1/actortemplate_types.go +++ b/pkg/api/v1alpha1/actortemplate_types.go @@ -56,6 +56,42 @@ type Container struct { // +optional // +kubebuilder:validation:MaxItems=32 Env []EnvVar `json:"env,omitempty"` + + // Readyz is an optional HTTP readiness probe. When set, the actor is not + // considered ready (and Run/Restore RPCs do not return success) until the + // container's HTTP endpoint returns 200. + // + // +optional + Readyz *ContainerReadyz `json:"readyz,omitempty"` +} + +// ContainerReadyz configures the readiness signal for a container. +type ContainerReadyz struct { + // HTTPGet specifies the HTTP request to perform against the container. + // + // +required + HTTPGet *HTTPGetAction `json:"httpGet"` +} + +// HTTPGetAction describes an HTTP GET request to perform against the +// container's interior IP. Modeled after a subset of corev1.HTTPGetAction. +type HTTPGetAction struct { + // Path to access on the HTTP server. Defaults to "/readyz" when empty. + // If set, it must be a valid URL path starting with "/". Only characters + // permitted by RFC 3986 path segments are accepted; query strings ("?") + // and fragments ("#") must be omitted. + // + // +optional + // +kubebuilder:validation:MaxLength=1024 + // +kubebuilder:validation:Pattern=`^/[A-Za-z0-9\-._~!$&'()*+,;=:@/%]*$` + Path string `json:"path,omitempty"` + + // Port to access on the container. + // + // +required + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Maximum=65535 + Port int32 `json:"port"` } // EnvVar represents an environment variable supplied to a container in an diff --git a/pkg/api/v1alpha1/actortemplate_validation_test.go b/pkg/api/v1alpha1/actortemplate_validation_test.go index beca713c7..854662cff 100644 --- a/pkg/api/v1alpha1/actortemplate_validation_test.go +++ b/pkg/api/v1alpha1/actortemplate_validation_test.go @@ -364,6 +364,91 @@ func TestActorTemplateValidation(t *testing.T) { }, wantErr: true, errMsg: "Invalid value", + }, { + name: "valid Readyz with default path", + mutate: func(at *ActorTemplate) { + at.Spec.Containers[0].Readyz = &ContainerReadyz{ + HTTPGet: &HTTPGetAction{Port: 8080}, + } + }, + wantErr: false, + }, { + name: "valid Readyz with explicit path", + mutate: func(at *ActorTemplate) { + at.Spec.Containers[0].Readyz = &ContainerReadyz{ + HTTPGet: &HTTPGetAction{Path: "/health", Port: 8080}, + } + }, + wantErr: false, + }, { + name: "Readyz missing HTTPGet", + mutate: func(at *ActorTemplate) { + at.Spec.Containers[0].Readyz = &ContainerReadyz{} + }, + wantErr: true, + errMsg: "Required value", + }, { + name: "Readyz port zero", + mutate: func(at *ActorTemplate) { + at.Spec.Containers[0].Readyz = &ContainerReadyz{ + HTTPGet: &HTTPGetAction{Port: 0}, + } + }, + wantErr: true, + errMsg: "should be greater than or equal to 1", + }, { + name: "Readyz port too large", + mutate: func(at *ActorTemplate) { + at.Spec.Containers[0].Readyz = &ContainerReadyz{ + HTTPGet: &HTTPGetAction{Port: 65536}, + } + }, + wantErr: true, + errMsg: "should be less than or equal to 65535", + }, { + name: "Readyz Path with nested segments and percent encoding", + mutate: func(at *ActorTemplate) { + at.Spec.Containers[0].Readyz = &ContainerReadyz{ + HTTPGet: &HTTPGetAction{Path: "/v1/health/check%20me", Port: 80}, + } + }, + wantErr: false, + }, { + name: "Readyz Path missing leading slash", + mutate: func(at *ActorTemplate) { + at.Spec.Containers[0].Readyz = &ContainerReadyz{ + HTTPGet: &HTTPGetAction{Path: "readyz", Port: 80}, + } + }, + wantErr: true, + errMsg: "should match", + }, { + name: "Readyz Path with query string", + mutate: func(at *ActorTemplate) { + at.Spec.Containers[0].Readyz = &ContainerReadyz{ + HTTPGet: &HTTPGetAction{Path: "/readyz?check=1", Port: 80}, + } + }, + wantErr: true, + errMsg: "should match", + }, { + name: "Readyz Path with fragment", + mutate: func(at *ActorTemplate) { + at.Spec.Containers[0].Readyz = &ContainerReadyz{ + HTTPGet: &HTTPGetAction{Path: "/readyz#frag", Port: 80}, + } + }, + wantErr: true, + errMsg: "should match", + }, { + name: "Readyz Path with whitespace", + mutate: func(at *ActorTemplate) { + at.Spec.Containers[0].Readyz = &ContainerReadyz{ + HTTPGet: &HTTPGetAction{Path: "/ready z", Port: 80}, + } + }, + wantErr: true, + errMsg: "should match", }, { name: "valid SandboxClass microvm", mutate: func(at *ActorTemplate) { diff --git a/pkg/api/v1alpha1/zz_generated.deepcopy.go b/pkg/api/v1alpha1/zz_generated.deepcopy.go index dec84c92d..687008f6c 100644 --- a/pkg/api/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/api/v1alpha1/zz_generated.deepcopy.go @@ -164,6 +164,11 @@ func (in *Container) DeepCopyInto(out *Container) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.Readyz != nil { + in, out := &in.Readyz, &out.Readyz + *out = new(ContainerReadyz) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Container. @@ -176,6 +181,26 @@ func (in *Container) DeepCopy() *Container { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ContainerReadyz) DeepCopyInto(out *ContainerReadyz) { + *out = *in + if in.HTTPGet != nil { + in, out := &in.HTTPGet, &out.HTTPGet + *out = new(HTTPGetAction) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ContainerReadyz. +func (in *ContainerReadyz) DeepCopy() *ContainerReadyz { + if in == nil { + return nil + } + out := new(ContainerReadyz) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *EnvVar) DeepCopyInto(out *EnvVar) { *out = *in @@ -221,6 +246,21 @@ func (in *EnvVarSource) DeepCopy() *EnvVarSource { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *HTTPGetAction) DeepCopyInto(out *HTTPGetAction) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HTTPGetAction. +func (in *HTTPGetAction) DeepCopy() *HTTPGetAction { + if in == nil { + return nil + } + out := new(HTTPGetAction) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SandboxConfig) DeepCopyInto(out *SandboxConfig) { *out = *in From 8c4e2b31de3467f6870b83e431d54924d5858744 Mon Sep 17 00:00:00 2001 From: dberkov Date: Fri, 26 Jun 2026 16:34:32 -0700 Subject: [PATCH 2/2] Address review feedback on readyz MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Tighten CRD path pattern to require well-formed percent-escapes (%XX with exactly two hex digits) so malformed paths like /foo% or /bar%zz are rejected at admission rather than silently breaking http.NewRequestWithContext after a 30s timeout. - Default the path field at admission via +kubebuilder:default="/readyz" so the API server populates it when callers omit it; the runtime default in readyz.URL() remains as defense-in-depth for proto callers that bypass the CRD. - Capture last error from each probe attempt and surface it in the deadline/cancellation messages so persistent failures are visible instead of an opaque timeout. - Treat non-200 responses as the last error too (HTTP ) so a server stuck at 5xx surfaces in the timeout message. - Bump readyz PollInterval from 500µs to 1ms to cut probe syscalls ~2x per cold-start while still meeting the single-millisecond detection goal documented on the package. - Update package doc: ateom-microvm is wired in this PR alongside ateom-gvisor. - Drop "exported as vars" wording from the const-block comment. - Add CRD validation tests for /foo% and /bar%zz rejection, and a round-trip test for the /readyz default. --- internal/readyz/readyz.go | 40 ++++++++------ .../generated/ate.dev_actortemplates.yaml | 10 ++-- pkg/api/v1alpha1/actortemplate_types.go | 10 ++-- .../v1alpha1/actortemplate_validation_test.go | 53 +++++++++++++++++++ 4 files changed, 88 insertions(+), 25 deletions(-) diff --git a/internal/readyz/readyz.go b/internal/readyz/readyz.go index 12640c0c1..1700f3bd8 100644 --- a/internal/readyz/readyz.go +++ b/internal/readyz/readyz.go @@ -13,13 +13,12 @@ // limitations under the License. // Package readyz polls a container's HTTP readiness endpoint from inside an -// ateom (currently ateom-gvisor; ateom-microvm will use the same code path -// once its actor networking lands). The intent is to detect the moment a -// container's HTTP server starts accepting connections with single-millisecond -// latency: while the server is still booting the kernel returns RST in -// microseconds, so a sub-millisecond poll loop spends almost no time blocked, -// and once the listen socket is up the next iteration completes the GET on -// veth-local latency. +// ateom. The intent is to detect the moment a container's HTTP server +// starts accepting connections with single-millisecond latency: while the +// server is still booting the kernel returns RST in microseconds, so a +// sub-millisecond poll loop spends almost no time blocked, and once the +// listen socket is up the next iteration completes the GET on veth-local +// latency. package readyz import ( @@ -36,13 +35,13 @@ import ( "golang.org/x/sync/errgroup" ) -// Tuning knobs. Exported as vars so callers (or tests) can shorten the -// overall timeout in pathological setups; the defaults are sized for actor -// cold-start where the server may take a few seconds to bind. +// Tuning knobs. Sized for actor cold-start where the HTTP server may take +// a few seconds to bind; HTTPClient below is a var so tests can substitute +// a transport that targets a test server's loopback address. const ( OverallTimeout = 30 * time.Second RequestTimeout = 250 * time.Millisecond - PollInterval = 500 * time.Microsecond + PollInterval = 1 * time.Millisecond DefaultPath = "/readyz" maxIdleConnsHost = 1 ) @@ -91,18 +90,22 @@ func Wait(ctx context.Context, containerName string, probe *ateompb.Readyz, acto start := time.Now() deadline := start.Add(OverallTimeout) attempts := 0 + var lastErr error for { if err := ctx.Err(); err != nil { - return fmt.Errorf("readyz cancelled for %q after %s (%d attempts): %w", - containerName, time.Since(start), attempts, err) + return fmt.Errorf("readyz cancelled for %q after %s (%d attempts, last error: %v): %w", + containerName, time.Since(start), attempts, lastErr, err) } if time.Now().After(deadline) { - return fmt.Errorf("readyz for %q never returned 200 within %s (%d attempts)", - containerName, OverallTimeout, attempts) + return fmt.Errorf("readyz for %q never returned 200 within %s (%d attempts, last error: %v)", + containerName, OverallTimeout, attempts, lastErr) } attempts++ - ok, _ := tryOnce(ctx, client, url) + ok, err := tryOnce(ctx, client, url) + if err != nil { + lastErr = err + } if ok { slog.InfoContext(ctx, "Readyz reached 200", slog.String("container", containerName), @@ -135,7 +138,10 @@ func tryOnce(ctx context.Context, client *http.Client, url string) (bool, error) // Drain so the connection can be reused by the keep-alive pool. _, _ = io.Copy(io.Discard, resp.Body) resp.Body.Close() - return resp.StatusCode == http.StatusOK, nil + if resp.StatusCode != http.StatusOK { + return false, fmt.Errorf("HTTP %d", resp.StatusCode) + } + return true, nil } // URL builds the probe endpoint URL. Exported so callers and tests can diff --git a/manifests/ate-install/generated/ate.dev_actortemplates.yaml b/manifests/ate-install/generated/ate.dev_actortemplates.yaml index 53207c27c..aec84f9a9 100644 --- a/manifests/ate-install/generated/ate.dev_actortemplates.yaml +++ b/manifests/ate-install/generated/ate.dev_actortemplates.yaml @@ -163,13 +163,15 @@ spec: against the container. properties: path: + default: /readyz description: |- - Path to access on the HTTP server. Defaults to "/readyz" when empty. - If set, it must be a valid URL path starting with "/". Only characters - permitted by RFC 3986 path segments are accepted; query strings ("?") + Path to access on the HTTP server. Defaults to "/readyz". + Must be a valid URL path starting with "/". Only characters permitted + by RFC 3986 path segments are accepted; percent-escapes must be a + literal "%" followed by exactly two hex digits. Query strings ("?") and fragments ("#") must be omitted. maxLength: 1024 - pattern: ^/[A-Za-z0-9\-._~!$&'()*+,;=:@/%]*$ + pattern: ^/([A-Za-z0-9\-._~!$&'()*+,;=:@/]|%[0-9A-Fa-f]{2})*$ type: string port: description: Port to access on the container. diff --git a/pkg/api/v1alpha1/actortemplate_types.go b/pkg/api/v1alpha1/actortemplate_types.go index e1c3e5fb3..12e8754c9 100644 --- a/pkg/api/v1alpha1/actortemplate_types.go +++ b/pkg/api/v1alpha1/actortemplate_types.go @@ -76,14 +76,16 @@ type ContainerReadyz struct { // HTTPGetAction describes an HTTP GET request to perform against the // container's interior IP. Modeled after a subset of corev1.HTTPGetAction. type HTTPGetAction struct { - // Path to access on the HTTP server. Defaults to "/readyz" when empty. - // If set, it must be a valid URL path starting with "/". Only characters - // permitted by RFC 3986 path segments are accepted; query strings ("?") + // Path to access on the HTTP server. Defaults to "/readyz". + // Must be a valid URL path starting with "/". Only characters permitted + // by RFC 3986 path segments are accepted; percent-escapes must be a + // literal "%" followed by exactly two hex digits. Query strings ("?") // and fragments ("#") must be omitted. // // +optional + // +kubebuilder:default="/readyz" // +kubebuilder:validation:MaxLength=1024 - // +kubebuilder:validation:Pattern=`^/[A-Za-z0-9\-._~!$&'()*+,;=:@/%]*$` + // +kubebuilder:validation:Pattern=`^/([A-Za-z0-9\-._~!$&'()*+,;=:@/]|%[0-9A-Fa-f]{2})*$` Path string `json:"path,omitempty"` // Port to access on the container. diff --git a/pkg/api/v1alpha1/actortemplate_validation_test.go b/pkg/api/v1alpha1/actortemplate_validation_test.go index 854662cff..1558c9ddc 100644 --- a/pkg/api/v1alpha1/actortemplate_validation_test.go +++ b/pkg/api/v1alpha1/actortemplate_validation_test.go @@ -449,6 +449,24 @@ func TestActorTemplateValidation(t *testing.T) { }, wantErr: true, errMsg: "should match", + }, { + name: "Readyz Path with bare percent", + mutate: func(at *ActorTemplate) { + at.Spec.Containers[0].Readyz = &ContainerReadyz{ + HTTPGet: &HTTPGetAction{Path: "/foo%", Port: 80}, + } + }, + wantErr: true, + errMsg: "should match", + }, { + name: "Readyz Path with malformed percent-escape", + mutate: func(at *ActorTemplate) { + at.Spec.Containers[0].Readyz = &ContainerReadyz{ + HTTPGet: &HTTPGetAction{Path: "/bar%zz", Port: 80}, + } + }, + wantErr: true, + errMsg: "should match", }, { name: "valid SandboxClass microvm", mutate: func(at *ActorTemplate) { @@ -486,6 +504,41 @@ func TestActorTemplateValidation(t *testing.T) { } } +func TestActorTemplateReadyzPathDefault(t *testing.T) { + ctx := t.Context() + + at := &ActorTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: "readyz-default", + Namespace: "default", + }, + Spec: ActorTemplateSpec{ + PauseImage: "gcr.io/gke-release/pause@sha256:bcbd57ba5653580ec647b16d8163cdd1112df3609129b01f912a8032e48265da", + Containers: []Container{{ + Name: "main", + Image: "busybox@sha256:326e0e090a9a4057e62a1b94236e7a2df2f2f76722f67232e0e47854e4df9c53", + Readyz: &ContainerReadyz{ + HTTPGet: &HTTPGetAction{Port: 8080}, + }, + }}, + SnapshotsConfig: SnapshotsConfig{Location: "gs://test-bucket/test-folder"}, + WorkerSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"pool": "test-pool"}}, + }, + } + if err := k8sClient.Create(ctx, at); err != nil { + t.Fatalf("create: %v", err) + } + defer func() { _ = k8sClient.Delete(ctx, at) }() + + got := &ActorTemplate{} + if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(at), got); err != nil { + t.Fatalf("get: %v", err) + } + if want, gotPath := "/readyz", got.Spec.Containers[0].Readyz.HTTPGet.Path; gotPath != want { + t.Errorf("Readyz.HTTPGet.Path = %q, want %q (CRD default)", gotPath, want) + } +} + func TestActorTemplateSpecImmutability(t *testing.T) { ctx := t.Context()