From fa991e279fee4fd209c8b455e84af372f261222d Mon Sep 17 00:00:00 2001 From: Benjamin Elder Date: Thu, 25 Jun 2026 17:39:33 -0700 Subject: [PATCH 1/5] ateom-microvm/ch: add virtio-fs and platform devices to the VM config The overlay rootfs serves each container's image read-only over virtio-fs, which needs a vhost-user fs device (a virtiofsd socket) and more than one PCI segment (the fs device sits on segment 1, kata's convention). Add FsConfig + PlatformConfig and the Fs/Platform fields to VmConfig; both are omitempty, so a config without them is serialized exactly as before. --- cmd/ateom-microvm/internal/ch/createvm.go | 47 +++++++++++++++-------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/cmd/ateom-microvm/internal/ch/createvm.go b/cmd/ateom-microvm/internal/ch/createvm.go index f24266653..51d0ea4a3 100644 --- a/cmd/ateom-microvm/internal/ch/createvm.go +++ b/cmd/ateom-microvm/internal/ch/createvm.go @@ -20,19 +20,35 @@ import ( ) // VmConfig is the body of /api/v1/vm.create — the subset of cloud-hypervisor's -// VmConfig ateom sets to boot a kata guest itself (the "ateom owns the boot" -// path, replacing the kata shim). Modeled on kata's clh driver -// (src/runtime/virtcontainers/clh.go) and the proven suspend-bench vmConfig. -// vm.create + vm.boot use PUT (empirically accepted by CH, like the bench). +// VmConfig ateom sets to boot the kata guest. Modeled on kata's clh driver +// (src/runtime/virtcontainers/clh.go). vm.create + vm.boot are issued with PUT. 
type VmConfig struct { - Cpus CpusConfig `json:"cpus"` - Memory MemoryConfig `json:"memory"` - Payload PayloadConfig `json:"payload"` - Disks []DiskConfig `json:"disks,omitempty"` - Rng *RngConfig `json:"rng,omitempty"` - Serial *ConsoleConfig `json:"serial,omitempty"` - Console *ConsoleConfig `json:"console,omitempty"` - Vsock *VsockConfig `json:"vsock,omitempty"` + Cpus CpusConfig `json:"cpus"` + Memory MemoryConfig `json:"memory"` + Payload PayloadConfig `json:"payload"` + Disks []DiskConfig `json:"disks,omitempty"` + Fs []FsConfig `json:"fs,omitempty"` + Rng *RngConfig `json:"rng,omitempty"` + Serial *ConsoleConfig `json:"serial,omitempty"` + Console *ConsoleConfig `json:"console,omitempty"` + Vsock *VsockConfig `json:"vsock,omitempty"` + Platform *PlatformConfig `json:"platform,omitempty"` +} + +// FsConfig is a virtio-fs device backed by a vhost-user (virtiofsd) socket. The +// overlay rootfs path uses it as the RO lower; the guest mounts it via the FsTag. +type FsConfig struct { + Tag string `json:"tag"` + Socket string `json:"socket"` + NumQueues int32 `json:"num_queues,omitempty"` + QueueSize int32 `json:"queue_size,omitempty"` + PciSegment int32 `json:"pci_segment,omitempty"` +} + +// PlatformConfig sets VM-wide platform options. NumPciSegments must be >1 when a +// virtio-fs device sits on a non-zero PCI segment (kata puts fs on segment 1). +type PlatformConfig struct { + NumPciSegments int32 `json:"num_pci_segments,omitempty"` } // CpusConfig sets the boot/max vCPU counts. @@ -56,10 +72,9 @@ type PayloadConfig struct { Cmdline string `json:"cmdline"` } -// DiskConfig is one virtio-blk disk. The kata guest image is disk 0 (/dev/vda, -// readonly); ateom appends the actor rootfs as disk 1 (/dev/vdb, writable). The -// guest sees disks in config order. NumQueues/QueueSize mirror kata's clh -// (num_queues = vcpus, queue_size = 1024). +// DiskConfig is one virtio-blk disk. 
The only disk is the kata guest image +// (/dev/vda, read-only); the actor rootfs is an overlay served over virtio-fs, not a +// disk. NumQueues/QueueSize mirror kata's clh (num_queues = vcpus, queue_size = 1024). type DiskConfig struct { Path string `json:"path"` Readonly bool `json:"readonly"` From 3f9e8cb301255b762a5e33e86fc12ea7b86be909 Mon Sep 17 00:00:00 2001 From: Benjamin Elder Date: Thu, 25 Jun 2026 17:57:20 -0700 Subject: [PATCH 2/5] ateom-microvm/kata: overlay rootfs helpers (virtio-fs RO lower + tmpfs upper) Helpers to assemble a container's rootfs as an overlay: its OCI image served read-only over virtio-fs (the lower) plus a guest tmpfs (the writable upper). - StartVirtiofsd: run virtiofsd in find-paths migration mode (so the fs device survives CH snapshot/restore), serving the per-sandbox shared dir. - ReconstructSharedDirFromImage: bind-mount a container's image into <cid>/rootfs under the shared dir (no host-side copy; virtiofsd serves it to the guest on demand), ensure the standard OCI mountpoints exist, and remount it read-only so the lower is immutable and byte-identical on every node (find-paths re-opens its inodes by path on restore). - CreateSandboxForActor: create the sandbox with the kataShared virtio-fs mount. - CreateCarrier: a created-but-unstarted container that binds the base to a stable per-container path the overlay uses as its lowerdir. - StartOverlayWorkload: create + start the container with an overlayfs rootfs whose upper/work live on a guest tmpfs. 
--- .../internal/kata/agentclient.go | 20 +- cmd/ateom-microvm/internal/kata/kata.go | 15 +- .../internal/kata/overlay_linux.go | 247 ++++++++++++++++++ 3 files changed, 264 insertions(+), 18 deletions(-) create mode 100644 cmd/ateom-microvm/internal/kata/overlay_linux.go diff --git a/cmd/ateom-microvm/internal/kata/agentclient.go b/cmd/ateom-microvm/internal/kata/agentclient.go index 42537f8f9..f7a242694 100644 --- a/cmd/ateom-microvm/internal/kata/agentclient.go +++ b/cmd/ateom-microvm/internal/kata/agentclient.go @@ -158,9 +158,9 @@ func (a *AgentClient) StartContainer(ctx context.Context, containerID string) er // CreateSandbox establishes the agent's sandbox context (sandbox id, hostname, // sandbox pidns) before any container is created. The kata shim normally issues -// this once at VM boot; on the ateom-owned-boot path (no shim) ateom must call it -// itself so the agent has a sandbox to attach containers to. Storages is empty — -// the actor rootfs arrives as a per-container "blk" storage, not a sandbox mount. +// this once at VM boot; ateom (no shim) must call it itself so the agent has a +// sandbox to attach containers to. Storages carries the shared virtio-fs mount +// (the overlay lowers); each container's rootfs is assembled per-container. // Mirrors grpc.AgentService/CreateSandbox (returns google.protobuf.Empty). func (a *AgentClient) CreateSandbox(ctx context.Context, req *agentpb.CreateSandboxRequest) error { if err := a.client.Call(ctx, "grpc.AgentService", "CreateSandbox", req, &emptypb.Empty{}); err != nil { @@ -169,10 +169,10 @@ func (a *AgentClient) CreateSandbox(ctx context.Context, req *agentpb.CreateSand return nil } -// UpdateInterface configures a guest network interface (the kata shim's job; on -// the owned-boot path ateom does it). The agent matches the link by HwAddr, then -// applies the name/IP/MTU. Mirrors grpc.AgentService/UpdateInterface (returns the -// resulting Interface). 
+// UpdateInterface configures a guest network interface (the kata shim's job, which +// ateom does itself). The agent matches the link by HwAddr, then applies the +// name/IP/MTU. Mirrors grpc.AgentService/UpdateInterface (returns the resulting +// Interface). func (a *AgentClient) UpdateInterface(ctx context.Context, iface *agentpb.Interface) error { req := &agentpb.UpdateInterfaceRequest{Interface: iface} if err := a.client.Call(ctx, "grpc.AgentService", "UpdateInterface", req, &agentpb.Interface{}); err != nil { @@ -208,7 +208,7 @@ func (a *AgentClient) AddARPNeighbors(ctx context.Context, neighbors []*agentpb. // buffered (up to max), so callers loop until it returns an error — the agent // returns an error/EOF-like status once the stream ends (container exit / connection // close). Mirrors grpc.AgentService/ReadStdout. The kata-agent keys the stream by -// ExecId, which the owned-boot path sets equal to ContainerId (see StartBlkWorkload). +// ExecId, which ateom sets equal to ContainerId. func (a *AgentClient) ReadStdout(ctx context.Context, containerID, execID string, max uint32) ([]byte, error) { resp := &agentpb.ReadStreamResponse{} req := &agentpb.ReadStreamRequest{ContainerId: containerID, ExecId: execID, Len: max} @@ -246,8 +246,8 @@ type StreamReader struct { } // NewStdioReader returns an io.Reader over the container's stdout (stderr=false) -// or stderr (stderr=true). execID matches the value passed to StartBlkWorkload -// (equal to containerID on the owned-boot path). +// or stderr (stderr=true). execID equals containerID (ateom sets ExecId == +// ContainerId when it creates the container). 
func NewStdioReader(ctx context.Context, ac *AgentClient, containerID, execID string, stderr bool) *StreamReader { return &StreamReader{ctx: ctx, ac: ac, containerID: containerID, execID: execID, stderr: stderr} } diff --git a/cmd/ateom-microvm/internal/kata/kata.go b/cmd/ateom-microvm/internal/kata/kata.go index f6008db23..0b09241ec 100644 --- a/cmd/ateom-microvm/internal/kata/kata.go +++ b/cmd/ateom-microvm/internal/kata/kata.go @@ -13,15 +13,14 @@ // limitations under the License. // Package kata holds the helpers ateom uses to boot and drive a kata guest in a -// cloud-hypervisor micro-VM WITHOUT the kata shim: ateom boots cloud-hypervisor -// itself (see internal/ch), then drives the stock kata-agent over its -// hybrid-vsock ttrpc API (DialAgent / AgentClient) to create the sandbox and -// start the actor's container on a writable virtio-blk rootfs (StartBlkWorkload). +// cloud-hypervisor micro-VM without the kata shim: ateom boots cloud-hypervisor +// itself (see internal/ch), then drives the stock kata-agent over its hybrid-vsock +// ttrpc API (DialAgent / AgentClient) to create the sandbox and assemble each +// container's overlay rootfs (overlay_linux.go). // -// It also renders the kata configuration.toml (for the agent kernel_params + -// guest sizing) from runtime-fetched assets (config.go), builds the actor's ext4 -// rootfs disk (BuildExt4Image), and sweeps leftover per-sandbox host-side state -// (CleanupSandboxState). +// It also renders the kata configuration.toml (for the agent kernel_params + guest +// sizing) from runtime-fetched assets (config.go) and sweeps leftover per-sandbox +// host-side state (CleanupSandboxState). 
package kata import ( diff --git a/cmd/ateom-microvm/internal/kata/overlay_linux.go b/cmd/ateom-microvm/internal/kata/overlay_linux.go new file mode 100644 index 000000000..62007e3d4 --- /dev/null +++ b/cmd/ateom-microvm/internal/kata/overlay_linux.go @@ -0,0 +1,247 @@ +//go:build linux + +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kata + +// Each container's rootfs is an overlay: its OCI image served read-only over virtio-fs +// (the lower) plus a guest tmpfs (the writable upper). The upper is in guest RAM, so +// rootfs writes ride along in the memory snapshot and persist across suspend/resume. +// This file holds the overlay-specific helpers. + +import ( + "context" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "github.com/agent-substrate/substrate/cmd/ateom-microvm/internal/third_party/kata/agentpb" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +const ( + // FsTag is the virtio-fs tag kata uses for the shared filesystem. The CH fs + // device Tag and the agent mount Source must both be this value. + FsTag = "kataShared" + // typeVirtioFS / virtioFSDriver are the agent fstype + driver for it. + typeVirtioFS = "virtiofs" + virtioFSDriver = "virtio-fs" + // guestSharedDir is where the agent mounts the kataShared tag in the guest; + // per-container rootfs then lives at <guestSharedDir>/<cid>/rootfs. 
+ guestSharedDir = "/run/kata-containers/shared/containers/" +) + +// SharedDir is the host directory virtiofsd serves into the guest as the RO base. +// Its layout (<cid>/rootfs) is what find-paths re-opens by path on restore. +func SharedDir(id string) string { + return filepath.Join("/run/kata-containers/shared/sandboxes", id, "shared") +} + +// VirtiofsdSocketPath is the vhost-user-fs socket CH connects to for the fs device. +func VirtiofsdSocketPath(id string) string { return filepath.Join(VMDir(id), "virtiofsd.sock") } + +// OverlayUpperBase is the in-guest mount point for one container's overlay upper/work. +// It lives under /run (tmpfs) so the upper's writes are in guest RAM and ride along in +// the memory-only snapshot (rootfs writes persist). Keyed on the container id, which is +// stable across the actor's restore lineage. +func OverlayUpperBase(containerID string) string { return "/run/ateom-upper/" + containerID } + +// GuestSharedRootfs is the in-guest path the kataShared mount exposes a container's +// rootfs at. A carrier container with this as Root.Path makes the agent bind it to +// /run/kata-containers/<cid>/rootfs — a stable per-container path the overlay then +// uses as its lowerdir. +func GuestSharedRootfs(containerID string) string { return guestSharedDir + containerID + "/rootfs" } + +// VirtiofsdOptions configures StartVirtiofsd. +type VirtiofsdOptions struct { + Binary string // virtiofsd executable; defaults to "virtiofsd" + SocketPath string // vhost-user socket CH connects to (VirtiofsdSocketPath) + SharedDir string // directory to serve (SharedDir(id)) + Log io.Writer +} + +// StartVirtiofsd launches virtiofsd in find-paths migration mode serving o.SharedDir +// on o.SocketPath, and waits for the socket to appear. The returned cmd outlives the +// caller's ctx (CH demand-pages from it under the running VM); the caller owns it. 
+func StartVirtiofsd(ctx context.Context, o VirtiofsdOptions) (*exec.Cmd, error) { + bin := o.Binary + if bin == "" { + bin = "virtiofsd" + } + _ = os.Remove(o.SocketPath) + cmd := exec.Command(bin, + "--socket-path="+o.SocketPath, + "--shared-dir="+o.SharedDir, + "--cache=auto", + "--thread-pool-size=1", + "--announce-submounts", + "--migration-mode", "find-paths", + ) + cmd.Stdout = o.Log + cmd.Stderr = o.Log + if err := cmd.Start(); err != nil { + return nil, fmt.Errorf("starting virtiofsd: %w", err) + } + deadline := time.Now().Add(10 * time.Second) + for time.Now().Before(deadline) { + if _, err := os.Stat(o.SocketPath); err == nil { + return cmd, nil + } + select { + case <-ctx.Done(): + _ = cmd.Process.Kill() + return nil, ctx.Err() + case <-time.After(50 * time.Millisecond): + } + } + _ = cmd.Process.Kill() + return nil, fmt.Errorf("virtiofsd socket %q did not appear", o.SocketPath) +} + +// ReconstructSharedDirFromImage bind-mounts a container's OCI image rootfs at +// <cid>/rootfs under SharedDir(restoreID) so virtiofsd serves it as the read-only lower. +// The bind copies nothing on the host (virtiofsd serves files to the guest on demand). +// The path is identical on every node — find-paths migration re-opens the lower by path +// — given a deterministic image unpack. cid is stable across the actor's lineage. +func ReconstructSharedDirFromImage(ctx context.Context, bundleRootfs, restoreID, cid string) error { + if cid == "" { + return fmt.Errorf("ReconstructSharedDirFromImage: empty container id") + } + dst := filepath.Join(SharedDir(restoreID), cid, "rootfs") + // Drop any stale bind first (lazy if busy), then ensure a clean mountpoint. Not + // RemoveAll: that would chase a live bind into bundleRootfs. 
+ if err := exec.Command("umount", dst).Run(); err != nil { + _ = exec.Command("umount", "-l", dst).Run() + } + if err := os.MkdirAll(dst, 0o755); err != nil { + return fmt.Errorf("creating shared dir %q: %w", dst, err) + } + cmd := exec.CommandContext(ctx, "mount", "--bind", bundleRootfs, dst) + var stderr strings.Builder + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("bind-mounting image rootfs %q -> %q: %w (%s)", bundleRootfs, dst, err, strings.TrimSpace(stderr.String())) + } + // Ensure the standard OCI mountpoints exist even for minimal images: the container + // mounts /proc,/sys,/dev over them, and find-paths re-opens the lower by path on + // restore, so the layout must match on every node. (Bind still writable; ignore EEXIST.) + for _, d := range []string{"proc", "sys", "dev"} { + _ = os.MkdirAll(filepath.Join(dst, d), 0o755) + } + // Remount read-only: the lower is immutable, so all writes go to the tmpfs upper and + // it stays byte-identical across reconstructions (required by find-paths migration). + ro := exec.CommandContext(ctx, "mount", "-o", "remount,bind,ro", dst) + var roErr strings.Builder + ro.Stderr = &roErr + if err := ro.Run(); err != nil { + return fmt.Errorf("remounting overlay lower read-only %q: %w (%s)", dst, err, strings.TrimSpace(roErr.String())) + } + return nil +} + +// CreateSandboxForActor creates the guest sandbox with the kataShared virtio-fs mount +// (the RO base backing every container's rootfs). Mirrors kata startSandbox. 
+func (a *AgentClient) CreateSandboxForActor(ctx context.Context, sandboxID, hostname string) error { + return a.CreateSandbox(ctx, &agentpb.CreateSandboxRequest{ + Hostname: hostname, + SandboxId: sandboxID, + Storages: []*agentpb.Storage{{ + Driver: virtioFSDriver, + Source: FsTag, + Fstype: typeVirtioFS, + MountPoint: guestSharedDir, + }}, + }) +} + +// CreateCarrier creates a "carrier" container (id == cid): rootfs = the kataShared +// virtio-fs base for that container, created but NOT started. This makes the agent's +// setup_bundle bind the base to /run/kata-containers/<cid>/rootfs — the stable path the +// overlay uses as its lowerdir (a bare virtio-fs submount is not reliably visible there). +func (a *AgentClient) CreateCarrier(ctx context.Context, cid string, spec *specs.Spec) error { + pbSpec := SpecToAgentPB(spec) + // Readonly: the carrier only exists to materialize the base bind; its rootfs (the + // overlay lower) must stay immutable. Overlay writes go to the tmpfs upper. + pbSpec.Root = &agentpb.Root{Path: GuestSharedRootfs(cid), Readonly: true} + if pbSpec.Linux != nil { + pbSpec.Linux.CgroupsPath = "/ateomchv/" + cid + "-carrier" + } + if err := a.CreateContainer(ctx, &agentpb.CreateContainerRequest{ + ContainerId: cid, + ExecId: cid, + OCI: pbSpec, + }); err != nil { + return fmt.Errorf("creating carrier container %q: %w", cid, err) + } + return nil +} + +// StartOverlayWorkload creates + starts one container with an overlayfs rootfs: +// lower = the carrier's resolved bind (/run/kata-containers/<cid>/rootfs from the RO +// virtio-fs base), upper/work = <upperBase>/{fs,work} on a guest tmpfs so rootfs writes +// land in guest RAM (captured by the memory-only snapshot → persist). The agent creates +// the upper/work dirs (create_directory) before mounting the overlay. 
+func (a *AgentClient) StartOverlayWorkload(ctx context.Context, cid, workloadID, upperBase string, spec *specs.Spec) error { + const createDir = "io.katacontainers.volume.overlayfs.create_directory" + sharedBase := "/run/kata-containers/" + cid + "/rootfs" + base := "/run/kata-containers/" + workloadID + lower := base + "/lower" + ovlRoot := base + "/rootfs" + upper := upperBase + "/fs" + work := upperBase + "/work" + + storages := []*agentpb.Storage{ + { + Driver: virtioFSDriver, + Source: sharedBase, + MountPoint: lower, + Fstype: "bind", + Options: []string{"bind"}, + }, + { + Driver: "overlayfs", + Source: "overlay", + Fstype: "overlay", + MountPoint: ovlRoot, + DriverOptions: []string{createDir + "=" + upper, createDir + "=" + work}, + Options: []string{"lowerdir=" + lower, "upperdir=" + upper, "workdir=" + work}, + }, + } + pbSpec := SpecToAgentPB(spec) + pbSpec.Root = &agentpb.Root{Path: ovlRoot, Readonly: false} + // Per-workload cgroup: the shaped spec carries the actor-wide /ateomchv/ + // (spec.go), which collides across an actor's containers — mirror the carrier's + // per-id path so each workload gets its own cgroup. 
+ if pbSpec.Linux != nil { + pbSpec.Linux.CgroupsPath = "/ateomchv/" + workloadID + } + + if err := a.CreateContainer(ctx, &agentpb.CreateContainerRequest{ + ContainerId: workloadID, + ExecId: workloadID, + Storages: storages, + OCI: pbSpec, + }); err != nil { + return fmt.Errorf("creating overlay workload %q: %w", workloadID, err) + } + if err := a.StartContainer(ctx, workloadID); err != nil { + return fmt.Errorf("starting overlay workload %q: %w", workloadID, err) + } + return nil +} From 1546c8ecf6ddca665c0b2ea475e163a69694c3e0 Mon Sep 17 00:00:00 2001 From: Benjamin Elder Date: Thu, 25 Jun 2026 17:57:20 -0700 Subject: [PATCH 3/5] ateom-microvm: run multi-container actors on an overlay rootfs Run all of an actor's containers in the one micro-VM (the pod sandbox), each with its own overlay rootfs (virtio-fs RO lower + guest-tmpfs upper) rather than a per-container disk. Because the writable upper is a guest tmpfs, rootfs writes are part of the CH memory snapshot and persist across suspend/resume alongside process memory. - RunWorkload: bind each container's image into the shared dir and start one virtiofsd; create the sandbox, then a carrier + overlay workload per container. - CheckpointWorkload: pause + snapshot memory; the tmpfs upper rides along, so there is no per-container disk to ship. - RestoreWorkload: reconstruct each read-only lower from the local OCI bundle, start virtiofsd, repoint the snapshot config's per-VMDir paths (vsock, serial, fs socket), and OnDemand-restore + resume. This replaces the per-container disk path: remove the disk builder (BuildExt4Image), the blk workload (StartBlkWorkload), and the now-obsolete blk integration test. 
--- cmd/ateom-microvm/checkpoint.go | 51 +- cmd/ateom-microvm/internal/kata/disk.go | 140 ------ cmd/ateom-microvm/internal/kata/disk_test.go | 76 --- cmd/ateom-microvm/internal/kata/specconv.go | 56 +-- cmd/ateom-microvm/restore.go | 168 +++---- cmd/ateom-microvm/run.go | 347 ++++++++----- cmd/ateom-microvm/service_integration_test.go | 473 ------------------ 7 files changed, 307 insertions(+), 1004 deletions(-) delete mode 100644 cmd/ateom-microvm/internal/kata/disk.go delete mode 100644 cmd/ateom-microvm/internal/kata/disk_test.go delete mode 100644 cmd/ateom-microvm/service_integration_test.go diff --git a/cmd/ateom-microvm/checkpoint.go b/cmd/ateom-microvm/checkpoint.go index 7461b73d3..511cd0332 100644 --- a/cmd/ateom-microvm/checkpoint.go +++ b/cmd/ateom-microvm/checkpoint.go @@ -32,15 +32,15 @@ import ( // CheckpointWorkload suspends the actor and writes a portable CH snapshot. // -// Contract with atelet (mirrors ateom-gvisor): after we return, atelet uploads -// the checkpoint dir to object storage, then tears down bundles and resets the -// actor dir. +// Contract with atelet: after we return, atelet uploads the checkpoint dir to object +// storage, then tears down bundles and resets the actor dir. // -// ateom drives the ateom-owned CH's REST api-socket: pause -> snapshot -// file:// (config.json + state.json + sparse memory-ranges) -> -// tear the VMM down. The actor's rootfs lives on the host-backed /dev/vdb, not a -// guest tmpfs overlay-upper, so the snapshot is naturally memory-only and small — -// no RAM-backed upper to wipe and no balloon to inflate before snapshot. +// ateom drives the CH REST api-socket: pause -> snapshot file:// +// (config.json + state.json + sparse memory-ranges) -> tear the VMM down. Each +// container's rootfs is overlay(virtio-fs RO lower + guest-tmpfs upper), so the +// writable upper lives in guest RAM and is captured by the memory snapshot — process +// memory and rootfs writes both persist across suspend/resume. 
The RO lower is +// reconstructed from the OCI image at restore, so nothing rootfs-related ships here. func (s *AteomService) CheckpointWorkload(ctx context.Context, req *ateompb.CheckpointWorkloadRequest) (*ateompb.CheckpointWorkloadResponse, error) { s.lock.Lock() defer s.lock.Unlock() @@ -79,9 +79,9 @@ func (s *AteomService) CheckpointWorkload(ctx context.Context, req *ateompb.Chec } // Record the FROZEN base id (the id the guest's virtio-fs find-paths are pinned - // to, /rootfs). For a cold (owned-boot) actor this is its own id; for a - // restored actor it is the golden id propagated via ra.baseID (set from the - // snapshot we restored from). RestoreWorkload reads this to lay the + // to, /rootfs). For a cold-run actor this is its own id; for a restored + // actor it is the golden id propagated via ra.baseID (set from the snapshot we + // restored from). RestoreWorkload reads this to lay the // reconstructed-from-image base at the path the guest expects. We can NOT derive // it from config.json (its socket paths get rewritten to the current id on every // restore, losing the invariant golden id). @@ -120,25 +120,13 @@ func (s *AteomService) CheckpointWorkload(ctx context.Context, req *ateompb.Chec slog.String("id", id), slog.Duration("merge", time.Since(tMerge))) } - // reset-to-golden support: save the actor's /dev/vdb AS-OF this (paused, - // consistent) snapshot as a verbatim golden template, so future restores can - // recreate the disk byte-identical to what the snapshot's guest RAM expects - // while discarding the actor's later rootfs writes. Saved once (the first/golden - // checkpoint) and kept; best-effort (without it, restore reopens the live disk = - // continuity). TODO: ship the template with the snapshot for cross-node restore - // (it's golden, shipped once per template, like the OCI base). 
- actorDir := ateompath.ActorPath(ns, name, id) - if tmpl := filepath.Join(actorDir, goldenRootfsDiskName); fileMissing(tmpl) { - if cerr := copyDiskFile(ctx, filepath.Join(actorDir, actorRootfsDiskName), tmpl); cerr != nil { - slog.WarnContext(ctx, "Failed to save golden rootfs template; restore will reopen live disk", slog.Any("err", cerr)) - } else { - slog.InfoContext(ctx, "Saved golden rootfs disk template", slog.String("id", id)) - } - } + // Nothing rootfs-related ships: the overlay's writable upper is a guest tmpfs, so + // the actor's rootfs writes are already in the memory snapshot above, and the RO + // lower is reconstructed from the OCI image at restore (it never changes). // Report exactly the files we wrote so atelet ships precisely the CH snapshot - // (config.json + state.json + memory-ranges + base-id), not gVisor's fixed set. - // Memory-only: the RO base is reconstructed from the OCI image at restore. + // (config.json + state.json + memory-ranges + base-id). The RO base is + // reconstructed from the OCI image at restore. snapshotFiles, err := listFiles(checkpointDir) if err != nil { return nil, fmt.Errorf("while listing snapshot files: %w", err) @@ -151,7 +139,7 @@ func (s *AteomService) CheckpointWorkload(ctx context.Context, req *ateompb.Chec dTeardown := time.Since(tTeardown) delete(s.running, id) - // Tear down the per-activation actor network (mirrors gVisor). + // Tear down the per-activation actor network. if err := s.cleanupActorNetwork(ctx); err != nil { slog.WarnContext(ctx, "Failed to clean up actor network after checkpoint", slog.Any("err", err)) } @@ -207,6 +195,11 @@ func (s *AteomService) teardownActor(ctx context.Context, id string, ra *running _ = ra.chCmd.Process.Kill() _, _ = ra.chCmd.Process.Wait() } + // Kill the virtiofsd serving the overlay RO lower (after CH, its only client). 
+ if ra.vfsdCmd != nil && ra.vfsdCmd.Process != nil { + _ = ra.vfsdCmd.Process.Kill() + _, _ = ra.vfsdCmd.Process.Wait() + } } // Sweep any leftover per-sandbox host-side state + orphaned per-sandbox diff --git a/cmd/ateom-microvm/internal/kata/disk.go b/cmd/ateom-microvm/internal/kata/disk.go deleted file mode 100644 index ae03353f7..000000000 --- a/cmd/ateom-microvm/internal/kata/disk.go +++ /dev/null @@ -1,140 +0,0 @@ -//go:build linux - -// Copyright 2026 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package kata - -import ( - "context" - "fmt" - "io/fs" - "os" - "os/exec" - "path/filepath" - "strconv" -) - -// rootfsDiskScratchBytes is the free-space headroom added on top of a bundle's -// contents when sizing its writable rootfs disk: room for the actor to write -// during a single activation. It stays sparse (unused space is holes), so it -// costs nothing in the image file or the memory-only snapshot. -const rootfsDiskScratchBytes = 512 << 20 - -// rootfsDiskGeometry walks srcDir and returns the ext4 image size (MiB) and the -// inode count to build a writable rootfs disk holding that tree plus headroom for -// ext4 metadata and the actor's in-activation scratch writes. 
Both are -// DETERMINISTIC functions of the tree's apparent contents (summed regular-file -// sizes and entry count, NOT host block allocation), so the cold-boot build and -// the restore-time rebuild from the same OCI image produce an identically-sized -// disk — required because the guest resumes with the ext4 superblock cached in RAM. -func rootfsDiskGeometry(srcDir string) (sizeMiB int, inodes int64, err error) { - var contentBytes, entries int64 - if werr := filepath.WalkDir(srcDir, func(_ string, d fs.DirEntry, err error) error { - if err != nil { - return err - } - entries++ // every entry (file, dir, symlink, device) needs an inode - if d.Type().IsRegular() { - info, ierr := d.Info() - if ierr != nil { - return ierr - } - contentBytes += info.Size() - } - return nil - }); werr != nil { - return 0, 0, werr - } - - const ( - mib = 1 << 20 - inodeSizeBytes = 256 // ext4 default; over-estimates the table if it's 128 - ) - // One inode per entry plus 25% and a fixed reserve, so the actor can create new - // files during its activation without exhausting inodes (the default - // size-derived ratio can starve a file-heavy rootfs). - inodes = entries + entries/4 + 8192 - // Contents + the eagerly-written inode table + ~6% for bitmaps/directory/extent - // metadata + the scratch reserve. Unused space stays sparse (holes). - sizeBytes := contentBytes + inodes*inodeSizeBytes + contentBytes/16 + rootfsDiskScratchBytes - sizeMiB = int((sizeBytes + mib - 1) / mib) - return sizeMiB, inodes, nil -} - -// BuildExt4Image creates a raw ext4 disk image at outPath, sized dynamically from -// srcDir (see rootfsDiskGeometry), pre-populated with srcDir's contents in a single -// mkfs pass (`mkfs.ext4 -d ...`). 
This is how the ateom-owned-boot path -// turns the actor's OCI bundle rootfs into a writable virtio-blk disk (/dev/vdb): -// the guest mounts it as the container rootfs, so rootfs writes land on this -// host-backed file (off guest RAM) -> memory-only CH snapshot, no balloon. -// -// The size is a deterministic function of srcDir's contents, so the cold-boot -// build and the restore-time rebuild from the same OCI image agree (the guest -// resumes with the ext4 superblock cached in RAM, which must match the disk). -// -// Requires mkfs.ext4 (e2fsprogs) on PATH in the worker image. The image is -// recreated from scratch each call (reset-to-golden recreates it from the golden -// bundle), so any prior file at outPath is truncated. -// -// mkfs.ext4 -d copies srcDir's tree (perms, ownership, symlinks, xattrs) into the -// new filesystem without needing a loop mount or root's mount privileges — it -// writes the filesystem structures directly to the image file. -func BuildExt4Image(ctx context.Context, srcDir, outPath string) error { - if fi, err := os.Stat(srcDir); err != nil || !fi.IsDir() { - return fmt.Errorf("BuildExt4Image: source %q is not a directory: %v", srcDir, err) - } - sizeMiB, inodes, err := rootfsDiskGeometry(srcDir) - if err != nil { - return fmt.Errorf("BuildExt4Image: sizing from %q: %w", srcDir, err) - } - - // Truncate to size first so mkfs writes into a sparse file of the right size - // (mkfs.ext4 also accepts a size argument, but a pre-sized file is unambiguous - // and keeps the on-disk size predictable for the snapshot config). 
- if err := os.Remove(outPath); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("BuildExt4Image: removing stale image %q: %w", outPath, err) - } - f, err := os.OpenFile(outPath, os.O_CREATE|os.O_RDWR, 0o600) - if err != nil { - return fmt.Errorf("BuildExt4Image: creating image %q: %w", outPath, err) - } - if err := f.Truncate(int64(sizeMiB) * 1024 * 1024); err != nil { - f.Close() - return fmt.Errorf("BuildExt4Image: sizing image %q: %w", outPath, err) - } - f.Close() - - // -F: don't prompt (operating on a regular file, not a block device). - // -q: quiet. -d: populate from srcDir. -N: fix the inode count to the tree's - // entries + slack (the default size-derived ratio can starve a file-heavy - // rootfs of inodes). -E lazy_*=0: write tables eagerly so the image is fully - // materialized (deterministic on-disk bytes, important for the reset-to-golden - // "verbatim copy" approach). -O ^has_journal: a reset-each-restore rootfs gains - // nothing from a journal and it adds nondeterminism. - args := []string{ - "-F", "-q", - "-N", strconv.FormatInt(inodes, 10), - "-E", "lazy_itable_init=0,lazy_journal_init=0", - "-O", "^has_journal", - "-d", srcDir, - outPath, - strconv.Itoa(sizeMiB) + "M", - } - cmd := exec.CommandContext(ctx, "mkfs.ext4", args...) - if out, err := cmd.CombinedOutput(); err != nil { - return fmt.Errorf("BuildExt4Image: mkfs.ext4 %v: %w: %s", args, err, out) - } - return nil -} diff --git a/cmd/ateom-microvm/internal/kata/disk_test.go b/cmd/ateom-microvm/internal/kata/disk_test.go deleted file mode 100644 index f06fb483b..000000000 --- a/cmd/ateom-microvm/internal/kata/disk_test.go +++ /dev/null @@ -1,76 +0,0 @@ -//go:build linux - -// Copyright 2026 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package kata - -import ( - "os" - "path/filepath" - "testing" -) - -func TestRootfsDiskGeometry(t *testing.T) { - dir := t.TempDir() - if err := os.Mkdir(filepath.Join(dir, "sub"), 0o755); err != nil { - t.Fatal(err) - } - if err := os.WriteFile(filepath.Join(dir, "a"), make([]byte, 1<<20), 0o644); err != nil { - t.Fatal(err) - } - if err := os.WriteFile(filepath.Join(dir, "sub", "b"), make([]byte, 2<<20), 0o644); err != nil { - t.Fatal(err) - } - if err := os.Symlink("a", filepath.Join(dir, "link")); err != nil { - t.Fatal(err) - } - // Entries the walk should count: the root dir, sub, a, sub/b, link = 5. - const wantEntries = 5 - - size1, inodes1, err := rootfsDiskGeometry(dir) - if err != nil { - t.Fatalf("rootfsDiskGeometry: %v", err) - } - - // Determinism is required: the cold-boot build and the restore-time rebuild - // must produce an identically-sized disk for the same tree. - size2, inodes2, err := rootfsDiskGeometry(dir) - if err != nil { - t.Fatal(err) - } - if size1 != size2 || inodes1 != inodes2 { - t.Errorf("non-deterministic geometry: (%d MiB, %d inodes) vs (%d MiB, %d inodes)", size1, inodes1, size2, inodes2) - } - - // Size must cover the ~3 MiB of contents plus the scratch reserve. - if floorMiB := rootfsDiskScratchBytes/(1<<20) + 3; size1 < floorMiB { - t.Errorf("size %d MiB below expected floor %d MiB", size1, floorMiB) - } - - // Inodes must cover every entry plus the reserve (so a file-heavy rootfs can - // still create files), and never fewer than the entries present. 
- if inodes1 < wantEntries { - t.Errorf("inodes %d < %d entries", inodes1, wantEntries) - } - if inodes1 < 8192 { - t.Errorf("inodes %d missing the fixed reserve", inodes1) - } -} - -func TestRootfsDiskGeometryMissingDir(t *testing.T) { - if _, _, err := rootfsDiskGeometry(filepath.Join(t.TempDir(), "does-not-exist")); err == nil { - t.Fatal("rootfsDiskGeometry on a missing dir: want error, got nil") - } -} diff --git a/cmd/ateom-microvm/internal/kata/specconv.go b/cmd/ateom-microvm/internal/kata/specconv.go index 4ac75f992..cfcceb00e 100644 --- a/cmd/ateom-microvm/internal/kata/specconv.go +++ b/cmd/ateom-microvm/internal/kata/specconv.go @@ -17,63 +17,19 @@ package kata import ( - "context" - "fmt" - "github.com/agent-substrate/substrate/cmd/ateom-microvm/internal/third_party/kata/agentpb" specs "github.com/opencontainers/runtime-spec/specs-go" ) -// StartBlkWorkload starts the actor container with its rootfs backed by a single -// boot-time virtio-blk disk (devPath, e.g. "/dev/vdb") — the virtio-blk-rootfs -// path. There is NO overlay, NO virtio-fs, NO tmpfs upper: the agent direct-mounts -// devPath (ext4) as the container rootfs, so rootfs writes land on the host-backed -// disk file (off guest RAM) and the CH snapshot stays memory-only with no balloon. -// -// One "blk" storage: source is the /dev node (kata's block storage handler mounts -// it directly when source starts with /dev — no uevent/auto-enumeration wait, -// unlike a hotplugged disk), fstype ext4, mounted at the container rootfs path. -// The spec's Root.Path is set to that mount point, which the agent's setup_bundle -// then uses as the container root. 
-func (a *AgentClient) StartBlkWorkload(ctx context.Context, containerID, devPath string, spec *specs.Spec) error { - rootfs := "/run/kata-containers/" + containerID + "/rootfs" - storages := []*agentpb.Storage{ - { - Driver: "blk", - Source: devPath, - Fstype: "ext4", - MountPoint: rootfs, - Options: []string{"rw"}, - }, - } - - pbSpec := SpecToAgentPB(spec) - pbSpec.Root = &agentpb.Root{Path: rootfs, Readonly: false} - - if err := a.CreateContainer(ctx, &agentpb.CreateContainerRequest{ - ContainerId: containerID, - ExecId: containerID, - Storages: storages, - OCI: pbSpec, - }); err != nil { - return fmt.Errorf("creating blk workload %q: %w", containerID, err) - } - if err := a.StartContainer(ctx, containerID); err != nil { - return fmt.Errorf("starting blk workload %q: %w", containerID, err) - } - return nil -} - // SpecToAgentPB converts an OCI runtime spec into the kata-agent's protobuf Spec -// (agentpb.Spec) for a CreateContainer ttrpc call. The shim normally does this -// conversion; ateom does it itself when it drives the agent directly ("be your -// own hook scheduler"). A blind json round-trip does NOT work: agentpb's Spec -// JSON tags are PascalCase (from oci.proto), while OCI config.json is lowercase. +// (agentpb.Spec) for a CreateContainer ttrpc call. A blind json round-trip does NOT +// work: agentpb's Spec JSON tags are PascalCase (from oci.proto), while OCI +// config.json is lowercase. // // Only the fields the kata-agent needs to create + start a container are mapped -// (process, root, mounts, linux namespaces/resources/cgroup/masked+readonly -// paths). The container rootfs is provided out-of-band as storages; the caller -// is expected to set the returned spec's Root.Path to the overlay mount point. +// (process, root, mounts, linux namespaces/resources/cgroup/masked+readonly paths). +// The container rootfs is provided out-of-band as storages; the caller sets the +// returned spec's Root.Path to the overlay mount point. 
func SpecToAgentPB(s *specs.Spec) *agentpb.Spec { if s == nil { return nil diff --git a/cmd/ateom-microvm/restore.go b/cmd/ateom-microvm/restore.go index 1c7c82b84..015730944 100644 --- a/cmd/ateom-microvm/restore.go +++ b/cmd/ateom-microvm/restore.go @@ -37,16 +37,17 @@ import ( // RestoreWorkload restores the actor on a (possibly different) pod by relaunching // cloud-hypervisor directly from the downloaded snapshot and resuming. // -// Contract with atelet: the memory-only snapshot dir (config.json + state.json + -// memory-ranges + base-id) has been downloaded to RestoreStateDir. +// Contract with atelet: the snapshot dir (config.json + state.json + memory-ranges + +// base-id) has been downloaded to RestoreStateDir. // -// There is NO virtiofsd and NO shared-dir to reconstruct — the rootfs is the -// writable /dev/vdb disk, which CH reopens from the path recorded in the snapshot -// config.json. Steps: rewrite the vsock socket path to this actor's VMDir, -// reset /dev/vdb to the golden disk template (or rebuild it from the OCI image), -// rebuild the tap (the snapshot's virtio-net is fd-backed → fresh net_fds), -// relaunch CH with --restore, and resume. Guest RAM (incl. the actor's in-memory -// state and frozen network config) comes back from the memory-only snapshot. +// Each container's rootfs is overlay(virtio-fs RO lower + guest-tmpfs upper). Steps: +// reconstruct each RO lower from the local OCI bundle (atelet re-unpacked the golden +// image) at the frozen find-paths path and start the virtiofsd serving them; rewrite +// the snapshot config's per-VMDir paths (vsock + serial + fs socket) to this actor's; +// rebuild the tap (the snapshot's virtio-net is fd-backed → fresh net_fds); relaunch +// CH with --restore (OnDemand), and resume. Guest RAM — incl. the actor's in-memory +// state, the tmpfs rootfs upper (so rootfs writes PERSIST), and the frozen network +// config — comes back from the memory snapshot. 
func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.RestoreWorkloadRequest) (resp *ateompb.RestoreWorkloadResponse, retErr error) { s.lock.Lock() defer s.lock.Unlock() @@ -77,48 +78,35 @@ func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.Restore return nil, fmt.Errorf("while creating VM dir: %w", err) } - // Recreate the /dev/vdb backing file the snapshot references (the actor dir), - // reset-to-golden. Two ways, both byte-consistent with the golden snapshot's - // guest ext4 cache: - // - same-node: a verbatim golden template (copyDiskFile) — guaranteed identical. - // - cross-node: rebuild from the OCI image atelet unpacked to the bundle at - // restore (mkfs.ext4 -d is LAYOUT-deterministic for identical inputs, so the - // data blocks land at the same offsets the guest cache expects; only the - // superblock UUID/timestamps differ, which are cached in RAM and not re-read). - // Either way the actor's prior rootfs writes are discarded (gVisor semantics). + // Reconstruct each container's overlay RO lower from the LOCAL OCI bundle (atelet + // re-unpacked the golden image; the lower is the immutable golden image) at the + // frozen find-paths location SharedDir(id)/<name>/rootfs, and start the one virtiofsd + // serving them. The writable upper is a guest tmpfs restored from the memory + // snapshot (rootfs writes persist), so there is no disk to rebuild or repoint; the + // fs socket in the snapshot config is repointed to this VMDir by + // rewriteSnapshotSocketPaths above. Cross-node consistency relies on a deterministic + // unpack of the same image at the same <name>/rootfs path.
containers := req.GetSpec().GetContainers() - if len(containers) != 1 { - return nil, status.Errorf(codes.Unimplemented, "ateom-microvm supports exactly one container, got %d", len(containers)) + if len(containers) == 0 { + return nil, status.Error(codes.InvalidArgument, "actor spec has no containers") } - actorDir := ateompath.ActorPath(ns, name, id) - diskPath := filepath.Join(actorDir, actorRootfsDiskName) - if tmpl := filepath.Join(actorDir, goldenRootfsDiskName); !fileMissing(tmpl) { - if err := copyDiskFile(ctx, tmpl, diskPath); err != nil { - return nil, fmt.Errorf("while resetting rootfs disk to golden (template): %w", err) - } - slog.InfoContext(ctx, "Reset actor rootfs disk to golden (template)", slog.String("id", id)) - } else { - bundleRootfs := filepath.Join(ateompath.OCIBundlePath(ns, name, id, containers[0].GetName()), "rootfs") - // Cross-node restore rebuilds from the bundle (no local golden template), - // so re-inject DNS here too; the same-node golden-copy path above already - // carries it from the golden boot. - if err := writeGuestResolvConf(bundleRootfs); err != nil { - return nil, fmt.Errorf("while writing guest resolv.conf: %w", err) - } - if err := kata.BuildExt4Image(ctx, bundleRootfs, diskPath); err != nil { - return nil, fmt.Errorf("while reconstructing rootfs disk from image: %w", err) - } - slog.InfoContext(ctx, "Reconstructed actor rootfs disk from image", slog.String("id", id)) + if len(containers) > maxActorContainers { + return nil, status.Errorf(codes.Unimplemented, "ateom-microvm supports at most %d containers, got %d", maxActorContainers, len(containers)) } - - // Repoint the snapshot config's writable /dev/vdb disk at THIS actor's - // reconstructed backing file. 
The golden snapshot recorded the golden actor's - // per-actor disk path, which is stale on any pod restoring a different actor - // (and absent on any node that never ran the golden) — unlike /dev/vda, the - // content-addressed kata image whose path is identical on every node. - if err := repointActorRootfsDisk(restoreDir, diskPath); err != nil { - return nil, fmt.Errorf("while repointing actor rootfs disk in snapshot config: %w", err) + ctrs, err := s.buildActorContainers(ns, name, id, containers) + if err != nil { + return nil, err + } + vfsdCmd, err := s.stageOverlayLowers(ctx, rr, id, ctrs) + if err != nil { + return nil, err } + defer func() { + if retErr != nil && vfsdCmd.Process != nil { + _ = vfsdCmd.Process.Kill() + _, _ = vfsdCmd.Process.Wait() + } + }() // Networking: rebuild the per-activation veth + tap; the snapshot's virtio-net // is fd-backed, so CH needs fresh tap FDs (net_fds) on restore. @@ -157,7 +145,7 @@ func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.Restore } // Relaunch CH and restore with the tap FDs attached (SCM_RIGHTS). CH reopens - // /dev/vda (image) + /dev/vdb (actor rootfs) from the snapshot config paths. + // /dev/vda (image) + each virtio-fs socket (overlay RO lower) from the snapshot config paths.
apiSocket := filepath.Join(kata.VMDir(id), "clh-api-restore.sock") chCmd, client, err := ch.LaunchVMM(ctx, ch.LaunchVMMOptions{ Binary: rr.chBinary, APISocket: apiSocket, Stdout: slogWriter{ctx}, Stderr: slogWriter{ctx}, @@ -184,12 +172,13 @@ func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.Restore return nil, fmt.Errorf("while resuming restored guest: %w", err) } - ra := &runningActor{chCmd: chCmd, apiSocket: apiSocket, baseID: srcID, restoreSourceDir: restoreDir} + ra := &runningActor{chCmd: chCmd, vfsdCmd: vfsdCmd, apiSocket: apiSocket, baseID: srcID, restoreSourceDir: restoreDir} - // Re-attach stdout/stderr forwarding: the restored guest's container + kata-agent - // are alive, so a fresh dial over this actor's vsock resumes ReadStdout/ReadStderr - // (same containerID==execID==id as the cold run). Best-effort — a failed dial must - // not fail the restore (the actor is already running); forwarding is just skipped. + // Re-attach stdout/stderr forwarding for each container: the restored guest's + // containers + kata-agent are alive, so a fresh dial over this actor's vsock + // resumes ReadStdout/ReadStderr. The overlay workload's container/exec id is + // <name>_ovl (same as the cold run). Best-effort — a failed dial must not fail the + // restore (the actor is already running); forwarding is just skipped.
vsockPath := kata.VsockSocketPath(id) logAC, dialErr := dialAgentRetry(ctx, vsockPath, 15*time.Second) if dialErr != nil { @@ -197,22 +186,24 @@ func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.Restore slog.String("id", id), slog.Any("err", dialErr)) } else { ra.logAgent = logAC - s.startActorLogForwarding(logAC, id, name, ns, containers[0].GetName()) + for _, c := range containers { + s.startActorLogForwarding(logAC, id, overlayWorkloadID(c.GetName()), c.GetName(), name, ns) + } } s.running[id] = ra s.actorLogger.EmitLifecycleLog("Actor restored", id, name, ns) - slog.InfoContext(ctx, "Actor restored (owned-boot, virtio-blk rootfs)", + slog.InfoContext(ctx, "Actor restored (overlay rootfs)", slog.String("id", id), slog.Duration("total", time.Since(tStart))) return &ateompb.RestoreWorkloadResponse{}, nil } -// rewriteSnapshotSocketPaths repoints the snapshot config.json's per-sandbox -// hybrid-vsock socket from the source actor's VMDir to the restoring actor's -// VMDir, so the socket we create is the one CH reopens. The kernel and /dev/vda -// kata image are content-addressed static files with identical paths on every -// node, so they need no rewrite; the writable /dev/vdb actor rootfs disk is -// per-actor and is repointed separately (see repointActorRootfsDisk). +// rewriteSnapshotSocketPaths repoints the snapshot config.json's per-VMDir paths from +// the source actor's VMDir to the restoring actor's: the hybrid-vsock socket, the +// File serial console, and each virtio-fs (overlay RO lower) socket, so the sockets/ +// files we create are the ones CH reopens. The kernel and /dev/vda kata image are +// content-addressed static files with identical paths on every node, so they need no +// rewrite, and the overlay has no per-actor disk to repoint. 
func rewriteSnapshotSocketPaths(snapshotDir, id string) error { cfgPath := filepath.Join(snapshotDir, "config.json") b, err := os.ReadFile(cfgPath) @@ -226,8 +217,8 @@ func rewriteSnapshotSocketPaths(snapshotDir, id string) error { if vsock, ok := cfg["vsock"].(map[string]any); ok { vsock["socket"] = kata.VsockSocketPath(id) } - // The owned-boot path captures the guest serial console to a file under the - // source actor's VMDir (Serial{Mode:"File"}). On restore that path is stale + // ateom captures the guest serial console to a file under the source actor's + // VMDir (Serial{Mode:"File"}). On restore that path is stale // (points at the golden/source pod's VMDir), so CH's CreateConsoleDevice fails // (No such file or directory). Repoint it at this actor's VMDir. if serial, ok := cfg["serial"].(map[string]any); ok { @@ -235,6 +226,15 @@ func rewriteSnapshotSocketPaths(snapshotDir, id string) error { serial["file"] = filepath.Join(kata.VMDir(id), "serial.log") } } + // The overlay RO lower is served by a per-VMDir virtiofsd socket; the snapshot + // recorded the golden actor's, so repoint each fs device at this actor's VMDir. + if fss, ok := cfg["fs"].([]any); ok { + for _, f := range fss { + if fm, ok := f.(map[string]any); ok { + fm["socket"] = kata.VirtiofsdSocketPath(id) + } + } + } out, err := json.Marshal(cfg) if err != nil { return err @@ -244,45 +244,3 @@ func rewriteSnapshotSocketPaths(snapshotDir, id string) error { } return nil } - -// repointActorRootfsDisk rewrites the snapshot config.json so the writable -// /dev/vdb actor rootfs disk points at this actor's reconstructed backing file -// (diskPath). The actor rootfs disk lives under the actor's per-actor directory -// (keyed by actor id), so the golden snapshot's recorded path is the GOLDEN -// actor's — stale on any pod restoring a different actor, and absent on any node -// that never ran the golden. (This is the disk analogue of the serial.file -// repoint in rewriteSnapshotSocketPaths.) 
The disk is identified by basename so -// the read-only /dev/vda kata image (a content-addressed static file) is left -// untouched; it is an error if no actor rootfs disk is present to repoint. -func repointActorRootfsDisk(snapshotDir, diskPath string) error { - cfgPath := filepath.Join(snapshotDir, "config.json") - b, err := os.ReadFile(cfgPath) - if err != nil { - return err - } - var cfg map[string]any - if err := json.Unmarshal(b, &cfg); err != nil { - return fmt.Errorf("parsing %q: %w", cfgPath, err) - } - rewrote := false - if disks, ok := cfg["disks"].([]any); ok { - for _, d := range disks { - dm, ok := d.(map[string]any) - if !ok { - continue - } - if p, _ := dm["path"].(string); filepath.Base(p) == actorRootfsDiskName { - dm["path"] = diskPath - rewrote = true - } - } - } - if !rewrote { - return fmt.Errorf("no %q disk found in %q to repoint", actorRootfsDiskName, cfgPath) - } - out, err := json.Marshal(cfg) - if err != nil { - return err - } - return os.WriteFile(cfgPath, out, 0o600) -} diff --git a/cmd/ateom-microvm/run.go b/cmd/ateom-microvm/run.go index 2acdd57bd..d66abc2bf 100644 --- a/cmd/ateom-microvm/run.go +++ b/cmd/ateom-microvm/run.go @@ -41,8 +41,6 @@ import ( // cloud-hypervisor process directly (booted by RunWorkload or relaunched by // RestoreWorkload), so it tracks that process and its api-socket for teardown. type runningActor struct { - containerName string - // baseID is the FROZEN base sandbox id propagated across this actor's restore // lineage. For a cold-run actor this is the actor's own id; for a restored // actor it is the id read from the snapshot's base-id file (the golden id, @@ -52,15 +50,18 @@ type runningActor struct { // ateom owns this CH process (booted at Run or relaunched at Restore). chCmd *exec.Cmd + // vfsdCmd is the virtiofsd serving the overlay RO lower (the CH fs device + // demand-pages from it for the actor's lifetime). ateom owns it; teardownActor + // kills it after the CH process. 
+ vfsdCmd *exec.Cmd // apiSocket is the CH api-socket for this ateom-owned VMM. apiSocket string // restoreSourceDir is the snapshot dir this actor was OnDemand-restored from - // (the base CH is demand-paging from). Set only on the owned-boot virtio-blk - // path when restored via OnDemand. CheckpointWorkload overlays CH's new (sparse, - // faulted-only) snapshot onto this base to produce a COMPLETE snapshot (CH's - // OnDemand snapshot alone drops the un-faulted pages). Empty for cold-run actors - // (their snapshot is already complete). + // (CH demand-pages its guest RAM from it). Set when restored via OnDemand. + // CheckpointWorkload overlays CH's new (sparse, faulted-only) snapshot onto this + // base to produce a COMPLETE snapshot (CH's OnDemand snapshot alone drops the + // un-faulted pages). Empty for cold-run actors (their snapshot is already complete). restoreSourceDir string // logAgent is the kata-agent ttrpc client kept open for the lifetime of the @@ -86,42 +87,37 @@ const baseIDFile = "base-id" // Asset names in RunWorkloadRequest.runtime_asset_paths (set by atelet's // fetchRuntimeAssets, keyed by the ActorTemplate runtime asset names). const ( - assetCH = "cloud-hypervisor" - assetKernel = "kata-kernel" - assetImage = "kata-image" - assetConfig = "kata-config" + assetCH = "cloud-hypervisor" + assetKernel = "kata-kernel" + assetImage = "kata-image" + assetConfig = "kata-config" + assetVirtiofsd = "virtiofsd" ) -// actorRootfsDiskName is the actor's writable rootfs disk file under the actor -// dir; it is the /dev/vdb backing path recorded in the snapshot config.json and -// reopened verbatim on restore. -const actorRootfsDiskName = "actor-rootfs.ext4" - -// goldenRootfsDiskName is the verbatim copy of the actor's /dev/vdb disk AS-OF the -// golden snapshot, kept under the actor dir. 
reset-to-golden recreates /dev/vdb -// from it on restore (byte-identical to what the snapshot's guest RAM/ext4 cache -// expects), discarding the actor's later rootfs writes — gVisor semantics. -const goldenRootfsDiskName = "golden-rootfs.ext4" - -// fileMissing reports whether path does not exist. -func fileMissing(path string) bool { - _, err := os.Stat(path) - return os.IsNotExist(err) -} +// maxActorContainers is a sanity cap on containers per actor (all share the one +// micro-VM + virtiofsd). 25 is far above any real pod. +const maxActorContainers = 25 -// copyDiskFile copies a (sparse) disk image verbatim, preserving holes so the -// (mostly-empty) ext4 image doesn't materialize its scratch blocks. Used to -// save/restore the golden rootfs disk template. -func copyDiskFile(ctx context.Context, src, dst string) error { - tmp := dst + ".tmp" - _ = os.Remove(tmp) - if out, err := exec.CommandContext(ctx, "cp", "--sparse=always", src, tmp).CombinedOutput(); err != nil { - return fmt.Errorf("cp %s -> %s: %w: %s", src, tmp, err, out) - } - if err := os.Rename(tmp, dst); err != nil { - return fmt.Errorf("rename %s -> %s: %w", tmp, dst, err) - } - return nil +// overlayWorkloadID is the kata containerID of a container's overlay WORKLOAD, +// distinct from its carrier container (the carrier keeps the bare container name so +// the agent binds the RO base to /run/kata-containers/<name>/rootfs; the workload +// overlays on top). Stable across the restore lineage (container names don't change). +// +// The "_" in the "_ovl" suffix is deliberately a character that is invalid in a Kubernetes +// container name (DNS-1123 labels are [a-z0-9-]): the carrier id is the bare name, so a +// workload id can never equal a carrier id (a bare name has no "_") nor another workload +// id (names are unique within an actor) — even for containers named "x" and "x-ovl". A +// "-ovl" suffix would let "x"'s workload id collide with the "x-ovl" carrier id.
+func overlayWorkloadID(name string) string { return name + "_ovl" } + +// actorContainer is one of the actor's containers prepared for the shared micro-VM: +// its name (also the kata containerID + the overlay lower's find-paths subdir), the +// host OCI bundle rootfs that backs the RO lower, and its OCI spec. The writable +// overlay upper is a guest tmpfs (OverlayUpperBase(name)), so there is no host disk. +type actorContainer struct { + name string + bundleRootfs string + spec *specs.Spec } // resolvedRuntime holds the concrete binary/config paths for a request, taken @@ -129,6 +125,7 @@ func copyDiskFile(ctx context.Context, src, dst string) error { type resolvedRuntime struct { chBinary string // path to the cloud-hypervisor binary configFile string // path to the kata configuration.toml + virtiofsd string // path to virtiofsd (overlay RO lower); "" => "virtiofsd" on PATH } // firstNonEmpty returns the first non-empty string, or "" if all are empty. @@ -147,13 +144,14 @@ func (s *AteomService) resolveRuntime(paths map[string]string) resolvedRuntime { return resolvedRuntime{ chBinary: firstNonEmpty(paths[assetCH], s.chBinary), configFile: firstNonEmpty(paths[assetConfig], s.kataConfig), + virtiofsd: paths[assetVirtiofsd], } } -// writeGuestResolvConf copies the worker pod's /etc/resolv.conf into the bundle -// rootfs (before it's packed into the ext4 disk) so the guest gets cluster DNS: -// ateom drops atelet's resolv.conf bind and sends no CreateSandbox.Dns, so the -// guest can otherwise reach IPs but not resolve names. +// writeGuestResolvConf copies the worker pod's /etc/resolv.conf into a container's +// bundle rootfs (the overlay RO lower) so the guest gets cluster DNS: ateom drops +// atelet's resolv.conf bind and sends no CreateSandbox.Dns, so the guest can +// otherwise reach IPs but not resolve names. 
func writeGuestResolvConf(rootfs string) error { content, err := os.ReadFile("/etc/resolv.conf") if err != nil { @@ -172,20 +170,18 @@ func writeGuestResolvConf(rootfs string) error { return nil } -// RunWorkload boots the actor as a cloud-hypervisor micro-VM that ateom owns. +// RunWorkload boots the actor as a cloud-hypervisor micro-VM and starts its containers. // -// ateom boots cloud-hypervisor itself — no kata shim — and gives the actor a -// writable boot-time virtio-blk disk (/dev/vdb, built from the OCI bundle rootfs) -// as its container rootfs. Rootfs data lives on that host-backed disk rather than -// a guest tmpfs overlay-upper, so the CH snapshot is memory-only with no balloon -// needed to reclaim a RAM-backed upper. It replicates the kata clh boot (vm.create -// kernel+image, add-net, vm.boot) and the shim's post-boot work (agent -// CreateSandbox + guest network config) before driving the kata-agent to start the -// blk-rootfs container. +// ateom boots cloud-hypervisor directly (no kata shim) and gives each container an +// overlay rootfs: its OCI image read-only over virtio-fs (the lower) plus a guest +// tmpfs (the writable upper). It drives the kata clh boot (vm.create kernel+image+fs, +// add-net, vm.boot) and the post-boot setup the shim would otherwise do (agent +// CreateSandbox + guest network config) before having the kata-agent assemble and +// start each container. // -// Contract with atelet (mirrors ateom-gvisor): -// - The runtime assets (guest kernel, guest OS image, cloud-hypervisor, base -// kata config) are on disk and passed as runtime asset paths. +// Contract with atelet: +// - The runtime assets (guest kernel, guest OS image, cloud-hypervisor, virtiofsd, +// base kata config) are on disk and passed as runtime asset paths. // - The OCI bundle (config.json + populated rootfs/) is prepared per container. 
func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkloadRequest) (resp *ateompb.RunWorkloadResponse, retErr error) { s.lock.Lock() @@ -197,24 +193,25 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload s.actorLogger.EmitLifecycleLog("Actor starting", id, name, ns) - // KNOWN GAP vs the gVisor runtime: it runs multiple containers per actor; this - // runtime is single-container for now. Multi-container is a mechanical extension - // (one boot-time virtio-blk rootfs disk + agent CreateContainer per container, - // sharing the one guest/sandbox) and is tracked as follow-up work. + // All of the actor's containers share the one micro-VM (which is the pod + // sandbox): each gets its own overlay rootfs and its own kata-agent + // CreateContainer/StartContainer, driven below after the shared boot + + // CreateSandbox + guest networking. containers := req.GetSpec().GetContainers() - if len(containers) != 1 { - return nil, status.Errorf(codes.Unimplemented, "ateom-microvm supports exactly one container, got %d", len(containers)) + if len(containers) == 0 { + return nil, status.Error(codes.InvalidArgument, "actor spec has no containers") + } + if len(containers) > maxActorContainers { + return nil, status.Errorf(codes.Unimplemented, "ateom-microvm supports at most %d containers, got %d", maxActorContainers, len(containers)) } - containerName := containers[0].GetName() - // Owned-boot builds the CH vm.create itself, so it needs the guest kernel + - // image paths directly. + // ateom builds the CH vm.create itself, so it needs the guest kernel + image + // paths directly. 
paths := req.GetRuntimeAssetPaths() kernel, image := paths[assetKernel], paths[assetImage] if kernel == "" || image == "" { - return nil, fmt.Errorf("owned-boot requires %q and %q asset paths", assetKernel, assetImage) + return nil, fmt.Errorf("ateom-microvm requires %q and %q asset paths", assetKernel, assetImage) } - actorDir := ateompath.ActorPath(ns, name, id) rr := s.resolveRuntime(paths) // Networking (host side): per-activation veth into the interior netns. The @@ -230,20 +227,11 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload } }() - bundle := ateompath.OCIBundlePath(ns, name, id, containerName) - spec, err := ensureKataCompatibleSpec(bundle, id, ateompath.AteomNetNSPath(s.podUID)) + // Prepare each container's OCI spec + record its bundle rootfs (the overlay RO + // lower). No host disk — the rootfs is overlay(virtio-fs lower + guest-tmpfs upper). + ctrs, err := s.buildActorContainers(ns, name, id, containers) if err != nil { - return nil, fmt.Errorf("while preparing kata OCI spec: %w", err) - } - - // Build the actor's writable rootfs as a raw ext4 virtio-blk disk from the - // atelet-populated OCI bundle rootfs. This becomes /dev/vdb. - diskPath := filepath.Join(actorDir, actorRootfsDiskName) - if err := writeGuestResolvConf(filepath.Join(bundle, "rootfs")); err != nil { - return nil, fmt.Errorf("while writing guest resolv.conf: %w", err) - } - if err := kata.BuildExt4Image(ctx, filepath.Join(bundle, "rootfs"), diskPath); err != nil { - return nil, fmt.Errorf("while building actor rootfs disk: %w", err) + return nil, err } // Guest sizing + agent kernel params from the kata config. @@ -258,6 +246,20 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload return nil, fmt.Errorf("while creating VM dir: %w", err) } + // Stage the overlay RO lowers (bind each image into the shared dir) + start the + // virtiofsd that serves them. 
CH connects to it at vm.create and demand-pages for + // the actor's lifetime, so ateom owns the process (killed in teardownActor). + vfsdCmd, err := s.stageOverlayLowers(ctx, rr, id, ctrs) + if err != nil { + return nil, err + } + defer func() { + if retErr != nil && vfsdCmd.Process != nil { + _ = vfsdCmd.Process.Kill() + _, _ = vfsdCmd.Process.Wait() + } + }() + // Launch a bare VMM (CH + api-socket); ateom owns this process for teardown. apiSocket := filepath.Join(kata.VMDir(id), "clh-api.sock") chCmd, client, err := ch.LaunchVMM(ctx, ch.LaunchVMMOptions{ @@ -276,11 +278,12 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload } }() - // Assemble the CH VmConfig (kata-compatible cmdline, RO image on /dev/vda + - // writable rootfs on /dev/vdb). serialLog is also read on a failed agent dial + // Assemble the CH VmConfig (kata-compatible cmdline, RO kata image on /dev/vda + + // the virtio-fs device for the overlay RO lower; no actor virtio-blk disks — the + // writable upper is a guest tmpfs). serialLog is also read on a failed agent dial // below, so keep it here. serialLog := filepath.Join(kata.VMDir(id), "serial.log") - vmCfg := buildVMConfig(id, kernel, image, diskPath, kparams, serialLog, memMiB, vcpus) + vmCfg := buildVMConfig(id, kernel, image, kparams, serialLog, memMiB, vcpus) if err := client.CreateVM(ctx, vmCfg); err != nil { return nil, fmt.Errorf("while creating VM: %w", err) } @@ -308,7 +311,7 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload if err := client.BootVM(ctx); err != nil { return nil, fmt.Errorf("while booting VM: %w", err) } - slog.InfoContext(ctx, "Micro-VM booted (owned-boot)", slog.String("id", id), slog.String("api", apiSocket)) + slog.InfoContext(ctx, "Micro-VM booted", slog.String("id", id), slog.String("api", apiSocket)) // Dial the kata-agent over hybrid-vsock. 
The agent only starts listening once // the guest's init reaches kata-containers.target — well after CH creates the @@ -335,25 +338,81 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload } }() - // Post-boot kata-agent setup: sandbox, guest networking, start the container. - if err := s.startActorContainer(ctx, ac, id, vsockPath, spec); err != nil { + // Post-boot kata-agent setup: sandbox, guest networking, start each container. + if err := s.startActorContainers(ctx, ac, id, vsockPath, ctrs); err != nil { return nil, err } - ra := &runningActor{chCmd: chCmd, apiSocket: apiSocket, containerName: containerName, baseID: id, logAgent: ac} + ra := &runningActor{chCmd: chCmd, vfsdCmd: vfsdCmd, apiSocket: apiSocket, baseID: id, logAgent: ac} s.running[id] = ra - // Forward the actor container's stdout/stderr into the pod logs (parity with - // ateom-gvisor). StartBlkWorkload uses containerID==execID==id, so the agent - // keys the streams by id. The goroutines read over ac for the actor's lifetime - // and exit (io.EOF) when teardownActor closes ac. - s.startActorLogForwarding(ac, id, name, ns, containerName) + // Forward each container's stdout/stderr into the pod logs. The overlay workload's + // container/exec id is <name>_ovl (see startOverlayContainer), so key the streams by + // that and tag with the display container name. The goroutines read over ac for the + // actor's lifetime and exit (io.EOF) when teardownActor closes ac.
+ for _, c := range ctrs { + s.startActorLogForwarding(ac, id, overlayWorkloadID(c.name), c.name, name, ns) + } s.actorLogger.EmitLifecycleLog("Actor started", id, name, ns) - slog.InfoContext(ctx, "Actor started (owned-boot, virtio-blk rootfs)", slog.String("id", id)) + slog.InfoContext(ctx, "Actor started (overlay rootfs)", slog.String("id", id)) return &ateompb.RunWorkloadResponse{}, nil } +// buildActorContainers prepares each of the actor's containers for the shared +// micro-VM: it loads the OCI spec from the per-container bundle, injects guest DNS, +// and records the bundle rootfs that backs the overlay's RO lower. No host disk is +// built — the rootfs is overlay(virtio-fs RO lower + guest-tmpfs upper); the lowers +// are bound into virtiofsd's shared dir in stageOverlayLowers after the sandbox state +// is clean. Both RunWorkload and RestoreWorkload go through here. +func (s *AteomService) buildActorContainers(ns, name, id string, containers []*ateompb.Container) ([]actorContainer, error) { + netnsPath := ateompath.AteomNetNSPath(s.podUID) + ctrs := make([]actorContainer, len(containers)) + for i, c := range containers { + cn := c.GetName() + bundle := ateompath.OCIBundlePath(ns, name, id, cn) + spec, err := ensureKataCompatibleSpec(bundle, id, netnsPath) + if err != nil { + return nil, fmt.Errorf("while preparing kata OCI spec for %q: %w", cn, err) + } + bundleRootfs := filepath.Join(bundle, "rootfs") + // Write cluster DNS into the lower before it's served over virtio-fs: ateom + // drops atelet's resolv.conf bind and sends no CreateSandbox.Dns, so without + // this the guest can reach IPs but not resolve names. Doing it here covers both + // run and restore (both reconstruct the lower from the bundle). 
+ if err := writeGuestResolvConf(bundleRootfs); err != nil { + return nil, fmt.Errorf("while writing guest resolv.conf for %q: %w", cn, err) + } + ctrs[i] = actorContainer{name: cn, bundleRootfs: bundleRootfs, spec: spec} + } + return ctrs, nil +} + +// stageOverlayLowers makes each container's RO lower available to virtiofsd by +// bind-mounting its OCI image rootfs into virtiofsd's find-paths location +// (SharedDir(id)/<name>/rootfs), then starts the one virtiofsd that serves them all. +// Must run AFTER CleanupSandboxState (which wipes SharedDir) and once the VM dir exists. +// The returned virtiofsd cmd outlives this call (CH demand-pages from it); the caller +// owns it (tracked on runningActor, killed in teardownActor). +func (s *AteomService) stageOverlayLowers(ctx context.Context, rr resolvedRuntime, id string, ctrs []actorContainer) (*exec.Cmd, error) { + for _, c := range ctrs { + if err := kata.ReconstructSharedDirFromImage(ctx, c.bundleRootfs, id, c.name); err != nil { + return nil, fmt.Errorf("while staging overlay lower for %q: %w", c.name, err) + } + } + vfsdLog, _ := os.OpenFile(filepath.Join(kata.VMDir(id), "virtiofsd.log"), os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600) + vfsdCmd, err := kata.StartVirtiofsd(ctx, kata.VirtiofsdOptions{ + Binary: rr.virtiofsd, + SocketPath: kata.VirtiofsdSocketPath(id), + SharedDir: kata.SharedDir(id), + Log: vfsdLog, + }) + if err != nil { + return nil, fmt.Errorf("while starting virtiofsd: %w", err) + } + return vfsdCmd, nil +} + // guestConfig reads guest sizing + agent kernel params from the resolved kata // config, enabling the debug console (vsock 1026) for in-guest diagnostics and, // with kataDebug, raising the agent log level. @@ -373,16 +432,13 @@ func (s *AteomService) guestConfig(rr resolvedRuntime) (memMiB, vcpus int, kpara return cfg.MemoryMiB, cfg.VCPUs, kparams, nil } -// buildVMConfig assembles the cloud-hypervisor VmConfig for the owned boot.
The -// kernel cmdline replicates kata's clh boot cmdline (verified against a live kata -// snapshot's payload.cmdline): beyond the root/clh base params it MUST include -// systemd.unit=kata-containers.target (else systemd boots the default target and -// powers off — the guest exits ~6s in) and mask systemd-networkd (the agent owns -// eth0). The console is ARCH-SPECIFIC: ttyAMA0 (PL011) on arm64, ttyS0 (8250) on -// amd64 — the wrong one => "unable to open an initial console". The config's -// kernel_params are appended; serial is captured to serialLog for boot debugging. -// The RO guest image is /dev/vda, the writable rootfs /dev/vdb. -func buildVMConfig(id, kernel, image, diskPath, kparams, serialLog string, memMiB, vcpus int) ch.VmConfig { +// buildVMConfig assembles the cloud-hypervisor VmConfig. The kernel cmdline replicates +// kata's clh boot cmdline; beyond the base params it must set +// systemd.unit=kata-containers.target (else the guest powers off ~6s in) and mask +// systemd-networkd (the agent owns eth0). The console is arch-specific: ttyAMA0 on +// arm64, ttyS0 on amd64. /dev/vda is the RO guest image; the actor rootfs's RO lower is +// the virtio-fs device on PCI segment 1 (hence num_pci_segments=2), with no actor disks. 
+func buildVMConfig(id, kernel, image, kparams, serialLog string, memMiB, vcpus int) ch.VmConfig { console := "ttyS0" if runtime.GOARCH == "arm64" { console = "ttyAMA0" @@ -399,22 +455,28 @@ func buildVMConfig(id, kernel, image, diskPath, kparams, serialLog string, memMi Payload: ch.PayloadConfig{Kernel: kernel, Cmdline: cmdline}, Disks: []ch.DiskConfig{ {Path: image, Readonly: true, ImageType: "Raw", NumQueues: int32(vcpus), QueueSize: 1024}, - {Path: diskPath, Readonly: false, ImageType: "Raw", NumQueues: int32(vcpus), QueueSize: 1024}, }, - Rng: &ch.RngConfig{Src: "/dev/urandom"}, - Serial: &ch.ConsoleConfig{Mode: "File", File: serialLog}, - Vsock: &ch.VsockConfig{Cid: 3, Socket: kata.VsockSocketPath(id)}, + Fs: []ch.FsConfig{{ + Tag: kata.FsTag, Socket: kata.VirtiofsdSocketPath(id), + NumQueues: 1, QueueSize: 1024, PciSegment: 1, + }}, + Platform: &ch.PlatformConfig{NumPciSegments: 2}, + Rng: &ch.RngConfig{Src: "/dev/urandom"}, + Serial: &ch.ConsoleConfig{Mode: "File", File: serialLog}, + Vsock: &ch.VsockConfig{Cid: 3, Socket: kata.VsockSocketPath(id)}, } } -// startActorContainer performs the post-boot kata-agent setup the shim normally -// does at boot: establish the sandbox, configure guest networking (eth0 -// IP/MAC/MTU + routes), and start the actor container on its /dev/vdb rootfs. On -// failure it dumps guest diagnostics over the debug console. -func (s *AteomService) startActorContainer(ctx context.Context, ac *kata.AgentClient, id, vsockPath string, spec *specs.Spec) error { - // Establish the agent sandbox (the shim normally does this at boot). +// startActorContainers performs the post-boot kata-agent setup the shim normally +// does at boot: establish the sandbox once (mounting the kataShared virtio-fs base), +// configure guest networking (eth0 IP/MAC/MTU + routes) once, then start each +// container on its own overlay rootfs. On failure it dumps guest diagnostics. 
+func (s *AteomService) startActorContainers(ctx context.Context, ac *kata.AgentClient, id, vsockPath string, ctrs []actorContainer) error { + // Establish the agent sandbox + the kataShared virtio-fs mount (the RO base for + // every container's overlay lower). All containers share it, so use the first + // container's hostname. sbCtx, sbCancel := context.WithTimeout(ctx, 20*time.Second) - err := ac.CreateSandbox(sbCtx, &agentpb.CreateSandboxRequest{Hostname: spec.Hostname, SandboxId: id}) + err := ac.CreateSandboxForActor(sbCtx, id, ctrs[0].spec.Hostname) sbCancel() if err != nil { return fmt.Errorf("while creating agent sandbox: %w", err) @@ -431,16 +493,38 @@ func (s *AteomService) startActorContainer(ctx context.Context, ac *kata.AgentCl return fmt.Errorf("while configuring guest network: %w", err) } - // Start the actor with its rootfs on /dev/vdb (single blk storage). + for _, c := range ctrs { + if err := startOverlayContainer(ctx, ac, vsockPath, c); err != nil { + return err + } + } + return nil +} + +// startOverlayContainer brings up one container's rootfs as overlay(virtio-fs RO +// lower + guest-tmpfs upper): a carrier container (id == name) eager-binds the RO base +// to /run/kata-containers/<name>/rootfs, then the workload (id == <name>_ovl) overlays +// it with a tmpfs upper. On failure it dumps the guest overlay state.
+func startOverlayContainer(ctx context.Context, ac *kata.AgentClient, vsockPath string, c actorContainer) error { + carrierCtx, carrierCancel := context.WithTimeout(ctx, 30*time.Second) + err := ac.CreateCarrier(carrierCtx, c.name, c.spec) + carrierCancel() + if err != nil { + dump := kata.DebugConsoleDump(ctx, vsockPath, "echo '== shared/containers =='; ls -la /run/kata-containers/shared/containers/ 2>&1 | head -40") + slog.ErrorContext(ctx, "carrier create failed; dump", slog.String("container", c.name), slog.String("dump", dump)) + return fmt.Errorf("while creating carrier %q: %w", c.name, err) + } + + upperBase := kata.OverlayUpperBase(c.name) wlCtx, wlCancel := context.WithTimeout(ctx, 30*time.Second) - err = ac.StartBlkWorkload(wlCtx, id, "/dev/vdb", spec) + err = ac.StartOverlayWorkload(wlCtx, c.name, overlayWorkloadID(c.name), upperBase, c.spec) wlCancel() if err != nil { dump := kata.DebugConsoleDump(ctx, vsockPath, - "echo '== /dev/vdb =='; ls -l /dev/vdb 2>&1; blkid /dev/vdb 2>&1; "+ - "echo '== mounts =='; grep kata /proc/mounts 2>&1") - slog.ErrorContext(ctx, "blk workload failed; dump", slog.String("dump", dump)) - return fmt.Errorf("while starting blk workload: %w", err) + "echo '== upper =='; ls -la "+upperBase+" 2>&1; echo '== lower =='; ls /run/kata-containers/"+c.name+"/rootfs/ 2>&1 | head; "+ + "echo '== mounts =='; grep -E 'kata|overlay' /proc/mounts 2>&1") + slog.ErrorContext(ctx, "overlay workload failed; dump", slog.String("container", c.name), slog.String("dump", dump)) + return fmt.Errorf("while starting overlay workload %q: %w", c.name, err) } return nil } @@ -450,17 +534,18 @@ func (s *AteomService) startActorContainer(ctx context.Context, ac *kata.AgentCl // ReadStdout/ReadStderr) through the shared actorlog forwarder, which annotates // each line with the actor's ate.dev/* labels and writes it to the pod's stdout. 
// -// The streams are keyed by containerID==execID==id (the value StartBlkWorkload -// passed); lines are tagged with the container name (ate.dev/container_name). The -// reader contexts are context.Background() — the goroutines are NOT bound to the RPC -// that started them; they terminate when ac is closed (by teardownActor), which -// makes the in-flight ReadStdout/ReadStderr fail and the StreamReader return -// io.EOF, ending WrapContainerLogs. This keeps the agent connection (which ttrpc -// allows concurrent Calls on) alive for forwarding while guaranteeing no goroutine -// outlives the connection. -func (s *AteomService) startActorLogForwarding(ac *kata.AgentClient, id, name, ns, containerName string) { - go s.actorLogger.WrapContainerLogs(kata.NewStdioReader(context.Background(), ac, id, id, false), id, name, ns, containerName) - go s.actorLogger.WrapContainerLogs(kata.NewStdioReader(context.Background(), ac, id, id, true), id, name, ns, containerName) +// The streams are keyed by streamID == the kata containerID==execID (the overlay +// workload id); lines are tagged with actorID + containerName +// (ate.dev/container_name) so a multi-container actor demultiplexes. +// The reader contexts are context.Background() — the goroutines are NOT bound to the +// RPC that started them; they terminate when ac is closed (by teardownActor), which +// makes the in-flight ReadStdout/ReadStderr fail and the StreamReader return io.EOF, +// ending WrapContainerLogs. This keeps the agent connection (which ttrpc allows +// concurrent Calls on) alive for forwarding while guaranteeing no goroutine outlives +// the connection. 
+func (s *AteomService) startActorLogForwarding(ac *kata.AgentClient, actorID, streamID, containerName, name, ns string) { + go s.actorLogger.WrapContainerLogs(kata.NewStdioReader(context.Background(), ac, streamID, streamID, false), actorID, name, ns, containerName) + go s.actorLogger.WrapContainerLogs(kata.NewStdioReader(context.Background(), ac, streamID, streamID, true), actorID, name, ns, containerName) } // dialAgentRetry polls DialAgent until the kata-agent answers the hybrid-vsock diff --git a/cmd/ateom-microvm/service_integration_test.go b/cmd/ateom-microvm/service_integration_test.go deleted file mode 100644 index ab79446ef..000000000 --- a/cmd/ateom-microvm/service_integration_test.go +++ /dev/null @@ -1,473 +0,0 @@ -//go:build linux - -// Copyright 2026 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package main - -import ( - "context" - "encoding/json" - "fmt" - "os" - "os/exec" - "path/filepath" - "strings" - "testing" - "time" - - "github.com/agent-substrate/substrate/cmd/ateom-microvm/internal/ch" - "github.com/agent-substrate/substrate/cmd/ateom-microvm/internal/kata" - "github.com/agent-substrate/substrate/internal/actorlog" - "github.com/agent-substrate/substrate/internal/ateompath" - "github.com/agent-substrate/substrate/internal/proto/ateompb" - "github.com/vishvananda/netns" -) - -// TestServiceRunBlkRootfs covers the owned-boot cold-run path: ateom boots -// cloud-hypervisor itself and gives the actor a writable boot-time virtio-blk -// rootfs (/dev/vdb), then drives the kata-agent to start the container. It -// exercises only run (no checkpoint/restore). Unlike TestServiceE2E it MUST pass -// the guest kernel + image + base-config asset paths, because owned-boot builds -// the CH vm.create itself rather than reading configuration.toml. -// -// Gated behind KATA_INTEGRATION=1. Required env: -// -// KATA_ROOTFS_SRC= a populated actor rootfs (becomes /dev/vdb) -// KATA_KERNEL= guest kernel (vmlinux.container) -// KATA_IMAGE= guest OS image (kata-containers.img, /dev/vda) -// KATA_CONFIG= a stock kata clh configuration.toml (for kernel_params + sizing) -// -// Optional: KATA_CH / KATA_VIRTIOFSD (defaults provided). Run as root on a host -// with kata + /dev/kvm + mkfs.ext4 (e2fsprogs): -// -// sudo KATA_INTEGRATION=1 KATA_ROOTFS_SRC=/path/to/rootfs KATA_KERNEL=... KATA_IMAGE=... \ -// KATA_CONFIG=... 
./ateom-microvm.test -test.v -test.run BlkRootfs -func TestServiceRunBlkRootfs(t *testing.T) { - if os.Getenv("KATA_INTEGRATION") != "1" { - t.Skip("set KATA_INTEGRATION=1 to run (requires kata + /dev/kvm + root + e2fsprogs)") - } - rootfsSrc := os.Getenv("KATA_ROOTFS_SRC") - if rootfsSrc == "" { - t.Fatal("KATA_ROOTFS_SRC is required") - } - kernel, image, cfg := os.Getenv("KATA_KERNEL"), os.Getenv("KATA_IMAGE"), os.Getenv("KATA_CONFIG") - if kernel == "" || image == "" || cfg == "" { - t.Fatal("KATA_KERNEL, KATA_IMAGE, and KATA_CONFIG are required for the owned-boot path") - } - chBin := envOrTest("KATA_CH", "/usr/local/bin/cloud-hypervisor") - - ns, name := "default", "e2e-blk" - id := fmt.Sprintf("ateomchv-blk-%d", os.Getpid()) - container := "app" - - bundle := ateompath.OCIBundlePath(ns, name, id, container) - rootfs := filepath.Join(bundle, "rootfs") - if err := os.MkdirAll(rootfs, 0o755); err != nil { - t.Fatal(err) - } - if out, err := exec.Command("cp", "-a", rootfsSrc+"/.", rootfs+"/").CombinedOutput(); err != nil { - t.Fatalf("copying rootfs: %v: %s", err, out) - } - writeMinimalGvisorStyleSpec(t, bundle) - - podUID := "testpod-blk" - _ = netns.DeleteNamed(ateompath.AteomNetNSName(podUID)) - interiorNetNS, err := createNetNSWithoutSwitching(ateompath.AteomNetNSName(podUID)) - if err != nil { - t.Fatalf("creating interior netns: %v", err) - } - svc := NewService(podUID, chBin, "", true, interiorNetNS, actorlog.NewActorLogger(actorlog.NewSyncedWriter(os.Stdout), false)) - ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second) - defer cancel() - - t.Cleanup(func() { - cctx, c := context.WithTimeout(context.Background(), 20*time.Second) - svc.teardownActor(cctx, id, svc.running[id], nil) - c() - _ = os.RemoveAll(ateompath.ActorPath(ns, name, id)) - _ = os.RemoveAll(kata.VMDir(id)) - _ = interiorNetNS.Close() - _ = netns.DeleteNamed(ateompath.AteomNetNSName(podUID)) - }) - - if _, err := svc.RunWorkload(ctx, &ateompb.RunWorkloadRequest{ - 
ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id, - Spec: &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}}, - RuntimeAssetPaths: map[string]string{ - assetKernel: kernel, - assetImage: image, - assetConfig: cfg, - assetCH: chBin, - }, - }); err != nil { - // Best-effort: dump the guest serial console (captured to VMDir/serial.log) - // so a boot failure shows the kernel/agent output. - if b, rerr := os.ReadFile(filepath.Join(kata.VMDir(id), "serial.log")); rerr == nil { - t.Logf("[serial.log tail]\n%s", lastLines(string(b), 60)) - } - t.Fatalf("RunWorkload (owned-boot): %v", err) - } - t.Log("RunWorkload OK (owned-boot: CH booted by ateom, actor rootfs on /dev/vdb)") - - // Liveness: the ateom-owned CH must be up and the VM Running. - client := ch.NewClient(filepath.Join(kata.VMDir(id), "clh-api.sock")) - if err := client.WaitReady(ctx, 10*time.Second); err != nil { - t.Fatalf("owned CH not ready: %v", err) - } - // Confirm the actor's rootfs really came from /dev/vdb (a marker visible via - // the guest debug console — the actor's own files live on the blk disk). - dump := kata.DebugConsoleDump(ctx, kata.VsockSocketPath(id), - "echo '== vdb =='; blkid /dev/vdb 2>&1; echo '== rootfs mount =='; grep vdb /proc/mounts 2>&1; echo '== ip =='; ip -4 addr show eth0 2>&1") - t.Logf("[guest] %s", dump) -} - -// TestServiceCheckpointRestoreBlkRootfs exercises memory-only snapshot + restore -// with in-RAM continuity: the owned-boot actor snapshots MEMORY-ONLY (no -// shared-dir.tar, no balloon) and restores with its guest RAM intact. It writes a -// sentinel into guest tmpfs (/run = RAM), checkpoints, -// ships the snapshot dir, restores on a fresh CH process, and reads the sentinel -// back — if RAM continuity holds it survives. Same gating/env as -// TestServiceRunBlkRootfs. 
-func TestServiceCheckpointRestoreBlkRootfs(t *testing.T) { - if os.Getenv("KATA_INTEGRATION") != "1" { - t.Skip("set KATA_INTEGRATION=1 to run (requires kata + /dev/kvm + root + e2fsprogs)") - } - rootfsSrc := os.Getenv("KATA_ROOTFS_SRC") - kernel, image, cfg := os.Getenv("KATA_KERNEL"), os.Getenv("KATA_IMAGE"), os.Getenv("KATA_CONFIG") - if rootfsSrc == "" || kernel == "" || image == "" || cfg == "" { - t.Fatal("KATA_ROOTFS_SRC, KATA_KERNEL, KATA_IMAGE, KATA_CONFIG are required") - } - chBin := envOrTest("KATA_CH", "/usr/local/bin/cloud-hypervisor") - - ns, name := "default", "e2e-blkcr" - id := fmt.Sprintf("ateomchv-blkcr-%d", os.Getpid()) - container := "app" - - bundle := ateompath.OCIBundlePath(ns, name, id, container) - rootfs := filepath.Join(bundle, "rootfs") - if err := os.MkdirAll(rootfs, 0o755); err != nil { - t.Fatal(err) - } - if out, err := exec.Command("cp", "-a", rootfsSrc+"/.", rootfs+"/").CombinedOutput(); err != nil { - t.Fatalf("copying rootfs: %v: %s", err, out) - } - writeMinimalGvisorStyleSpec(t, bundle) - - podUID := "testpod-blkcr" - _ = netns.DeleteNamed(ateompath.AteomNetNSName(podUID)) - interiorNetNS, err := createNetNSWithoutSwitching(ateompath.AteomNetNSName(podUID)) - if err != nil { - t.Fatalf("creating interior netns: %v", err) - } - svc := NewService(podUID, chBin, "", true, interiorNetNS, actorlog.NewActorLogger(actorlog.NewSyncedWriter(os.Stdout), false)) - ctx, cancel := context.WithTimeout(context.Background(), 240*time.Second) - defer cancel() - t.Cleanup(func() { - cctx, c := context.WithTimeout(context.Background(), 20*time.Second) - svc.teardownActor(cctx, id, svc.running[id], nil) - c() - _ = os.RemoveAll(ateompath.ActorPath(ns, name, id)) - _ = os.RemoveAll(kata.VMDir(id)) - _ = interiorNetNS.Close() - _ = netns.DeleteNamed(ateompath.AteomNetNSName(podUID)) - }) - - assets := map[string]string{assetKernel: kernel, assetImage: image, assetConfig: cfg, assetCH: chBin} - if _, err := svc.RunWorkload(ctx, 
&ateompb.RunWorkloadRequest{ - ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id, - Spec: &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}}, - RuntimeAssetPaths: assets, - }); err != nil { - t.Fatalf("RunWorkload: %v", err) - } - t.Log("RunWorkload OK") - - // Write an in-RAM (tmpfs /run) sentinel via the guest debug console. - const sentinel = "BLKROOT_CONTINUITY_OK_4242" - vsock := kata.VsockSocketPath(id) - _ = kata.DebugConsoleDump(ctx, vsock, "echo "+sentinel+" > /run/blkroot-sentinel; sync; echo wrote") - if got := kata.DebugConsoleDump(ctx, vsock, "cat /run/blkroot-sentinel"); !strings.Contains(got, sentinel) { - t.Fatalf("sentinel not readable pre-checkpoint: %q", got) - } - t.Log("wrote in-RAM sentinel") - - // CheckpointWorkload — memory-only, no balloon/wipe. - if _, err := svc.CheckpointWorkload(ctx, &ateompb.CheckpointWorkloadRequest{ - ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id, - Spec: &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}}, - }); err != nil { - t.Fatalf("CheckpointWorkload: %v", err) - } - checkpointDir := ateompath.CheckpointStateDir(ns, name, id) - for _, f := range []string{"config.json", "state.json", "memory-ranges", "base-id"} { - if _, err := os.Stat(filepath.Join(checkpointDir, f)); err != nil { - t.Fatalf("checkpoint missing %q: %v", f, err) - } - } - if _, err := os.Stat(filepath.Join(checkpointDir, "shared-dir.tar")); err == nil { - t.Error("snapshot has shared-dir.tar — owned-boot must be MEMORY-ONLY (no virtio-fs base)") - } - t.Log("CheckpointWorkload OK (memory-only: config/state/memory-ranges/base-id, no shared-dir.tar)") - - // Ship snapshot dir -> restore dir (simulating atelet object-storage round trip). 
- restoreDir := ateompath.RestoreStateDir(ns, name, id) - if err := os.MkdirAll(restoreDir, 0o700); err != nil { - t.Fatal(err) - } - if out, err := exec.Command("cp", "-a", checkpointDir+"/.", restoreDir+"/").CombinedOutput(); err != nil { - t.Fatalf("shipping snapshot: %v: %s", err, out) - } - - // RestoreWorkload — reopen /dev/vdb, no virtiofsd/reconstruct. - if _, err := svc.RestoreWorkload(ctx, &ateompb.RestoreWorkloadRequest{ - ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id, - Spec: &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}}, - RuntimeAssetPaths: assets, - }); err != nil { - t.Fatalf("RestoreWorkload: %v", err) - } - client := ch.NewClient(filepath.Join(kata.VMDir(id), "clh-api-restore.sock")) - if err := client.WaitReady(ctx, 10*time.Second); err != nil { - t.Fatalf("restored CH not ready: %v", err) - } - t.Log("RestoreWorkload OK") - - // In-RAM continuity: the sentinel written before checkpoint must survive. - got := kata.DebugConsoleDump(ctx, vsock, "cat /run/blkroot-sentinel") - if !strings.Contains(got, sentinel) { - t.Fatalf("RAM continuity FAILED: sentinel gone after restore #1: %q", got) - } - t.Logf("cycle1 OK: memory-only snapshot + restore, in-RAM continuity (%q)", strings.TrimSpace(got)) - - // --- SECOND cycle: checkpoint-AFTER-restore. This is the OnDemand diff-snapshot - // case — CH writes only the faulted delta and CheckpointWorkload overlays it onto - // the restore source to rebuild a COMPLETE snapshot. If the merge is wrong the - // snapshot is incomplete and restore #2 boots a corrupt guest (sentinel gone / - // unreachable). Write a SECOND sentinel first so we also prove pages dirtied in - // THIS activation are captured by the merge. 
--- - const sentinel2 = "BLKROOT_CYCLE2_OK_8888" - _ = kata.DebugConsoleDump(ctx, vsock, "echo "+sentinel2+" > /run/blkroot-sentinel2; sync") - if _, err := svc.CheckpointWorkload(ctx, &ateompb.CheckpointWorkloadRequest{ - ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id, - Spec: &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}}, - }); err != nil { - t.Fatalf("CheckpointWorkload #2 (merge): %v", err) - } - // Ship the merged snapshot (overwrites restoreDir AFTER the merge read it). - if out, err := exec.Command("cp", "-a", checkpointDir+"/.", restoreDir+"/").CombinedOutput(); err != nil { - t.Fatalf("shipping snapshot #2: %v: %s", err, out) - } - if _, err := svc.RestoreWorkload(ctx, &ateompb.RestoreWorkloadRequest{ - ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id, - Spec: &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}}, - RuntimeAssetPaths: assets, - }); err != nil { - t.Fatalf("RestoreWorkload #2: %v", err) - } - client2 := ch.NewClient(filepath.Join(kata.VMDir(id), "clh-api-restore.sock")) - if err := client2.WaitReady(ctx, 10*time.Second); err != nil { - t.Fatalf("restored CH #2 not ready: %v", err) - } - // BOTH sentinels must survive: sentinel (from cycle 1, an un-faulted source page - // recovered by the overlay) AND sentinel2 (dirtied this cycle, in CH's delta). 
- g1 := kata.DebugConsoleDump(ctx, vsock, "cat /run/blkroot-sentinel") - g2 := kata.DebugConsoleDump(ctx, vsock, "cat /run/blkroot-sentinel2") - if !strings.Contains(g1, sentinel) { - t.Fatalf("merge INCOMPLETE: cycle-1 sentinel lost after restore #2 (un-faulted source page dropped): %q", g1) - } - if !strings.Contains(g2, sentinel2) { - t.Fatalf("merge lost the cycle-2 delta: sentinel2 gone after restore #2: %q", g2) - } - t.Logf("OnDemand-merge OK: 2-cycle suspend/resume, both sentinels survived (%q | %q)", - strings.TrimSpace(g1), strings.TrimSpace(g2)) -} - -// TestServiceResetToGoldenBlkRootfs exercises reset-to-golden. From the -// golden snapshot, each restore recreates /dev/vdb byte-identical to the golden -// disk template, so an actor's rootfs writes do NOT persist into the next -// activation, while in-RAM state from the golden snapshot DOES. Two restores from -// the same golden snapshot: restore#1 writes a disk sentinel (runtime); restore#2 -// must NOT see it (disk reset), while the RAM sentinel survives both. 
-func TestServiceResetToGoldenBlkRootfs(t *testing.T) { - if os.Getenv("KATA_INTEGRATION") != "1" { - t.Skip("set KATA_INTEGRATION=1 to run (requires kata + /dev/kvm + root + e2fsprogs)") - } - rootfsSrc := os.Getenv("KATA_ROOTFS_SRC") - kernel, image, cfg := os.Getenv("KATA_KERNEL"), os.Getenv("KATA_IMAGE"), os.Getenv("KATA_CONFIG") - if rootfsSrc == "" || kernel == "" || image == "" || cfg == "" { - t.Fatal("KATA_ROOTFS_SRC, KATA_KERNEL, KATA_IMAGE, KATA_CONFIG are required") - } - chBin := envOrTest("KATA_CH", "/usr/local/bin/cloud-hypervisor") - - ns, name := "default", "e2e-blkrtg" - id := fmt.Sprintf("ateomchv-blkrtg-%d", os.Getpid()) - container := "app" - - bundle := ateompath.OCIBundlePath(ns, name, id, container) - if err := os.MkdirAll(filepath.Join(bundle, "rootfs"), 0o755); err != nil { - t.Fatal(err) - } - if out, err := exec.Command("cp", "-a", rootfsSrc+"/.", filepath.Join(bundle, "rootfs")+"/").CombinedOutput(); err != nil { - t.Fatalf("copying rootfs: %v: %s", err, out) - } - writeMinimalGvisorStyleSpec(t, bundle) - - podUID := "testpod-blkrtg" - _ = netns.DeleteNamed(ateompath.AteomNetNSName(podUID)) - interiorNetNS, err := createNetNSWithoutSwitching(ateompath.AteomNetNSName(podUID)) - if err != nil { - t.Fatalf("creating interior netns: %v", err) - } - svc := NewService(podUID, chBin, "", true, interiorNetNS, actorlog.NewActorLogger(actorlog.NewSyncedWriter(os.Stdout), false)) - ctx, cancel := context.WithTimeout(context.Background(), 300*time.Second) - defer cancel() - t.Cleanup(func() { - cctx, c := context.WithTimeout(context.Background(), 20*time.Second) - svc.teardownActor(cctx, id, svc.running[id], nil) - c() - _ = os.RemoveAll(ateompath.ActorPath(ns, name, id)) - _ = os.RemoveAll(kata.VMDir(id)) - _ = interiorNetNS.Close() - _ = netns.DeleteNamed(ateompath.AteomNetNSName(podUID)) - }) - - assets := map[string]string{assetKernel: kernel, assetImage: image, assetConfig: cfg, assetCH: chBin} - runReq := &ateompb.RunWorkloadRequest{ - 
ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id, - Spec: &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}}, - RuntimeAssetPaths: assets, - } - restoreReq := &ateompb.RestoreWorkloadRequest{ - ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id, - Spec: &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}}, - RuntimeAssetPaths: assets, - } - vsock := kata.VsockSocketPath(id) - const ramSentinel = "RAM_GOLDEN_OK_7777" - rootfsDir := "/run/kata-containers/" + id + "/rootfs" - const diskSentinel = "DISK_WRITE_SHOULD_RESET_9999" - - // --- Golden: run, plant an in-RAM sentinel, checkpoint (saves golden snapshot - // + golden disk template), tear down. --- - if _, err := svc.RunWorkload(ctx, runReq); err != nil { - t.Fatalf("RunWorkload: %v", err) - } - _ = kata.DebugConsoleDump(ctx, vsock, "echo "+ramSentinel+" > /run/ram-sentinel; sync") - if _, err := svc.CheckpointWorkload(ctx, &ateompb.CheckpointWorkloadRequest{ - ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id, - Spec: &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}}, - }); err != nil { - t.Fatalf("CheckpointWorkload: %v", err) - } - // golden disk template must have been saved. 
- if _, err := os.Stat(filepath.Join(ateompath.ActorPath(ns, name, id), "golden-rootfs.ext4")); err != nil { - t.Fatalf("golden rootfs template not saved: %v", err) - } - checkpointDir := ateompath.CheckpointStateDir(ns, name, id) - restoreDir := ateompath.RestoreStateDir(ns, name, id) - if err := os.MkdirAll(restoreDir, 0o700); err != nil { - t.Fatal(err) - } - if out, err := exec.Command("cp", "-a", checkpointDir+"/.", restoreDir+"/").CombinedOutput(); err != nil { - t.Fatalf("shipping snapshot: %v: %s", err, out) - } - t.Log("golden checkpoint OK (snapshot + golden disk template saved)") - - // --- Restore #1: disk reset from golden template; write a disk sentinel at - // runtime, confirm it lands, then tear down (discard). --- - if _, err := svc.RestoreWorkload(ctx, restoreReq); err != nil { - t.Fatalf("RestoreWorkload #1: %v", err) - } - if got := kata.DebugConsoleDump(ctx, vsock, "cat /run/ram-sentinel"); !strings.Contains(got, ramSentinel) { - t.Fatalf("restore#1 RAM continuity failed: %q", got) - } - _ = kata.DebugConsoleDump(ctx, vsock, "echo "+diskSentinel+" > "+rootfsDir+"/disk-sentinel; sync") - if got := kata.DebugConsoleDump(ctx, vsock, "cat "+rootfsDir+"/disk-sentinel"); !strings.Contains(got, diskSentinel) { - t.Fatalf("restore#1 disk sentinel did not land: %q", got) - } - t.Log("restore#1 OK: RAM sentinel present, disk sentinel written") - tdCtx, tdCancel := context.WithTimeout(ctx, 20*time.Second) - svc.teardownActor(tdCtx, id, svc.running[id], ch.NewClient(filepath.Join(kata.VMDir(id), "clh-api-restore.sock"))) - tdCancel() - delete(svc.running, id) - - // --- Restore #2: disk reset AGAIN from golden template — the disk sentinel - // from restore#1 must be GONE, while the golden RAM sentinel still survives. 
--- - if _, err := svc.RestoreWorkload(ctx, restoreReq); err != nil { - t.Fatalf("RestoreWorkload #2: %v", err) - } - if got := kata.DebugConsoleDump(ctx, vsock, "cat /run/ram-sentinel"); !strings.Contains(got, ramSentinel) { - t.Fatalf("restore#2 RAM continuity failed: %q", got) - } - got := kata.DebugConsoleDump(ctx, vsock, "cat "+rootfsDir+"/disk-sentinel 2>&1; echo END") - if strings.Contains(got, diskSentinel) { - t.Fatalf("reset-to-golden FAILED: disk sentinel persisted after restore#2: %q", got) - } - t.Logf("reset-to-golden OK: discarded the rootfs write (disk sentinel gone) while RAM continuity held: %q", strings.TrimSpace(got)) -} - -func lastLines(s string, n int) string { - lines := strings.Split(strings.TrimRight(s, "\n"), "\n") - if len(lines) > n { - lines = lines[len(lines)-n:] - } - return strings.Join(lines, "\n") + "\n" -} - -func envOrTest(key, def string) string { - if v := os.Getenv(key); v != "" { - return v - } - return def -} - -// writeMinimalGvisorStyleSpec writes a deliberately minimal OCI spec (no -// linux.resources / cgroupsPath) so the test exercises ensureKataCompatibleSpec. 
-func writeMinimalGvisorStyleSpec(t *testing.T, bundle string) { - t.Helper() - spec := map[string]any{ - "ociVersion": "1.0.2", - "process": map[string]any{ - "user": map[string]any{"uid": 0, "gid": 0}, - "args": []string{"sleep", "3600"}, - "env": []string{"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"}, - "cwd": "/", - "capabilities": map[string]any{ - "bounding": []string{"CAP_KILL", "CAP_AUDIT_WRITE", "CAP_NET_BIND_SERVICE"}, - "effective": []string{"CAP_KILL", "CAP_AUDIT_WRITE", "CAP_NET_BIND_SERVICE"}, - "permitted": []string{"CAP_KILL", "CAP_AUDIT_WRITE", "CAP_NET_BIND_SERVICE"}, - }, - }, - "root": map[string]any{"path": "rootfs", "readonly": false}, - "hostname": "ateomchv", - "mounts": []map[string]any{ - {"destination": "/proc", "type": "proc", "source": "proc"}, - {"destination": "/dev", "type": "tmpfs", "source": "tmpfs"}, - {"destination": "/sys", "type": "sysfs", "source": "sysfs", "options": []string{"nosuid", "noexec", "nodev", "ro"}}, - }, - "linux": map[string]any{ - "namespaces": []map[string]any{ - {"type": "pid"}, {"type": "network"}, {"type": "ipc"}, {"type": "uts"}, {"type": "mount"}, - }, - }, - } - b, err := json.MarshalIndent(spec, "", " ") - if err != nil { - t.Fatal(err) - } - if err := os.WriteFile(filepath.Join(bundle, "config.json"), b, 0o600); err != nil { - t.Fatal(err) - } -} From 59544fb19a3a4354b3345275520ee519499e0062 Mon Sep 17 00:00:00 2001 From: Benjamin Elder Date: Thu, 25 Jun 2026 18:38:37 -0700 Subject: [PATCH 4/5] demos,hack,manifests,docs: ship virtiofsd + kata 3.32 assets for the overlay rootfs The overlay rootfs serves the image over virtio-fs, so the asset set gains virtiofsd and moves to kata 3.32. virtiofsd is built from a pinned source commit because the vhost-0.16 snapshot/restore fix isn't in a release tag yet (tracking: gitlab.com/virtio-fs/virtiofsd work_items/236). 
assemble.sh builds it and the stagers upload it to rustfs (kind) and GCS (GKE); the counter-microvm SandboxConfig lists the virtiofsd asset for arm64 + amd64, and the sandboxconfig-assets VAP (with its envtest) now requires virtiofsd for every microvm architecture. The overlay formats nothing on the host, so it runs on the committed debian:stable-slim worker base: drop the custom worker base (hack/ateom-base) and its use in run-microvm-demo.sh. Update the asset README and architecture doc for the overlay. --- .ko.yaml | 10 +-- demos/counter/counter-microvm.yaml.tmpl | 43 +++++++---- docs/architecture.md | 4 +- hack/ateom-base/Dockerfile | 22 ------ hack/microvm-assets/README.md | 15 ++-- hack/microvm-assets/assemble.sh | 52 ++++++++++--- hack/microvm-assets/stage-to-gcs.sh | 2 +- hack/microvm-assets/stage-to-rustfs.sh | 2 +- hack/run-microvm-demo.sh | 74 +++++-------------- .../ate-install/sandboxconfig-validation.yaml | 4 +- .../v1alpha1/sandboxconfig_validation_test.go | 15 +++- 11 files changed, 120 insertions(+), 123 deletions(-) delete mode 100644 hack/ateom-base/Dockerfile diff --git a/.ko.yaml b/.ko.yaml index 8f61d60ea..4afb2e7e1 100644 --- a/.ko.yaml +++ b/.ko.yaml @@ -21,11 +21,7 @@ defaultPlatforms: baseImageOverrides: github.com/agent-substrate/substrate/demos/sandbox: alpine github.com/agent-substrate/substrate/demos/agent-secret: alpine - # ateom-microvm owns the cloud-hypervisor boot and builds the actor's writable - # virtio-blk rootfs at runtime, which needs mkfs.ext4 (e2fsprogs) plus glibc for - # the fetched cloud-hypervisor binary. The committed debian:stable-slim base has - # glibc + coreutils but NOT mkfs.ext4, so this default cannot build the rootfs on - # its own. hack/run-microvm-demo.sh builds hack/ateom-base (debian-slim + - # e2fsprogs) and overrides this base at build time via KO_CONFIG_PATH, so running - # the demo never edits this file. The committed default stays debian:stable-slim. 
+ # ateom-microvm needs glibc (for the fetched cloud-hypervisor binary) and mount/umount + # (to bind the image into the virtiofsd shared dir) — both in debian:stable-slim but + # not in the distroless static default. github.com/agent-substrate/substrate/cmd/ateom-microvm: debian:stable-slim diff --git a/demos/counter/counter-microvm.yaml.tmpl b/demos/counter/counter-microvm.yaml.tmpl index e9f4e31e2..ee8bf6a85 100644 --- a/demos/counter/counter-microvm.yaml.tmpl +++ b/demos/counter/counter-microvm.yaml.tmpl @@ -16,15 +16,18 @@ # in-RAM (atomic uint64), so a successful suspend/resume across pods shows the # count continuing — proving the guest memory snapshot round-tripped. # -# The sandbox binaries (cloud-hypervisor, guest kernel, guest rootfs, base -# configuration.toml) live on a cluster-scoped SandboxConfig, FETCHED at runtime +# The sandbox binaries (cloud-hypervisor, virtiofsd, guest kernel, guest rootfs, +# base configuration.toml) live on a cluster-scoped SandboxConfig, FETCHED at runtime # from the cluster object store bucket ${BUCKET_NAME} under kata-assets/ (rustfs on -# kind, GCS on GKE) — NOT baked into the worker image. ateom owns the -# cloud-hypervisor boot itself and gives the actor a writable virtio-blk rootfs, so -# neither the kata shim nor virtiofsd is needed. The per-arch sha256 values below -# are the asset sets produced by hack/microvm-assets/assemble.sh; atelet selects -# the block matching the node's architecture, and each cluster's bucket holds that -# arch's binaries at these paths (staged by run-microvm-demo[-kind].sh). +# kind, GCS on GKE) — NOT baked into the worker image. ateom boots cloud-hypervisor +# itself and gives the actor an overlay rootfs (virtio-fs RO lower + guest-tmpfs +# upper), so it fetches virtiofsd (built from a pinned upstream commit — vhost 0.16; the +# snapshot/restore fix is not in a release tag yet). The kata containerd shim is NOT +# fetched (ateom drives the kata-agent directly). kata assets are 3.32.0. 
+# The per-arch sha256 values below are the asset sets produced by +# hack/microvm-assets/assemble.sh; atelet selects the block matching the node's +# architecture, and each cluster's bucket holds that arch's binaries at these paths +# (staged by run-microvm-demo[-kind].sh). apiVersion: v1 kind: Namespace @@ -44,25 +47,39 @@ spec: cloud-hypervisor: url: "gs://${BUCKET_NAME}/kata-assets/cloud-hypervisor" sha256: "bf004ddc1a148f47caa87ac49a783b8dbd6bf9bc27abe522ed197df7b982d3b1" + # virtiofsd serves the overlay RO lower (virtio-fs); built from a pinned source + # commit (vhost 0.16 — the snapshot/restore fix is not in a release tag yet). + virtiofsd: + url: "gs://${BUCKET_NAME}/kata-assets/virtiofsd" + # virtiofsd is built from source (pinned commit in assemble.sh), so its binary + # is not byte-reproducible across toolchains/arches and can't carry a fixed pin. + # run-microvm-demo.sh computes this from the freshly-staged binary at deploy. + sha256: "${VIRTIOFSD_SHA256}" kata-kernel: url: "gs://${BUCKET_NAME}/kata-assets/vmlinux" - sha256: "a44d663f4ddad20a35527a3578fadef9beb23c1e5cb720e85d6928d6de70d3a1" + sha256: "f437320bab94f19105d12b932aa29735f0d54d2588218872254367f312c1027c" kata-image: url: "gs://${BUCKET_NAME}/kata-assets/rootfs.img" - sha256: "7ebd652760c881374c0a761d34addcb76d9a650e35c10c01b780ebcdd9a1f2aa" + sha256: "31ffb41177571c5654d3a28a2728eaac9d6d3daed90bb993f64e0b4b3ca6b235" kata-config: url: "gs://${BUCKET_NAME}/kata-assets/configuration-clh.toml" - sha256: "df504d9be0ed01765fdc8a9467955e1e671eb97724443f65a524bf914ccb818b" + sha256: "8a09a40543a527dbdc3ff26d229bae0de9aebb655475c28d7e5482dbedefa030" + # amd64 assets are kata 3.32 + virtiofsd (assemble.sh ARCH=amd64), matching arm64. 
amd64: cloud-hypervisor: url: "gs://${BUCKET_NAME}/kata-assets/cloud-hypervisor" sha256: "829af01ff075bb96c4f183905134c453a88d68cbabdc6b87df21098842581ee9" + virtiofsd: + url: "gs://${BUCKET_NAME}/kata-assets/virtiofsd" + # Built from source (see the arm64 note above); sha injected at deploy by + # run-microvm-demo.sh from the freshly-staged binary. + sha256: "${VIRTIOFSD_SHA256}" kata-kernel: url: "gs://${BUCKET_NAME}/kata-assets/vmlinux" - sha256: "a5f0af5fe536cd52c3ca214d15d81c577e5c5dc672947ab7980b91ddcb7c9d71" + sha256: "43701715ae2885f936bbe5c66a2de7c14dc51de7d19412d04833e4bbcf205bd0" kata-image: url: "gs://${BUCKET_NAME}/kata-assets/rootfs.img" - sha256: "ca9e06621b7edd2e056607c04db8bcebd92ad37ad4e37d18b8247d851feb0fae" + sha256: "e9548ff64f51c120791d3a2d1a81ebfd275df2bf0737368bd3e6381a6e967855" kata-config: url: "gs://${BUCKET_NAME}/kata-assets/configuration-clh.toml" sha256: "8cce580e5abf78c05c8e9b929c24a524b9a81fc47be4e2e4f38dcae5ef052be6" diff --git a/docs/architecture.md b/docs/architecture.md index 1e44dc84c..a00419f57 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -336,9 +336,9 @@ The node-level subsystem manages the physical execution of sandboxes and the mov A `WorkerPool` selects a **sandbox class** (`spec.sandboxClass`), and each class has a matching `ateom` herder image. The sandbox binaries themselves are not baked into the worker image — they are fetched at runtime from a cluster-scoped [`SandboxConfig`](api-guide.md#3-sandboxconfig-sandbox-binaries) and pinned into each snapshot's manifest so restores stay reproducible across runtime upgrades. - * **gVisor** (`ateom-gvisor`, the default): runs the workload under `runsc`. Suspend/resume uses gVisor's checkpoint/restore of the sandboxed process tree. + * **gVisor** (`ateom-gvisor`, the default): Runs the workload under `runsc` for kernel-level sandboxing. Suspend and resume leverage gVisor's native checkpoint/restore of the sandboxed process tree. 
- * **micro-VM** (`ateom-microvm`): runs the workload inside a [Kata Containers](https://katacontainers.io/) guest (Kata 3.31 guest assets) on the [Cloud Hypervisor](https://www.cloudhypervisor.org/) VMM. `ateom` owns the Cloud Hypervisor boot directly — there is **no Kata shim and no containerd daemon**: it launches Cloud Hypervisor, boots the guest kernel + OS image, and then drives the Kata agent over its hybrid-vsock ttrpc API itself (creating the sandbox, configuring guest networking, and starting the container). The actor's container rootfs is a writable boot-time virtio-blk disk (`/dev/vdb`) that `ateom` builds with `mkfs.ext4` from the OCI bundle, so rootfs writes land off guest RAM on a host-backed disk. Suspend captures a Cloud Hypervisor **memory-only snapshot** of the running guest (no memory balloon); resume relaunches Cloud Hypervisor with its **OnDemand** (userfaultfd) memory restore — demand-paging from the snapshot while a diff-merge folds newly-faulted pages back in to keep the snapshot complete — so full in-RAM state comes back on any worker, including a different node. On each restore `/dev/vdb` is recreated byte-identical to the golden image (**reset-to-golden**), so rootfs writes are discarded across suspend/resume (matching gVisor's semantics) while in-RAM state persists. The actor container's stdout/stderr is forwarded to the pod log with `ate.dev/*` labels (parity with `ateom-gvisor`). Micro-VM workers require `/dev/kvm` and nested-virtualization-capable nodes; the controller adds the KVM device mount and pins these pods to nodes labeled `ate.dev/sandboxClass=microvm`. See [`hack/microvm-assets/`](../hack/microvm-assets/) for assembling the asset set. + * **micro-VM** (`ateom-microvm`): Runs the workload inside a [Kata Containers](https://katacontainers.io/) guest on the [Cloud Hypervisor](https://www.cloudhypervisor.org/) VMM. 
Suspend and resume capture a memory-only VM snapshot and restore it on-demand using `userfaultfd` memory demand-paging, with container rootfs writes captured in guest RAM via a `tmpfs` overlay. ### Networking Stack (`atenet` + Envoy) diff --git a/hack/ateom-base/Dockerfile b/hack/ateom-base/Dockerfile deleted file mode 100644 index 9ac88ca0d..000000000 --- a/hack/ateom-base/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2026 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -FROM debian:stable-slim -# e2fsprogs provides mkfs.ext4, which the ateom-microvm worker uses to build the -# actor's writable /dev/vdb ext4 rootfs from the OCI bundle. debian-slim also -# provides coreutils (cp --sparse) and glibc for the fetched cloud-hypervisor binary. -RUN apt-get update \ - && apt-get install -y --no-install-recommends e2fsprogs \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/hack/microvm-assets/README.md b/hack/microvm-assets/README.md index 5a43e095c..3f21a73a8 100644 --- a/hack/microvm-assets/README.md +++ b/hack/microvm-assets/README.md @@ -1,11 +1,14 @@ # Micro-VM runtime assets + counter demo (kind, fetch-not-bake) The `microvm` runtime (`cmd/ateom-microvm`, kata + cloud-hypervisor) fetches its -toolchain at runtime — nothing kata-specific is baked into the worker image. ateom owns -the cloud-hypervisor boot and gives the actor a writable virtio-blk rootfs, so neither the -kata shim nor virtiofsd is needed. 
The asset set is just four files: +toolchain at runtime — nothing kata-specific is baked into the worker image. ateom drives +the kata-agent directly (no kata shim, no containerd). Each actor container's rootfs is an +overlay of a read-only lower (the OCI image, served into the guest over virtio-fs by +`virtiofsd`) and a writable upper on a guest tmpfs, so `virtiofsd` is part of the asset +set. The asset set is five files: - `cloud-hypervisor` — the VMM binary (fetched from its release) +- `virtiofsd` — the virtio-fs daemon serving the RO lower (built from source; see `assemble.sh`) - `vmlinux` — the guest kernel (from kata-static) - `rootfs.img` — the guest rootfs image (from kata-static) - `configuration-clh.toml` — the base kata config (from kata-static) @@ -16,9 +19,9 @@ available, `hack/create-kind-cluster.sh` mounts it into the node and labels the `ate.dev/sandboxClass=microvm`. > [!TIP] -> `hack/run-microvm-demo.sh` automates the full bring-up below (ateom-base image, ko base -> override, assets, control plane, demo apply) for kind OR GKE without editing committed -> files. The steps here are the manual equivalent. +> `hack/run-microvm-demo.sh` automates the full bring-up below (assets, control plane, +> demo apply) for kind OR GKE without editing committed files. The steps here are the +> manual equivalent. ## Steps (run on a KVM-capable Linux host matching the node arch) diff --git a/hack/microvm-assets/assemble.sh b/hack/microvm-assets/assemble.sh index e12a4d1b3..0b71469fe 100755 --- a/hack/microvm-assets/assemble.sh +++ b/hack/microvm-assets/assemble.sh @@ -18,14 +18,22 @@ # ateom-microvm fetches at runtime (fetch-not-bake). Run this on a Linux # host of the TARGET arch. 
# -# Produces, under $OUT, the four assets named as the SandboxConfig expects, plus -# their sha256 sums (paste into demos/counter/counter-microvm.yaml.tmpl): -# cloud-hypervisor vmlinux rootfs.img configuration-clh.toml +# Produces, under $OUT, the five assets named as the SandboxConfig expects: +# cloud-hypervisor virtiofsd vmlinux rootfs.img configuration-clh.toml +# The four DOWNLOADED assets are reproducible, so paste their sha256 sums into the +# manifest (demos/counter/counter-microvm.yaml.tmpl). virtiofsd is built from source +# (non-reproducible bytes), so its sha is NOT pinned there — run-microvm-demo.sh +# computes it from the staged binary and injects it at deploy. # -# ateom owns the cloud-hypervisor boot and gives the actor a writable virtio-blk -# rootfs, so neither the kata shim nor virtiofsd is part of the asset set. +# ateom drives the kata-agent directly (the kata containerd shim is NOT an asset). The +# actor rootfs is overlay(virtio-fs RO lower + guest-tmpfs upper), so virtiofsd IS an +# asset; it is built from source (pinned commit, see VIRTIOFSD_COMMIT below) because the +# vhost-0.16 snapshot/restore fix (REPLY_ACK) is not in a release tag yet — the +# kata-bundled v1.13.3 virtiofsd hangs CH's restore handshake. Tracking issue: +# https://gitlab.com/virtio-fs/virtiofsd/-/work_items/236 — switch to a release once it +# lands. Building it needs rust (rustup) + libcap-ng-dev libseccomp-dev pkg-config. # -# Env: ARCH (arm64|amd64, default arm64), KATA_VER (3.31.0), CH_VER (v52.0), +# Env: ARCH (arm64|amd64, default arm64), KATA_VER (3.32.0), CH_VER (v52.0), # OUT (default ./bin/microvm-assets/$ARCH, under the gitignored bin/). 
set -o errexit -o nounset -o pipefail @@ -33,7 +41,7 @@ set -o errexit -o nounset -o pipefail ROOT="$(git rev-parse --show-toplevel)" ARCH="${ARCH:-arm64}" -KATA_VER="${KATA_VER:-3.31.0}" +KATA_VER="${KATA_VER:-3.32.0}" CH_VER="${CH_VER:-v52.0}" OUT="${OUT:-${ROOT}/bin/microvm-assets/$ARCH}" WORK="$(mktemp -d)" @@ -64,12 +72,36 @@ curl -fSL -o "${OUT}/cloud-hypervisor" \ "https://github.com/cloud-hypervisor/cloud-hypervisor/releases/download/${CH_VER}/${CH_ASSET}" chmod +x "${OUT}/cloud-hypervisor" +# virtiofsd pinned commit. The vhost-0.16 / vhost-user-backend-0.22 snapshot-restore +# fix (REPLY_ACK) is upstream but not in a release tag yet (tracking issue +# https://gitlab.com/virtio-fs/virtiofsd/-/work_items/236) — the kata-bundled v1.13.3 +# (old vhost) hangs CH's restore handshake. Pin a known-good commit until a release +# carries the fix. +VIRTIOFSD_COMMIT="acb3d506a9f1b256fff7327023df85570caf1e75" +echo ">> Building virtiofsd @ ${VIRTIOFSD_COMMIT} (vhost 0.16)..." +# Build deps (Debian): apt-get install -y git libcap-ng-dev libseccomp-dev pkg-config; rust via rustup. +if ! 
command -v cargo >/dev/null 2>&1; then + echo "cargo not found; install rust (rustup) + libcap-ng-dev libseccomp-dev pkg-config" >&2 + exit 1 +fi +git clone https://gitlab.com/virtio-fs/virtiofsd.git +( + cd virtiofsd + git checkout --quiet "${VIRTIOFSD_COMMIT}" + grep -E '^(vhost|vhost-user-backend) =' Cargo.toml # expect vhost 0.16 / backend 0.22 + cargo build --release +) +cp "virtiofsd/target/release/virtiofsd" "${OUT}/virtiofsd" +chmod +x "${OUT}/virtiofsd" + echo echo ">> Assets assembled in ${OUT}:" cd "${OUT}" -for f in cloud-hypervisor vmlinux rootfs.img configuration-clh.toml; do +for f in cloud-hypervisor virtiofsd vmlinux rootfs.img configuration-clh.toml; do [ -f "$f" ] || { echo "MISSING: $f" >&2; exit 1; } done +"${OUT}/virtiofsd" --version 2>/dev/null | head -1 || true echo -echo ">> sha256 (paste into demos/counter/counter-microvm.yaml.tmpl runtime.assets):" -sha256sum cloud-hypervisor vmlinux rootfs.img configuration-clh.toml +echo ">> sha256 (paste the DOWNLOADED assets into counter-microvm.yaml.tmpl;" +echo ">> virtiofsd's sha is injected at deploy by run-microvm-demo.sh, not pinned):" +sha256sum cloud-hypervisor virtiofsd vmlinux rootfs.img configuration-clh.toml diff --git a/hack/microvm-assets/stage-to-gcs.sh b/hack/microvm-assets/stage-to-gcs.sh index 6ec9e5c12..cab3bdce0 100755 --- a/hack/microvm-assets/stage-to-gcs.sh +++ b/hack/microvm-assets/stage-to-gcs.sh @@ -33,7 +33,7 @@ BUCKET="${BUCKET:-ate-snapshots}" # gcloud uses its active config project. ${PROJECT_ID:+...} elides the flag entirely # when unset (same idiom as KUBECTL_CONTEXT in hack/run-microvm-demo.sh). echo ">> Uploading assets to gs://${BUCKET}/kata-assets/ ..." 
-for f in cloud-hypervisor vmlinux rootfs.img configuration-clh.toml; do +for f in cloud-hypervisor virtiofsd vmlinux rootfs.img configuration-clh.toml; do echo " $f" gcloud storage cp ${PROJECT_ID:+--project="${PROJECT_ID}"} "${OUT}/${f}" "gs://${BUCKET}/kata-assets/${f}" done diff --git a/hack/microvm-assets/stage-to-rustfs.sh b/hack/microvm-assets/stage-to-rustfs.sh index 3f705dcfc..48875b656 100755 --- a/hack/microvm-assets/stage-to-rustfs.sh +++ b/hack/microvm-assets/stage-to-rustfs.sh @@ -48,7 +48,7 @@ sleep 3 ENDPOINT="http://localhost:9000" echo ">> Uploading assets to s3://${BUCKET}/kata-assets/ via ${ENDPOINT}..." -for f in cloud-hypervisor vmlinux rootfs.img configuration-clh.toml; do +for f in cloud-hypervisor virtiofsd vmlinux rootfs.img configuration-clh.toml; do echo " $f" aws --endpoint-url "${ENDPOINT}" s3 cp "${OUT}/${f}" "s3://${BUCKET}/kata-assets/${f}" done diff --git a/hack/run-microvm-demo.sh b/hack/run-microvm-demo.sh index 2ad3009c1..fe5b19eac 100755 --- a/hack/run-microvm-demo.sh +++ b/hack/run-microvm-demo.sh @@ -21,12 +21,6 @@ # Like the other hack scripts, this sources .ate-dev-env.sh for the cluster / # registry / bucket settings unless NO_DEV_ENV is set. # -# The committed .ko.yaml base for cmd/ateom-microvm is debian:stable-slim, which -# lacks mkfs.ext4 (e2fsprogs). The worker needs mkfs.ext4 at runtime to build the -# actor's virtio-blk rootfs, so this script builds hack/ateom-base (debian-slim + -# e2fsprogs) and overrides ONLY that base at build time via a throwaway ko config -# pointed at by KO_CONFIG_PATH — the committed .ko.yaml is never touched. -# # Env (most come from .ate-dev-env.sh): # KO_DOCKER_REPO (required) image registry, e.g. gcr.io/PROJECT/ate-images for # GKE or localhost:5001 for kind. @@ -34,7 +28,6 @@ # KUBECTL_CONTEXT (optional) kube context; threaded into install + ko apply + kubectl. # PROJECT_ID (optional) GCP project for the GCS asset upload (GKE path). 
# ARCH target arch (default: from KO_DEFAULTPLATFORMS, else host arch). -# ATEOM_BASE_TAG tag for the built ateom-base image (default: e2fsprogs). # OUT asset dir (default: $PWD/bin/microvm-assets/$ARCH, gitignored). # ATE_INSTALL_KIND "true" for the kind path (stage assets to rustfs + install-ate-kind.sh); # default false uploads assets to GCS + uses install-ate.sh. @@ -54,7 +47,6 @@ fi KO_DOCKER_REPO="${KO_DOCKER_REPO:-}" KUBECTL_CONTEXT="${KUBECTL_CONTEXT:-}" BUCKET_NAME="${BUCKET_NAME:-ate-snapshots}" -ATEOM_BASE_TAG="${ATEOM_BASE_TAG:-e2fsprogs}" ATE_INSTALL_KIND="${ATE_INSTALL_KIND:-false}" # Target arch: match the images' platform (KO_DEFAULTPLATFORMS is set by @@ -82,47 +74,9 @@ log() { echo -e "${COLOR_CYAN}[run-microvm-demo]: $*${COLOR_RESET}" } -ATEOM_BASE_IMAGE="${KO_DOCKER_REPO}/ateom-base:${ATEOM_BASE_TAG}" - -# --- 2. build + push ateom-base (debian-slim + e2fsprogs) for the target arch - -# We build with buildx --load (import into the local docker daemon) and then -# `docker push`, NOT buildx --push: the buildkit builder runs in its own container -# and cannot reach a localhost/kind registry, whereas the docker daemon can. --load -# imports a single-platform image fine even when ARCH != the host arch. For a real -# remote registry (e.g. gcr.io) the same daemon `docker push` works with its creds. -log "Building ateom-base ${ATEOM_BASE_IMAGE} (linux/${ARCH})..." -if docker buildx version >/dev/null 2>&1; then - log " using: docker buildx build --load + docker push" - docker buildx build --platform "linux/${ARCH}" -t "${ATEOM_BASE_IMAGE}" --load hack/ateom-base -else - log " using: docker build + docker push (buildx unavailable)" - docker build -t "${ATEOM_BASE_IMAGE}" hack/ateom-base -fi -docker push "${ATEOM_BASE_IMAGE}" - -# --- 3. 
throwaway ko config overriding ONLY the ateom-microvm base ----------- -# KO_CONFIG_PATH points at a FILE that ko parses by extension, so it MUST end in -# .yaml (a bare mktemp file is rejected: "Unsupported Config Type"). Use a temp dir -# with a .yaml-named copy of the repo .ko.yaml and swap the one base line. -KO_CONFIG_DIR="$(mktemp -d)" -KO_CONFIG_TMP="${KO_CONFIG_DIR}/ko-override.yaml" -trap 'rm -rf "${KO_CONFIG_DIR}"' EXIT -cp "${ROOT}/.ko.yaml" "${KO_CONFIG_TMP}" - -OVERRIDE_KEY="github.com/agent-substrate/substrate/cmd/ateom-microvm" -if ! grep -q "^ ${OVERRIDE_KEY}:" "${KO_CONFIG_TMP}"; then - echo "Error: could not find the cmd/ateom-microvm baseImageOverride line in .ko.yaml" >&2 - exit 1 -fi -# Replace only the value after the key (use | as the sed delimiter; the value has /). -sed -i.bak "s|^ ${OVERRIDE_KEY}:.*| ${OVERRIDE_KEY}: ${ATEOM_BASE_IMAGE}|" "${KO_CONFIG_TMP}" -rm -f "${KO_CONFIG_TMP}.bak" -export KO_CONFIG_PATH="${KO_CONFIG_TMP}" -log "Using throwaway KO_CONFIG_PATH=${KO_CONFIG_PATH} (ateom-microvm base -> ${ATEOM_BASE_IMAGE})" - -# --- 4. assets: assemble (if missing) then stage to rustfs (kind) / GCS (GKE) -- +# --- 2. assets: assemble (if missing) then stage to rustfs (kind) / GCS (GKE) -- need_assemble=false -for f in cloud-hypervisor vmlinux rootfs.img configuration-clh.toml; do +for f in cloud-hypervisor virtiofsd vmlinux rootfs.img configuration-clh.toml; do if [[ ! -f "${OUT}/${f}" ]]; then need_assemble=true break @@ -135,7 +89,7 @@ else log "Assets already present in ${OUT}; skipping assemble." fi -# Upload the four assets under kata-assets/, where atelet fetches them: the +# Upload the five assets under kata-assets/, where atelet fetches them: the # in-cluster rustfs (port-forwarded, S3 API) on kind, or the GCS bucket on GKE. if [[ "${ATE_INSTALL_KIND}" == "true" ]]; then log "Staging assets to in-cluster rustfs bucket ${BUCKET_NAME} (kata-assets/)..." 
@@ -145,7 +99,7 @@ else OUT="${OUT}" BUCKET="${BUCKET_NAME}" hack/microvm-assets/stage-to-gcs.sh fi -# --- 5. deploy the control plane -------------------------------------------- +# --- 3. deploy the control plane -------------------------------------------- log "Deploying the ate control plane (--deploy-ate-system)..." if [[ "${ATE_INSTALL_KIND}" == "true" ]]; then # install-ate-kind.sh sets NO_DEV_ENV/KO_DOCKER_REPO/ARCH/ATE_INSTALL_KIND itself. @@ -155,15 +109,23 @@ else KUBECTL_CONTEXT="${KUBECTL_CONTEXT}" hack/install-ate.sh --deploy-ate-system fi -# --- 6. apply the demo ------------------------------------------------------ -# Use ./hack/run-tool.sh ko so ko honors KO_CONFIG_PATH + KO_DOCKER_REPO. Only -# ko apply/create/delete/run accept args after `--`; thread --context there -# (mirrors the run_ko helper in hack/install-ate.sh). +# --- 4. apply the demo ------------------------------------------------------ +# Use ./hack/run-tool.sh ko so ko honors KO_DOCKER_REPO (the committed .ko.yaml base +# is used as-is — no override). Only ko apply/create/delete/run accept args after +# `--`; thread --context there (mirrors the run_ko helper in hack/install-ate.sh). log "Applying the counter-microvm demo manifest..." -sed "s|\${BUCKET_NAME}|${BUCKET_NAME}|g" demos/counter/counter-microvm.yaml.tmpl \ +# virtiofsd is built from source (pinned commit in assemble.sh), so its binary bytes +# are not reproducible across toolchains/arches and its sha can't be a fixed pin in the +# manifest. Compute it from the freshly-staged binary and inject it, so the deployed +# SandboxConfig always matches whatever was staged. The downloaded assets +# (cloud-hypervisor/kernel/rootfs/config) keep their committed, reproducible per-arch shas. 
+VIRTIOFSD_SHA256="$(sha256sum "${OUT}/virtiofsd" | awk '{print $1}')" +sed -e "s|\${BUCKET_NAME}|${BUCKET_NAME}|g" \ + -e "s|\${VIRTIOFSD_SHA256}|${VIRTIOFSD_SHA256}|g" \ + demos/counter/counter-microvm.yaml.tmpl \ | ./hack/run-tool.sh ko apply -f - ${KUBECTL_CONTEXT:+-- --context="${KUBECTL_CONTEXT}"} -# --- 7. next steps ---------------------------------------------------------- +# --- 5. next steps ---------------------------------------------------------- KCTX_FLAG="" if [[ -n "${KUBECTL_CONTEXT}" ]]; then KCTX_FLAG=" --context=${KUBECTL_CONTEXT}" diff --git a/manifests/ate-install/sandboxconfig-validation.yaml b/manifests/ate-install/sandboxconfig-validation.yaml index 7e96857f8..d350d10b4 100644 --- a/manifests/ate-install/sandboxconfig-validation.yaml +++ b/manifests/ate-install/sandboxconfig-validation.yaml @@ -41,9 +41,9 @@ spec: object.spec.sandboxClass != 'microvm' || (has(object.spec.assets) && size(object.spec.assets) > 0 && object.spec.assets.all(arch, - ['cloud-hypervisor', 'kata-kernel', 'kata-image', 'kata-config'] + ['cloud-hypervisor', 'virtiofsd', 'kata-kernel', 'kata-image', 'kata-config'] .all(name, name in object.spec.assets[arch]))) - message: "a microvm SandboxConfig must define cloud-hypervisor, kata-kernel, kata-image, and kata-config assets for every architecture under spec.assets" + message: "a microvm SandboxConfig must define cloud-hypervisor, virtiofsd, kata-kernel, kata-image, and kata-config assets for every architecture under spec.assets" --- apiVersion: admissionregistration.k8s.io/v1 kind: ValidatingAdmissionPolicyBinding diff --git a/pkg/api/v1alpha1/sandboxconfig_validation_test.go b/pkg/api/v1alpha1/sandboxconfig_validation_test.go index 3bb28dbba..20423e674 100644 --- a/pkg/api/v1alpha1/sandboxconfig_validation_test.go +++ b/pkg/api/v1alpha1/sandboxconfig_validation_test.go @@ -48,13 +48,13 @@ func sandboxConfig(name string, class SandboxClass, assets map[string]map[string func runscAsset() AssetFile { return AssetFile{URL: 
"gs://bucket/runsc", SHA256: validSHA256} } // microVMAssets returns a full, valid micro-VM asset set for one architecture: -// the four assets the policy requires. ateom owns the cloud-hypervisor boot and -// gives the actor a writable virtio-blk rootfs, so the set has no kata-shim or -// virtiofsd. +// the five assets the policy requires. The overlay rootfs serves the OCI image +// over virtio-fs, so virtiofsd is part of the set. func microVMAssets() map[string]AssetFile { a := AssetFile{URL: "gs://bucket/asset", SHA256: validSHA256} return map[string]AssetFile{ "cloud-hypervisor": a, + "virtiofsd": a, "kata-kernel": a, "kata-image": a, "kata-config": a, @@ -133,6 +133,15 @@ func TestSandboxConfigValidation(t *testing.T) { }()}), wantErr: true, errMsg: "microvm SandboxConfig must define", + }, { + name: "microvm missing virtiofsd", + sc: sandboxConfig("bad-microvm-novfsd", "microvm", map[string]map[string]AssetFile{"amd64": func() map[string]AssetFile { + m := microVMAssets() + delete(m, "virtiofsd") + return m + }()}), + wantErr: true, + errMsg: "microvm SandboxConfig must define", }, { name: "gvisor arch missing runsc", sc: sandboxConfig("bad-no-runsc", SandboxClassGvisor, map[string]map[string]AssetFile{"amd64": {"notrunsc": runscAsset()}}), From 31bd53b19672bb87cbaf042a4451270f57b4b486 Mon Sep 17 00:00:00 2001 From: Benjamin Elder Date: Thu, 25 Jun 2026 18:38:37 -0700 Subject: [PATCH 5/5] ateom-microvm: drop stale owned-boot / reset-to-golden comments Terminology and accuracy fixup in files the overlay change doesn't otherwise touch: the runtime no longer resets the rootfs to golden (the overlay's tmpfs upper persists in the memory snapshot), and "owned-boot" was local jargon for ateom booting cloud-hypervisor itself. Comments only. 
--- cmd/ateom-microvm/internal/kata/config.go | 6 +++--- cmd/ateom-microvm/internal/kata/config_test.go | 2 +- cmd/ateom-microvm/net.go | 10 +++++----- cmd/ateom-microvm/spec.go | 9 ++++----- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/cmd/ateom-microvm/internal/kata/config.go b/cmd/ateom-microvm/internal/kata/config.go index 553059d49..7591e402f 100644 --- a/cmd/ateom-microvm/internal/kata/config.go +++ b/cmd/ateom-microvm/internal/kata/config.go @@ -32,8 +32,8 @@ type KataConfig struct { VCPUs int // KernelParams is the guest kernel command line ([hypervisor.clh] // kernel_params): the kata-agent parameters (agent.log, the systemd target, - // etc.). The owned boot appends these to the cloud-hypervisor payload cmdline, - // since there is no kata shim to inject them. + // etc.). ateom appends these to the cloud-hypervisor payload cmdline, since + // there is no kata shim to inject them. KernelParams string } @@ -52,7 +52,7 @@ type clhConfigTOML struct { // ParseConfig reads the guest sizing and kernel_params from a kata // configuration.toml. memDefault/vcpuDefault are substituted when the key is // absent or non-positive (kata also accepts default_vcpus = -1 meaning "all host -// CPUs", which the owned boot does not support). +// CPUs", which ateom does not support). func ParseConfig(base []byte, memDefault, vcpuDefault int) (KataConfig, error) { var c clhConfigTOML if err := toml.Unmarshal(base, &c); err != nil { diff --git a/cmd/ateom-microvm/internal/kata/config_test.go b/cmd/ateom-microvm/internal/kata/config_test.go index 776aa7e00..dbced02ff 100644 --- a/cmd/ateom-microvm/internal/kata/config_test.go +++ b/cmd/ateom-microvm/internal/kata/config_test.go @@ -49,7 +49,7 @@ func TestParseConfig(t *testing.T) { // TestParseConfigDefaults asserts the mem/vcpu defaults kick in when the keys are // absent or non-positive (kata also accepts default_vcpus = -1 meaning "all host -// CPUs", which the owned boot does not support). 
+// CPUs", which ateom does not support). func TestParseConfigDefaults(t *testing.T) { for _, tc := range []struct { name string diff --git a/cmd/ateom-microvm/net.go b/cmd/ateom-microvm/net.go index e74db2bd3..cc047d4f2 100644 --- a/cmd/ateom-microvm/net.go +++ b/cmd/ateom-microvm/net.go @@ -69,9 +69,9 @@ const ( // gateway MAC keeps the frozen entry valid on every pod. hostVethMAC = "02:a8:1e:00:00:01" - // actorGuestMAC is the FIXED MAC for the guest's eth0 (the CH virtio-net) on - // the ateom-owned-boot path. Fixed for the same reason as hostVethMAC: a cold - // boot freezes this MAC into the guest+snapshot, and restore re-adds the + // actorGuestMAC is the FIXED MAC for the guest's eth0 (the CH virtio-net). + // Fixed for the same reason as hostVethMAC: a cold boot freezes this MAC into + // the guest+snapshot, and restore re-adds the // virtio-net under the same MAC (SnapshotNetDevices reads it back), so the // guest's frozen interface config stays valid across pods. Distinct from the // gateway MAC (…:01). @@ -599,8 +599,8 @@ func (s *AteomService) setupRestoreTap(ctx context.Context, name string, queuePa return fds, nil } -// actorVethMTU reads the MTU of the actor veth (eth0 in the interior netns) so the -// owned-boot path can configure the guest eth0 with a matching MTU via the agent +// actorVethMTU reads the MTU of the actor veth (eth0 in the interior netns) so +// ateom can configure the guest eth0 with a matching MTU via the agent // (UpdateInterface). Defaults to 1500 if the link can't be read. 
func (s *AteomService) actorVethMTU(ctx context.Context) int { mtu := 1500 diff --git a/cmd/ateom-microvm/spec.go b/cmd/ateom-microvm/spec.go index 8b9d5ca08..7962bc5aa 100644 --- a/cmd/ateom-microvm/spec.go +++ b/cmd/ateom-microvm/spec.go @@ -90,11 +90,10 @@ func ensureKataCompatibleSpec(bundle, id, netnsPath string) (*specs.Spec, error) // // KNOWN GAP vs the gVisor runtime: this also drops atelet's read-only actor // identity bind mount (/run/ate/actor-id). The micro-VM guest can't see host - // paths (the rootfs is a virtio-blk disk, not a shared filesystem), and - // reset-to-golden restores guest RAM + rootfs from the golden snapshot, so a - // per-actor file written into the rootfs would be shadowed/incorrect on restore. - // Exposing the identity needs a per-actor volume injected from OUTSIDE the golden - // state; not yet implemented. No micro-VM workload depends on it today. + // paths (the rootfs is an overlay of a virtio-fs base + a guest-RAM upper, not a + // host bind), so atelet's host-path identity mount has nothing to bind to. + // Exposing the identity needs a per-actor volume plumbed into the guest; not yet + // implemented. No micro-VM workload depends on it today. spec.Mounts = defaultKataMounts() out, err := json.MarshalIndent(&spec, "", " ")