From fa991e279fee4fd209c8b455e84af372f261222d Mon Sep 17 00:00:00 2001
From: Benjamin Elder <bentheelder@google.com>
Date: Thu, 25 Jun 2026 17:39:33 -0700
Subject: [PATCH 1/5] ateom-microvm/ch: add virtio-fs and platform devices to
 the VM config

The overlay rootfs serves each container's image read-only over virtio-fs, which
needs a vhost-user fs device (a virtiofsd socket) and more than one PCI segment (the
fs device sits on segment 1, kata's convention). Add FsConfig + PlatformConfig and the
Fs/Platform fields to VmConfig; both new fields are omitempty, so a config that leaves
them unset is serialized exactly as before.
---
 cmd/ateom-microvm/internal/ch/createvm.go | 47 +++++++++++++++--------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/cmd/ateom-microvm/internal/ch/createvm.go b/cmd/ateom-microvm/internal/ch/createvm.go
index f24266653..51d0ea4a3 100644
--- a/cmd/ateom-microvm/internal/ch/createvm.go
+++ b/cmd/ateom-microvm/internal/ch/createvm.go
@@ -20,19 +20,35 @@ import (
 )
 
 // VmConfig is the body of /api/v1/vm.create — the subset of cloud-hypervisor's
-// VmConfig ateom sets to boot a kata guest itself (the "ateom owns the boot"
-// path, replacing the kata shim). Modeled on kata's clh driver
-// (src/runtime/virtcontainers/clh.go) and the proven suspend-bench vmConfig.
-// vm.create + vm.boot use PUT (empirically accepted by CH, like the bench).
+// VmConfig ateom sets to boot the kata guest. Modeled on kata's clh driver
+// (src/runtime/virtcontainers/clh.go). vm.create + vm.boot are issued with PUT.
 type VmConfig struct {
-	Cpus    CpusConfig     `json:"cpus"`
-	Memory  MemoryConfig   `json:"memory"`
-	Payload PayloadConfig  `json:"payload"`
-	Disks   []DiskConfig   `json:"disks,omitempty"`
-	Rng     *RngConfig     `json:"rng,omitempty"`
-	Serial  *ConsoleConfig `json:"serial,omitempty"`
-	Console *ConsoleConfig `json:"console,omitempty"`
-	Vsock   *VsockConfig   `json:"vsock,omitempty"`
+	Cpus     CpusConfig      `json:"cpus"`
+	Memory   MemoryConfig    `json:"memory"`
+	Payload  PayloadConfig   `json:"payload"`
+	Disks    []DiskConfig    `json:"disks,omitempty"`
+	Fs       []FsConfig      `json:"fs,omitempty"`
+	Rng      *RngConfig      `json:"rng,omitempty"`
+	Serial   *ConsoleConfig  `json:"serial,omitempty"`
+	Console  *ConsoleConfig  `json:"console,omitempty"`
+	Vsock    *VsockConfig    `json:"vsock,omitempty"`
+	Platform *PlatformConfig `json:"platform,omitempty"`
+}
+
+// FsConfig is a virtio-fs device backed by a vhost-user (virtiofsd) socket. The overlay
+// rootfs path uses it as the RO lower; the guest mounts it by Tag (kata's "kataShared").
+type FsConfig struct {
+	Tag        string `json:"tag"`                   // virtio-fs tag the guest mounts by
+	Socket     string `json:"socket"`                // vhost-user (virtiofsd) socket path
+	NumQueues  int32  `json:"num_queues,omitempty"`  // omitted from the JSON when 0
+	QueueSize  int32  `json:"queue_size,omitempty"`  // omitted from the JSON when 0
+	PciSegment int32  `json:"pci_segment,omitempty"` // non-zero needs PlatformConfig.NumPciSegments > 1
+}
+
+// PlatformConfig sets VM-wide platform options. NumPciSegments must be >1 when a
+// virtio-fs device sits on a non-zero PCI segment (kata puts fs on segment 1).
+type PlatformConfig struct {
+	NumPciSegments int32 `json:"num_pci_segments,omitempty"` // 0 is omitted from the JSON
 }
 
 // CpusConfig sets the boot/max vCPU counts.
@@ -56,10 +72,9 @@ type PayloadConfig struct {
 	Cmdline string `json:"cmdline"`
 }
 
-// DiskConfig is one virtio-blk disk. The kata guest image is disk 0 (/dev/vda,
-// readonly); ateom appends the actor rootfs as disk 1 (/dev/vdb, writable). The
-// guest sees disks in config order. NumQueues/QueueSize mirror kata's clh
-// (num_queues = vcpus, queue_size = 1024).
+// DiskConfig is one virtio-blk disk. The only disk is the kata guest image
+// (/dev/vda, read-only); the actor rootfs is an overlay served over virtio-fs, not a
+// disk. NumQueues/QueueSize mirror kata's clh (num_queues = vcpus, queue_size = 1024).
 type DiskConfig struct {
 	Path      string `json:"path"`
 	Readonly  bool   `json:"readonly"`

From 3f9e8cb301255b762a5e33e86fc12ea7b86be909 Mon Sep 17 00:00:00 2001
From: Benjamin Elder <bentheelder@google.com>
Date: Thu, 25 Jun 2026 17:57:20 -0700
Subject: [PATCH 2/5] ateom-microvm/kata: overlay rootfs helpers (virtio-fs RO
 lower + tmpfs upper)

Helpers to assemble a container's rootfs as an overlay: its OCI image served
read-only over virtio-fs (the lower) plus a guest tmpfs (the writable upper).

  - StartVirtiofsd: run virtiofsd in find-paths migration mode (so the fs device
    survives CH snapshot/restore), serving the per-sandbox shared dir.
  - ReconstructSharedDirFromImage: bind-mount a container's image into <cid>/rootfs
    under the shared dir (no host-side copy; virtiofsd serves it to the guest on
    demand), ensure the standard OCI mountpoints exist, and remount it read-only so the
    lower is immutable and byte-identical on every node (find-paths re-opens its inodes
    by path on restore).
  - CreateSandboxForActor: create the sandbox with the kataShared virtio-fs mount.
  - CreateCarrier: a created-but-unstarted container that binds the base to a stable
    per-container path the overlay uses as its lowerdir.
  - StartOverlayWorkload: create + start the container with an overlayfs rootfs whose
    upper/work live on a guest tmpfs.
---
 .../internal/kata/agentclient.go              |  20 +-
 cmd/ateom-microvm/internal/kata/kata.go       |  15 +-
 .../internal/kata/overlay_linux.go            | 247 ++++++++++++++++++
 3 files changed, 264 insertions(+), 18 deletions(-)
 create mode 100644 cmd/ateom-microvm/internal/kata/overlay_linux.go

diff --git a/cmd/ateom-microvm/internal/kata/agentclient.go b/cmd/ateom-microvm/internal/kata/agentclient.go
index 42537f8f9..f7a242694 100644
--- a/cmd/ateom-microvm/internal/kata/agentclient.go
+++ b/cmd/ateom-microvm/internal/kata/agentclient.go
@@ -158,9 +158,9 @@ func (a *AgentClient) StartContainer(ctx context.Context, containerID string) er
 
 // CreateSandbox establishes the agent's sandbox context (sandbox id, hostname,
 // sandbox pidns) before any container is created. The kata shim normally issues
-// this once at VM boot; on the ateom-owned-boot path (no shim) ateom must call it
-// itself so the agent has a sandbox to attach containers to. Storages is empty —
-// the actor rootfs arrives as a per-container "blk" storage, not a sandbox mount.
+// this once at VM boot; ateom (no shim) must call it itself so the agent has a
+// sandbox to attach containers to. Storages carries the shared virtio-fs mount
+// (the overlay lowers); each container's rootfs is assembled per-container.
 // Mirrors grpc.AgentService/CreateSandbox (returns google.protobuf.Empty).
 func (a *AgentClient) CreateSandbox(ctx context.Context, req *agentpb.CreateSandboxRequest) error {
 	if err := a.client.Call(ctx, "grpc.AgentService", "CreateSandbox", req, &emptypb.Empty{}); err != nil {
@@ -169,10 +169,10 @@ func (a *AgentClient) CreateSandbox(ctx context.Context, req *agentpb.CreateSand
 	return nil
 }
 
-// UpdateInterface configures a guest network interface (the kata shim's job; on
-// the owned-boot path ateom does it). The agent matches the link by HwAddr, then
-// applies the name/IP/MTU. Mirrors grpc.AgentService/UpdateInterface (returns the
-// resulting Interface).
+// UpdateInterface configures a guest network interface (the kata shim's job, which
+// ateom does itself). The agent matches the link by HwAddr, then applies the
+// name/IP/MTU. Mirrors grpc.AgentService/UpdateInterface (returns the resulting
+// Interface).
 func (a *AgentClient) UpdateInterface(ctx context.Context, iface *agentpb.Interface) error {
 	req := &agentpb.UpdateInterfaceRequest{Interface: iface}
 	if err := a.client.Call(ctx, "grpc.AgentService", "UpdateInterface", req, &agentpb.Interface{}); err != nil {
@@ -208,7 +208,7 @@ func (a *AgentClient) AddARPNeighbors(ctx context.Context, neighbors []*agentpb.
 // buffered (up to max), so callers loop until it returns an error — the agent
 // returns an error/EOF-like status once the stream ends (container exit / connection
 // close). Mirrors grpc.AgentService/ReadStdout. The kata-agent keys the stream by
-// ExecId, which the owned-boot path sets equal to ContainerId (see StartBlkWorkload).
+// ExecId, which ateom sets equal to ContainerId.
 func (a *AgentClient) ReadStdout(ctx context.Context, containerID, execID string, max uint32) ([]byte, error) {
 	resp := &agentpb.ReadStreamResponse{}
 	req := &agentpb.ReadStreamRequest{ContainerId: containerID, ExecId: execID, Len: max}
@@ -246,8 +246,8 @@ type StreamReader struct {
 }
 
 // NewStdioReader returns an io.Reader over the container's stdout (stderr=false)
-// or stderr (stderr=true). execID matches the value passed to StartBlkWorkload
-// (equal to containerID on the owned-boot path).
+// or stderr (stderr=true). execID equals containerID (ateom sets ExecId ==
+// ContainerId when it creates the container).
 func NewStdioReader(ctx context.Context, ac *AgentClient, containerID, execID string, stderr bool) *StreamReader {
 	return &StreamReader{ctx: ctx, ac: ac, containerID: containerID, execID: execID, stderr: stderr}
 }
diff --git a/cmd/ateom-microvm/internal/kata/kata.go b/cmd/ateom-microvm/internal/kata/kata.go
index f6008db23..0b09241ec 100644
--- a/cmd/ateom-microvm/internal/kata/kata.go
+++ b/cmd/ateom-microvm/internal/kata/kata.go
@@ -13,15 +13,14 @@
 // limitations under the License.
 
 // Package kata holds the helpers ateom uses to boot and drive a kata guest in a
-// cloud-hypervisor micro-VM WITHOUT the kata shim: ateom boots cloud-hypervisor
-// itself (see internal/ch), then drives the stock kata-agent over its
-// hybrid-vsock ttrpc API (DialAgent / AgentClient) to create the sandbox and
-// start the actor's container on a writable virtio-blk rootfs (StartBlkWorkload).
+// cloud-hypervisor micro-VM without the kata shim: ateom boots cloud-hypervisor
+// itself (see internal/ch), then drives the stock kata-agent over its hybrid-vsock
+// ttrpc API (DialAgent / AgentClient) to create the sandbox and assemble each
+// container's overlay rootfs (overlay_linux.go).
 //
-// It also renders the kata configuration.toml (for the agent kernel_params +
-// guest sizing) from runtime-fetched assets (config.go), builds the actor's ext4
-// rootfs disk (BuildExt4Image), and sweeps leftover per-sandbox host-side state
-// (CleanupSandboxState).
+// It also renders the kata configuration.toml (for the agent kernel_params + guest
+// sizing) from runtime-fetched assets (config.go) and sweeps leftover per-sandbox
+// host-side state (CleanupSandboxState).
 package kata
 
 import (
diff --git a/cmd/ateom-microvm/internal/kata/overlay_linux.go b/cmd/ateom-microvm/internal/kata/overlay_linux.go
new file mode 100644
index 000000000..62007e3d4
--- /dev/null
+++ b/cmd/ateom-microvm/internal/kata/overlay_linux.go
@@ -0,0 +1,247 @@
+//go:build linux
+
+// Copyright 2026 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kata
+
+// Each container's rootfs is an overlay: its OCI image served read-only over virtio-fs
+// (the lower) plus a guest tmpfs (the writable upper). The upper is in guest RAM, so
+// rootfs writes ride along in the memory snapshot and persist across suspend/resume.
+// This file holds the overlay-specific helpers.
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"github.com/agent-substrate/substrate/cmd/ateom-microvm/internal/third_party/kata/agentpb"
+	specs "github.com/opencontainers/runtime-spec/specs-go"
+)
+
+const (
+	// FsTag is the virtio-fs tag kata uses for the shared filesystem. The CH fs
+	// device Tag and the agent mount Source must both be this value.
+	FsTag = "kataShared"
+	// typeVirtioFS / virtioFSDriver are the agent Storage Fstype + Driver for FsTag.
+	typeVirtioFS   = "virtiofs"
+	virtioFSDriver = "virtio-fs"
+	// guestSharedDir is where the agent mounts the kataShared tag in the guest; a
+	// container's rootfs is at <guestSharedDir><cid>/rootfs (the trailing "/" is relied on).
+	guestSharedDir = "/run/kata-containers/shared/containers/"
+)
+
+// SharedDir returns the host directory virtiofsd serves into the guest as the RO base
+// for sandbox id. Its layout (<cid>/rootfs) is what find-paths re-opens by path on restore.
+func SharedDir(id string) string {
+	return filepath.Join("/run/kata-containers/shared/sandboxes", id, "shared")
+}
+
+// VirtiofsdSocketPath returns the vhost-user-fs socket (under VMDir(id)) CH connects to for the fs device.
+func VirtiofsdSocketPath(id string) string { return filepath.Join(VMDir(id), "virtiofsd.sock") }
+
+// OverlayUpperBase returns the in-guest directory for one container's overlay upper/work
+// (presumably the upperBase passed to StartOverlayWorkload — confirm at the call site).
+// It lives under /run (tmpfs), so the upper's writes stay in guest RAM and ride along in
+// the memory-only snapshot. Keyed on the container id, stable across the restore lineage.
+func OverlayUpperBase(containerID string) string { return "/run/ateom-upper/" + containerID }
+
+// GuestSharedRootfs returns the in-guest path (<guestSharedDir><cid>/rootfs) at which the
+// kataShared mount exposes a container's rootfs. A carrier container with this as
+// Root.Path makes the agent bind it to /run/kata-containers/<cid>/rootfs — a stable
+// per-container path the overlay then uses as its lowerdir.
+func GuestSharedRootfs(containerID string) string { return guestSharedDir + containerID + "/rootfs" }
+
+// VirtiofsdOptions configures StartVirtiofsd.
+type VirtiofsdOptions struct {
+	Binary     string    // virtiofsd executable; defaults to "virtiofsd"
+	SocketPath string    // vhost-user socket CH connects to (VirtiofsdSocketPath)
+	SharedDir  string    // directory to serve (SharedDir(id))
+	Log        io.Writer // receives virtiofsd's stdout and stderr; nil discards the output
+}
+
+// StartVirtiofsd launches virtiofsd (find-paths migration mode) serving o.SharedDir on
+// o.SocketPath and waits up to 10s for the socket. The returned cmd outlives ctx (CH
+// demand-pages from it under the running VM); the caller owns it. On failure it is reaped.
+func StartVirtiofsd(ctx context.Context, o VirtiofsdOptions) (*exec.Cmd, error) {
+	bin := o.Binary
+	if bin == "" {
+		bin = "virtiofsd"
+	}
+	_ = os.Remove(o.SocketPath) // a stale socket would satisfy the wait below
+	cmd := exec.Command(bin,
+		"--socket-path="+o.SocketPath,
+		"--shared-dir="+o.SharedDir,
+		"--cache=auto",
+		"--thread-pool-size=1",
+		"--announce-submounts",
+		"--migration-mode", "find-paths",
+	)
+	cmd.Stdout, cmd.Stderr = o.Log, o.Log
+	if err := cmd.Start(); err != nil {
+		return nil, fmt.Errorf("starting virtiofsd: %w", err)
+	}
+	reap := func() { _ = cmd.Process.Kill(); _, _ = cmd.Process.Wait() } // Kill alone leaves a zombie
+	deadline := time.Now().Add(10 * time.Second)
+	for time.Now().Before(deadline) {
+		if _, err := os.Stat(o.SocketPath); err == nil {
+			return cmd, nil
+		}
+		select {
+		case <-ctx.Done():
+			reap()
+			return nil, ctx.Err()
+		case <-time.After(50 * time.Millisecond):
+		}
+	}
+	reap()
+	return nil, fmt.Errorf("virtiofsd socket %q did not appear", o.SocketPath)
+}
+
+// ReconstructSharedDirFromImage exposes a container's OCI image rootfs to the guest as
+// the read-only overlay lower: it bind-mounts bundleRootfs at <cid>/rootfs under
+// SharedDir(restoreID). Nothing is copied on the host (virtiofsd serves files on demand).
+// find-paths migration re-opens the lower by path, so the path is identical on every
+// node, given a deterministic image unpack. cid is stable across the actor's lineage.
+func ReconstructSharedDirFromImage(ctx context.Context, bundleRootfs, restoreID, cid string) error {
+	if cid == "" {
+		return fmt.Errorf("ReconstructSharedDirFromImage: empty container id")
+	}
+	mountpoint := filepath.Join(SharedDir(restoreID), cid, "rootfs")
+	// A previous bind may still be mounted here: unmount it (lazily when busy) rather
+	// than RemoveAll, which would descend through the live bind into bundleRootfs.
+	if exec.Command("umount", mountpoint).Run() != nil {
+		_ = exec.Command("umount", "-l", mountpoint).Run()
+	}
+	if err := os.MkdirAll(mountpoint, 0o755); err != nil {
+		return fmt.Errorf("creating shared dir %q: %w", mountpoint, err)
+	}
+	var bindErr strings.Builder
+	bind := exec.CommandContext(ctx, "mount", "--bind", bundleRootfs, mountpoint)
+	bind.Stderr = &bindErr
+	if err := bind.Run(); err != nil {
+		return fmt.Errorf("bind-mounting image rootfs %q -> %q: %w (%s)", bundleRootfs, mountpoint, err, strings.TrimSpace(bindErr.String()))
+	}
+	// The container mounts /proc, /sys and /dev over these, so minimal images need them
+	// too, and the layout must match on every node for find-paths. The bind is still
+	// writable at this point; creation is best-effort (errors are ignored).
+	for _, name := range [...]string{"proc", "sys", "dev"} {
+		_ = os.MkdirAll(filepath.Join(mountpoint, name), 0o755)
+	}
+	// Seal the lower: once read-only, every write lands in the tmpfs upper and the lower
+	// stays byte-identical across reconstructions (find-paths migration requires that).
+	var sealErr strings.Builder
+	seal := exec.CommandContext(ctx, "mount", "-o", "remount,bind,ro", mountpoint)
+	seal.Stderr = &sealErr
+	if err := seal.Run(); err != nil {
+		return fmt.Errorf("remounting overlay lower read-only %q: %w (%s)", mountpoint, err, strings.TrimSpace(sealErr.String()))
+	}
+	return nil
+}
+
+// CreateSandboxForActor issues CreateSandbox with a single storage: the kataShared
+// virtio-fs tag mounted at guestSharedDir (the RO base backing every container's
+// rootfs). Mirrors kata startSandbox.
+func (a *AgentClient) CreateSandboxForActor(ctx context.Context, sandboxID, hostname string) error {
+	shared := &agentpb.Storage{
+		Driver:     virtioFSDriver,
+		Source:     FsTag,
+		Fstype:     typeVirtioFS,
+		MountPoint: guestSharedDir,
+	}
+	req := &agentpb.CreateSandboxRequest{Hostname: hostname, SandboxId: sandboxID}
+	req.Storages = []*agentpb.Storage{shared}
+	return a.CreateSandbox(ctx, req)
+}
+
+// CreateCarrier creates — but does NOT start — a "carrier" container whose id is cid and
+// whose rootfs is the kataShared virtio-fs base for that container. Creating it makes
+// the agent's setup_bundle bind the base to /run/kata-containers/<cid>/rootfs, the
+// stable path the overlay uses as its lowerdir (a bare virtio-fs submount is not
+// reliably visible there).
+func (a *AgentClient) CreateCarrier(ctx context.Context, cid string, spec *specs.Spec) error {
+	oci := SpecToAgentPB(spec)
+	// The carrier exists only to materialize the base bind, so its rootfs (the overlay
+	// lower) is read-only; overlay writes go to the tmpfs upper instead.
+	oci.Root = &agentpb.Root{Path: GuestSharedRootfs(cid), Readonly: true}
+	// Per-carrier cgroup path; left untouched when the spec has no Linux section.
+	if linux := oci.Linux; linux != nil {
+		linux.CgroupsPath = "/ateomchv/" + cid + "-carrier"
+	}
+	req := &agentpb.CreateContainerRequest{ContainerId: cid, ExecId: cid, OCI: oci}
+	err := a.CreateContainer(ctx, req)
+	if err != nil {
+		return fmt.Errorf("creating carrier container %q: %w", cid, err)
+	}
+	return nil
+}
+
+// StartOverlayWorkload creates and then starts container workloadID on an overlayfs
+// rootfs. The lower is the carrier's resolved bind (/run/kata-containers/<cid>/rootfs,
+// from the RO virtio-fs base); upper and work are <upperBase>/{fs,work} on a guest tmpfs,
+// so rootfs writes land in guest RAM and are captured by the memory-only snapshot. The
+// agent creates the upper/work dirs (create_directory) before mounting the overlay.
+func (a *AgentClient) StartOverlayWorkload(ctx context.Context, cid, workloadID, upperBase string, spec *specs.Spec) error {
+	const createDir = "io.katacontainers.volume.overlayfs.create_directory"
+	var (
+		carrierRootfs = "/run/kata-containers/" + cid + "/rootfs"
+		workloadDir   = "/run/kata-containers/" + workloadID
+		lowerDir      = workloadDir + "/lower"
+		mergedRoot    = workloadDir + "/rootfs"
+		upperDir      = upperBase + "/fs"
+		workDir       = upperBase + "/work"
+	)
+	// First storage: bind the carrier's resolved rootfs at this workload's lowerdir.
+	bindLower := &agentpb.Storage{
+		Driver:     virtioFSDriver,
+		Source:     carrierRootfs,
+		MountPoint: lowerDir,
+		Fstype:     "bind",
+		Options:    []string{"bind"},
+	}
+	// Second storage: the overlay itself, mounted where the container's Root.Path points.
+	overlay := &agentpb.Storage{
+		Driver:        "overlayfs",
+		Source:        "overlay",
+		Fstype:        "overlay",
+		MountPoint:    mergedRoot,
+		DriverOptions: []string{createDir + "=" + upperDir, createDir + "=" + workDir},
+		Options:       []string{"lowerdir=" + lowerDir, "upperdir=" + upperDir, "workdir=" + workDir},
+	}
+	oci := SpecToAgentPB(spec)
+	oci.Root = &agentpb.Root{Path: mergedRoot, Readonly: false}
+	// The shaped spec carries the actor-wide /ateomchv/<actorID> (spec.go), which collides
+	// across an actor's containers; give each workload its own per-id cgroup instead.
+	if oci.Linux != nil {
+		oci.Linux.CgroupsPath = "/ateomchv/" + workloadID
+	}
+	req := &agentpb.CreateContainerRequest{
+		ContainerId: workloadID,
+		ExecId:      workloadID,
+		Storages:    []*agentpb.Storage{bindLower, overlay},
+		OCI:         oci,
+	}
+	if err := a.CreateContainer(ctx, req); err != nil {
+		return fmt.Errorf("creating overlay workload %q: %w", workloadID, err)
+	}
+	if err := a.StartContainer(ctx, workloadID); err != nil {
+		return fmt.Errorf("starting overlay workload %q: %w", workloadID, err)
+	}
+	return nil
+}

From 1546c8ecf6ddca665c0b2ea475e163a69694c3e0 Mon Sep 17 00:00:00 2001
From: Benjamin Elder <bentheelder@google.com>
Date: Thu, 25 Jun 2026 17:57:20 -0700
Subject: [PATCH 3/5] ateom-microvm: run multi-container actors on an overlay
 rootfs

Run all of an actor's containers in the one micro-VM (the pod sandbox), each with its
own overlay rootfs (virtio-fs RO lower + guest-tmpfs upper) rather than a per-container
disk. Because the writable upper is a guest tmpfs, rootfs writes are part of the CH
memory snapshot and persist across suspend/resume alongside process memory.

  - RunWorkload: bind each container's image into the shared dir and start one
    virtiofsd; create the sandbox, then a carrier + overlay workload per container.
  - CheckpointWorkload: pause + snapshot memory; the tmpfs upper rides along, so there
    is no per-container disk to ship.
  - RestoreWorkload: reconstruct each read-only lower from the local OCI bundle, start
    virtiofsd, repoint the snapshot config's per-VMDir paths (vsock, serial, fs socket),
    and OnDemand-restore + resume.

This replaces the per-container disk path: remove the disk builder (BuildExt4Image),
the blk workload (StartBlkWorkload), and the now-obsolete blk integration test.
---
 cmd/ateom-microvm/checkpoint.go               |  51 +-
 cmd/ateom-microvm/internal/kata/disk.go       | 140 ------
 cmd/ateom-microvm/internal/kata/disk_test.go  |  76 ---
 cmd/ateom-microvm/internal/kata/specconv.go   |  56 +--
 cmd/ateom-microvm/restore.go                  | 168 +++----
 cmd/ateom-microvm/run.go                      | 347 ++++++++-----
 cmd/ateom-microvm/service_integration_test.go | 473 ------------------
 7 files changed, 307 insertions(+), 1004 deletions(-)
 delete mode 100644 cmd/ateom-microvm/internal/kata/disk.go
 delete mode 100644 cmd/ateom-microvm/internal/kata/disk_test.go
 delete mode 100644 cmd/ateom-microvm/service_integration_test.go

diff --git a/cmd/ateom-microvm/checkpoint.go b/cmd/ateom-microvm/checkpoint.go
index 7461b73d3..511cd0332 100644
--- a/cmd/ateom-microvm/checkpoint.go
+++ b/cmd/ateom-microvm/checkpoint.go
@@ -32,15 +32,15 @@ import (
 
 // CheckpointWorkload suspends the actor and writes a portable CH snapshot.
 //
-// Contract with atelet (mirrors ateom-gvisor): after we return, atelet uploads
-// the checkpoint dir to object storage, then tears down bundles and resets the
-// actor dir.
+// Contract with atelet: after we return, atelet uploads the checkpoint dir to object
+// storage, then tears down bundles and resets the actor dir.
 //
-// ateom drives the ateom-owned CH's REST api-socket: pause -> snapshot
-// file://<CheckpointStateDir> (config.json + state.json + sparse memory-ranges) ->
-// tear the VMM down. The actor's rootfs lives on the host-backed /dev/vdb, not a
-// guest tmpfs overlay-upper, so the snapshot is naturally memory-only and small —
-// no RAM-backed upper to wipe and no balloon to inflate before snapshot.
+// ateom drives the CH REST api-socket: pause -> snapshot file://<CheckpointStateDir>
+// (config.json + state.json + sparse memory-ranges) -> tear the VMM down. Each
+// container's rootfs is overlay(virtio-fs RO lower + guest-tmpfs upper), so the
+// writable upper lives in guest RAM and is captured by the memory snapshot — process
+// memory and rootfs writes both persist across suspend/resume. The RO lower is
+// reconstructed from the OCI image at restore, so nothing rootfs-related ships here.
 func (s *AteomService) CheckpointWorkload(ctx context.Context, req *ateompb.CheckpointWorkloadRequest) (*ateompb.CheckpointWorkloadResponse, error) {
 	s.lock.Lock()
 	defer s.lock.Unlock()
@@ -79,9 +79,9 @@ func (s *AteomService) CheckpointWorkload(ctx context.Context, req *ateompb.Chec
 	}
 
 	// Record the FROZEN base id (the id the guest's virtio-fs find-paths are pinned
-	// to, <baseID>/rootfs). For a cold (owned-boot) actor this is its own id; for a
-	// restored actor it is the golden id propagated via ra.baseID (set from the
-	// snapshot we restored from). RestoreWorkload reads this to lay the
+	// to, <baseID>/rootfs). For a cold-run actor this is its own id; for a restored
+	// actor it is the golden id propagated via ra.baseID (set from the snapshot we
+	// restored from). RestoreWorkload reads this to lay the
 	// reconstructed-from-image base at the path the guest expects. We can NOT derive
 	// it from config.json (its socket paths get rewritten to the current id on every
 	// restore, losing the invariant golden id).
@@ -120,25 +120,13 @@ func (s *AteomService) CheckpointWorkload(ctx context.Context, req *ateompb.Chec
 			slog.String("id", id), slog.Duration("merge", time.Since(tMerge)))
 	}
 
-	// reset-to-golden support: save the actor's /dev/vdb AS-OF this (paused,
-	// consistent) snapshot as a verbatim golden template, so future restores can
-	// recreate the disk byte-identical to what the snapshot's guest RAM expects
-	// while discarding the actor's later rootfs writes. Saved once (the first/golden
-	// checkpoint) and kept; best-effort (without it, restore reopens the live disk =
-	// continuity). TODO: ship the template with the snapshot for cross-node restore
-	// (it's golden, shipped once per template, like the OCI base).
-	actorDir := ateompath.ActorPath(ns, name, id)
-	if tmpl := filepath.Join(actorDir, goldenRootfsDiskName); fileMissing(tmpl) {
-		if cerr := copyDiskFile(ctx, filepath.Join(actorDir, actorRootfsDiskName), tmpl); cerr != nil {
-			slog.WarnContext(ctx, "Failed to save golden rootfs template; restore will reopen live disk", slog.Any("err", cerr))
-		} else {
-			slog.InfoContext(ctx, "Saved golden rootfs disk template", slog.String("id", id))
-		}
-	}
+	// Nothing rootfs-related ships: the overlay's writable upper is a guest tmpfs, so
+	// the actor's rootfs writes are already in the memory snapshot above, and the RO
+	// lower is reconstructed from the OCI image at restore (it never changes).
 
 	// Report exactly the files we wrote so atelet ships precisely the CH snapshot
-	// (config.json + state.json + memory-ranges + base-id), not gVisor's fixed set.
-	// Memory-only: the RO base is reconstructed from the OCI image at restore.
+	// (config.json + state.json + memory-ranges + base-id). The RO base is
+	// reconstructed from the OCI image at restore.
 	snapshotFiles, err := listFiles(checkpointDir)
 	if err != nil {
 		return nil, fmt.Errorf("while listing snapshot files: %w", err)
@@ -151,7 +139,7 @@ func (s *AteomService) CheckpointWorkload(ctx context.Context, req *ateompb.Chec
 	dTeardown := time.Since(tTeardown)
 	delete(s.running, id)
 
-	// Tear down the per-activation actor network (mirrors gVisor).
+	// Tear down the per-activation actor network.
 	if err := s.cleanupActorNetwork(ctx); err != nil {
 		slog.WarnContext(ctx, "Failed to clean up actor network after checkpoint", slog.Any("err", err))
 	}
@@ -207,6 +195,11 @@ func (s *AteomService) teardownActor(ctx context.Context, id string, ra *running
 			_ = ra.chCmd.Process.Kill()
 			_, _ = ra.chCmd.Process.Wait()
 		}
+		// Kill the virtiofsd serving the overlay RO lower (after CH, its only client).
+		if ra.vfsdCmd != nil && ra.vfsdCmd.Process != nil {
+			_ = ra.vfsdCmd.Process.Kill()
+			_, _ = ra.vfsdCmd.Process.Wait()
+		}
 	}
 
 	// Sweep any leftover per-sandbox host-side state + orphaned per-sandbox
diff --git a/cmd/ateom-microvm/internal/kata/disk.go b/cmd/ateom-microvm/internal/kata/disk.go
deleted file mode 100644
index ae03353f7..000000000
--- a/cmd/ateom-microvm/internal/kata/disk.go
+++ /dev/null
@@ -1,140 +0,0 @@
-//go:build linux
-
-// Copyright 2026 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package kata
-
-import (
-	"context"
-	"fmt"
-	"io/fs"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"strconv"
-)
-
-// rootfsDiskScratchBytes is the free-space headroom added on top of a bundle's
-// contents when sizing its writable rootfs disk: room for the actor to write
-// during a single activation. It stays sparse (unused space is holes), so it
-// costs nothing in the image file or the memory-only snapshot.
-const rootfsDiskScratchBytes = 512 << 20
-
-// rootfsDiskGeometry walks srcDir and returns the ext4 image size (MiB) and the
-// inode count to build a writable rootfs disk holding that tree plus headroom for
-// ext4 metadata and the actor's in-activation scratch writes. Both are
-// DETERMINISTIC functions of the tree's apparent contents (summed regular-file
-// sizes and entry count, NOT host block allocation), so the cold-boot build and
-// the restore-time rebuild from the same OCI image produce an identically-sized
-// disk — required because the guest resumes with the ext4 superblock cached in RAM.
-func rootfsDiskGeometry(srcDir string) (sizeMiB int, inodes int64, err error) {
-	var contentBytes, entries int64
-	if werr := filepath.WalkDir(srcDir, func(_ string, d fs.DirEntry, err error) error {
-		if err != nil {
-			return err
-		}
-		entries++ // every entry (file, dir, symlink, device) needs an inode
-		if d.Type().IsRegular() {
-			info, ierr := d.Info()
-			if ierr != nil {
-				return ierr
-			}
-			contentBytes += info.Size()
-		}
-		return nil
-	}); werr != nil {
-		return 0, 0, werr
-	}
-
-	const (
-		mib            = 1 << 20
-		inodeSizeBytes = 256 // ext4 default; over-estimates the table if it's 128
-	)
-	// One inode per entry plus 25% and a fixed reserve, so the actor can create new
-	// files during its activation without exhausting inodes (the default
-	// size-derived ratio can starve a file-heavy rootfs).
-	inodes = entries + entries/4 + 8192
-	// Contents + the eagerly-written inode table + ~6% for bitmaps/directory/extent
-	// metadata + the scratch reserve. Unused space stays sparse (holes).
-	sizeBytes := contentBytes + inodes*inodeSizeBytes + contentBytes/16 + rootfsDiskScratchBytes
-	sizeMiB = int((sizeBytes + mib - 1) / mib)
-	return sizeMiB, inodes, nil
-}
-
-// BuildExt4Image creates a raw ext4 disk image at outPath, sized dynamically from
-// srcDir (see rootfsDiskGeometry), pre-populated with srcDir's contents in a single
-// mkfs pass (`mkfs.ext4 -d <srcDir> ...`). This is how the ateom-owned-boot path
-// turns the actor's OCI bundle rootfs into a writable virtio-blk disk (/dev/vdb):
-// the guest mounts it as the container rootfs, so rootfs writes land on this
-// host-backed file (off guest RAM) -> memory-only CH snapshot, no balloon.
-//
-// The size is a deterministic function of srcDir's contents, so the cold-boot
-// build and the restore-time rebuild from the same OCI image agree (the guest
-// resumes with the ext4 superblock cached in RAM, which must match the disk).
-//
-// Requires mkfs.ext4 (e2fsprogs) on PATH in the worker image. The image is
-// recreated from scratch each call (reset-to-golden recreates it from the golden
-// bundle), so any prior file at outPath is truncated.
-//
-// mkfs.ext4 -d copies srcDir's tree (perms, ownership, symlinks, xattrs) into the
-// new filesystem without needing a loop mount or root's mount privileges — it
-// writes the filesystem structures directly to the image file.
-func BuildExt4Image(ctx context.Context, srcDir, outPath string) error {
-	if fi, err := os.Stat(srcDir); err != nil || !fi.IsDir() {
-		return fmt.Errorf("BuildExt4Image: source %q is not a directory: %v", srcDir, err)
-	}
-	sizeMiB, inodes, err := rootfsDiskGeometry(srcDir)
-	if err != nil {
-		return fmt.Errorf("BuildExt4Image: sizing from %q: %w", srcDir, err)
-	}
-
-	// Truncate to size first so mkfs writes into a sparse file of the right size
-	// (mkfs.ext4 also accepts a size argument, but a pre-sized file is unambiguous
-	// and keeps the on-disk size predictable for the snapshot config).
-	if err := os.Remove(outPath); err != nil && !os.IsNotExist(err) {
-		return fmt.Errorf("BuildExt4Image: removing stale image %q: %w", outPath, err)
-	}
-	f, err := os.OpenFile(outPath, os.O_CREATE|os.O_RDWR, 0o600)
-	if err != nil {
-		return fmt.Errorf("BuildExt4Image: creating image %q: %w", outPath, err)
-	}
-	if err := f.Truncate(int64(sizeMiB) * 1024 * 1024); err != nil {
-		f.Close()
-		return fmt.Errorf("BuildExt4Image: sizing image %q: %w", outPath, err)
-	}
-	f.Close()
-
-	// -F: don't prompt (operating on a regular file, not a block device).
-	// -q: quiet. -d: populate from srcDir. -N: fix the inode count to the tree's
-	// entries + slack (the default size-derived ratio can starve a file-heavy
-	// rootfs of inodes). -E lazy_*=0: write tables eagerly so the image is fully
-	// materialized (deterministic on-disk bytes, important for the reset-to-golden
-	// "verbatim copy" approach). -O ^has_journal: a reset-each-restore rootfs gains
-	// nothing from a journal and it adds nondeterminism.
-	args := []string{
-		"-F", "-q",
-		"-N", strconv.FormatInt(inodes, 10),
-		"-E", "lazy_itable_init=0,lazy_journal_init=0",
-		"-O", "^has_journal",
-		"-d", srcDir,
-		outPath,
-		strconv.Itoa(sizeMiB) + "M",
-	}
-	cmd := exec.CommandContext(ctx, "mkfs.ext4", args...)
-	if out, err := cmd.CombinedOutput(); err != nil {
-		return fmt.Errorf("BuildExt4Image: mkfs.ext4 %v: %w: %s", args, err, out)
-	}
-	return nil
-}
diff --git a/cmd/ateom-microvm/internal/kata/disk_test.go b/cmd/ateom-microvm/internal/kata/disk_test.go
deleted file mode 100644
index f06fb483b..000000000
--- a/cmd/ateom-microvm/internal/kata/disk_test.go
+++ /dev/null
@@ -1,76 +0,0 @@
-//go:build linux
-
-// Copyright 2026 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package kata
-
-import (
-	"os"
-	"path/filepath"
-	"testing"
-)
-
-func TestRootfsDiskGeometry(t *testing.T) {
-	dir := t.TempDir()
-	if err := os.Mkdir(filepath.Join(dir, "sub"), 0o755); err != nil {
-		t.Fatal(err)
-	}
-	if err := os.WriteFile(filepath.Join(dir, "a"), make([]byte, 1<<20), 0o644); err != nil {
-		t.Fatal(err)
-	}
-	if err := os.WriteFile(filepath.Join(dir, "sub", "b"), make([]byte, 2<<20), 0o644); err != nil {
-		t.Fatal(err)
-	}
-	if err := os.Symlink("a", filepath.Join(dir, "link")); err != nil {
-		t.Fatal(err)
-	}
-	// Entries the walk should count: the root dir, sub, a, sub/b, link = 5.
-	const wantEntries = 5
-
-	size1, inodes1, err := rootfsDiskGeometry(dir)
-	if err != nil {
-		t.Fatalf("rootfsDiskGeometry: %v", err)
-	}
-
-	// Determinism is required: the cold-boot build and the restore-time rebuild
-	// must produce an identically-sized disk for the same tree.
-	size2, inodes2, err := rootfsDiskGeometry(dir)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if size1 != size2 || inodes1 != inodes2 {
-		t.Errorf("non-deterministic geometry: (%d MiB, %d inodes) vs (%d MiB, %d inodes)", size1, inodes1, size2, inodes2)
-	}
-
-	// Size must cover the ~3 MiB of contents plus the scratch reserve.
-	if floorMiB := rootfsDiskScratchBytes/(1<<20) + 3; size1 < floorMiB {
-		t.Errorf("size %d MiB below expected floor %d MiB", size1, floorMiB)
-	}
-
-	// Inodes must cover every entry plus the reserve (so a file-heavy rootfs can
-	// still create files), and never fewer than the entries present.
-	if inodes1 < wantEntries {
-		t.Errorf("inodes %d < %d entries", inodes1, wantEntries)
-	}
-	if inodes1 < 8192 {
-		t.Errorf("inodes %d missing the fixed reserve", inodes1)
-	}
-}
-
-func TestRootfsDiskGeometryMissingDir(t *testing.T) {
-	if _, _, err := rootfsDiskGeometry(filepath.Join(t.TempDir(), "does-not-exist")); err == nil {
-		t.Fatal("rootfsDiskGeometry on a missing dir: want error, got nil")
-	}
-}
diff --git a/cmd/ateom-microvm/internal/kata/specconv.go b/cmd/ateom-microvm/internal/kata/specconv.go
index 4ac75f992..cfcceb00e 100644
--- a/cmd/ateom-microvm/internal/kata/specconv.go
+++ b/cmd/ateom-microvm/internal/kata/specconv.go
@@ -17,63 +17,19 @@
 package kata
 
 import (
-	"context"
-	"fmt"
-
 	"github.com/agent-substrate/substrate/cmd/ateom-microvm/internal/third_party/kata/agentpb"
 	specs "github.com/opencontainers/runtime-spec/specs-go"
 )
 
-// StartBlkWorkload starts the actor container with its rootfs backed by a single
-// boot-time virtio-blk disk (devPath, e.g. "/dev/vdb") — the virtio-blk-rootfs
-// path. There is NO overlay, NO virtio-fs, NO tmpfs upper: the agent direct-mounts
-// devPath (ext4) as the container rootfs, so rootfs writes land on the host-backed
-// disk file (off guest RAM) and the CH snapshot stays memory-only with no balloon.
-//
-// One "blk" storage: source is the /dev node (kata's block storage handler mounts
-// it directly when source starts with /dev — no uevent/auto-enumeration wait,
-// unlike a hotplugged disk), fstype ext4, mounted at the container rootfs path.
-// The spec's Root.Path is set to that mount point, which the agent's setup_bundle
-// then uses as the container root.
-func (a *AgentClient) StartBlkWorkload(ctx context.Context, containerID, devPath string, spec *specs.Spec) error {
-	rootfs := "/run/kata-containers/" + containerID + "/rootfs"
-	storages := []*agentpb.Storage{
-		{
-			Driver:     "blk",
-			Source:     devPath,
-			Fstype:     "ext4",
-			MountPoint: rootfs,
-			Options:    []string{"rw"},
-		},
-	}
-
-	pbSpec := SpecToAgentPB(spec)
-	pbSpec.Root = &agentpb.Root{Path: rootfs, Readonly: false}
-
-	if err := a.CreateContainer(ctx, &agentpb.CreateContainerRequest{
-		ContainerId: containerID,
-		ExecId:      containerID,
-		Storages:    storages,
-		OCI:         pbSpec,
-	}); err != nil {
-		return fmt.Errorf("creating blk workload %q: %w", containerID, err)
-	}
-	if err := a.StartContainer(ctx, containerID); err != nil {
-		return fmt.Errorf("starting blk workload %q: %w", containerID, err)
-	}
-	return nil
-}
-
 // SpecToAgentPB converts an OCI runtime spec into the kata-agent's protobuf Spec
-// (agentpb.Spec) for a CreateContainer ttrpc call. The shim normally does this
-// conversion; ateom does it itself when it drives the agent directly ("be your
-// own hook scheduler"). A blind json round-trip does NOT work: agentpb's Spec
-// JSON tags are PascalCase (from oci.proto), while OCI config.json is lowercase.
+// (agentpb.Spec) for a CreateContainer ttrpc call. A blind json round-trip does NOT
+// work: agentpb's Spec JSON tags are PascalCase (from oci.proto), while OCI
+// config.json is lowercase.
 //
 // Only the fields the kata-agent needs to create + start a container are mapped
-// (process, root, mounts, linux namespaces/resources/cgroup/masked+readonly
-// paths). The container rootfs is provided out-of-band as storages; the caller
-// is expected to set the returned spec's Root.Path to the overlay mount point.
+// (process, root, mounts, linux namespaces/resources/cgroup/masked+readonly paths).
+// The container rootfs is provided out-of-band as storages; the caller sets the
+// returned spec's Root.Path to the overlay mount point.
 func SpecToAgentPB(s *specs.Spec) *agentpb.Spec {
 	if s == nil {
 		return nil
diff --git a/cmd/ateom-microvm/restore.go b/cmd/ateom-microvm/restore.go
index 1c7c82b84..015730944 100644
--- a/cmd/ateom-microvm/restore.go
+++ b/cmd/ateom-microvm/restore.go
@@ -37,16 +37,17 @@ import (
 // RestoreWorkload restores the actor on a (possibly different) pod by relaunching
 // cloud-hypervisor directly from the downloaded snapshot and resuming.
 //
-// Contract with atelet: the memory-only snapshot dir (config.json + state.json +
-// memory-ranges + base-id) has been downloaded to RestoreStateDir.
+// Contract with atelet: the snapshot dir (config.json + state.json + memory-ranges +
+// base-id) has been downloaded to RestoreStateDir.
 //
-// There is NO virtiofsd and NO shared-dir to reconstruct — the rootfs is the
-// writable /dev/vdb disk, which CH reopens from the path recorded in the snapshot
-// config.json. Steps: rewrite the vsock socket path to this actor's VMDir,
-// reset /dev/vdb to the golden disk template (or rebuild it from the OCI image),
-// rebuild the tap (the snapshot's virtio-net is fd-backed → fresh net_fds),
-// relaunch CH with --restore, and resume. Guest RAM (incl. the actor's in-memory
-// state and frozen network config) comes back from the memory-only snapshot.
+// Each container's rootfs is overlay(virtio-fs RO lower + guest-tmpfs upper). Steps:
+// reconstruct each RO lower from the local OCI bundle (atelet re-unpacked the golden
+// image) at the frozen find-paths path and start the virtiofsd serving them; rewrite
+// the snapshot config's per-VMDir paths (vsock + serial + fs socket) to this actor's;
+// rebuild the tap (the snapshot's virtio-net is fd-backed → fresh net_fds); relaunch
+// CH with --restore (OnDemand), and resume. Guest RAM — incl. the actor's in-memory
+// state, the tmpfs rootfs upper (so rootfs writes PERSIST), and the frozen network
+// config — comes back from the memory snapshot.
 func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.RestoreWorkloadRequest) (resp *ateompb.RestoreWorkloadResponse, retErr error) {
 	s.lock.Lock()
 	defer s.lock.Unlock()
@@ -77,48 +78,35 @@ func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.Restore
 		return nil, fmt.Errorf("while creating VM dir: %w", err)
 	}
 
-	// Recreate the /dev/vdb backing file the snapshot references (the actor dir),
-	// reset-to-golden. Two ways, both byte-consistent with the golden snapshot's
-	// guest ext4 cache:
-	//   - same-node: a verbatim golden template (copyDiskFile) — guaranteed identical.
-	//   - cross-node: rebuild from the OCI image atelet unpacked to the bundle at
-	//     restore (mkfs.ext4 -d is LAYOUT-deterministic for identical inputs, so the
-	//     data blocks land at the same offsets the guest cache expects; only the
-	//     superblock UUID/timestamps differ, which are cached in RAM and not re-read).
-	// Either way the actor's prior rootfs writes are discarded (gVisor semantics).
+	// Reconstruct each container's overlay RO lower from the LOCAL OCI bundle (atelet
+	// re-unpacked the golden image; the lower is the immutable golden image) at the
+	// frozen find-paths location SharedDir(id)/<cid>/rootfs, and start the one virtiofsd
+	// serving them. The writable upper is a guest tmpfs restored from the memory
+	// snapshot (rootfs writes persist), so there is no disk to rebuild or repoint; the
+	// fs socket in the snapshot config is repointed to this VMDir by
+	// rewriteSnapshotSocketPaths above. Cross-node consistency relies on a deterministic
+	// unpack of the same image at the same <cid>/rootfs path.
 	containers := req.GetSpec().GetContainers()
-	if len(containers) != 1 {
-		return nil, status.Errorf(codes.Unimplemented, "ateom-microvm supports exactly one container, got %d", len(containers))
+	if len(containers) == 0 {
+		return nil, status.Error(codes.InvalidArgument, "actor spec has no containers")
 	}
-	actorDir := ateompath.ActorPath(ns, name, id)
-	diskPath := filepath.Join(actorDir, actorRootfsDiskName)
-	if tmpl := filepath.Join(actorDir, goldenRootfsDiskName); !fileMissing(tmpl) {
-		if err := copyDiskFile(ctx, tmpl, diskPath); err != nil {
-			return nil, fmt.Errorf("while resetting rootfs disk to golden (template): %w", err)
-		}
-		slog.InfoContext(ctx, "Reset actor rootfs disk to golden (template)", slog.String("id", id))
-	} else {
-		bundleRootfs := filepath.Join(ateompath.OCIBundlePath(ns, name, id, containers[0].GetName()), "rootfs")
-		// Cross-node restore rebuilds from the bundle (no local golden template),
-		// so re-inject DNS here too; the same-node golden-copy path above already
-		// carries it from the golden boot.
-		if err := writeGuestResolvConf(bundleRootfs); err != nil {
-			return nil, fmt.Errorf("while writing guest resolv.conf: %w", err)
-		}
-		if err := kata.BuildExt4Image(ctx, bundleRootfs, diskPath); err != nil {
-			return nil, fmt.Errorf("while reconstructing rootfs disk from image: %w", err)
-		}
-		slog.InfoContext(ctx, "Reconstructed actor rootfs disk from image", slog.String("id", id))
+	if len(containers) > maxActorContainers {
+		return nil, status.Errorf(codes.Unimplemented, "ateom-microvm supports at most %d containers, got %d", maxActorContainers, len(containers))
 	}
-
-	// Repoint the snapshot config's writable /dev/vdb disk at THIS actor's
-	// reconstructed backing file. The golden snapshot recorded the golden actor's
-	// per-actor disk path, which is stale on any pod restoring a different actor
-	// (and absent on any node that never ran the golden) — unlike /dev/vda, the
-	// content-addressed kata image whose path is identical on every node.
-	if err := repointActorRootfsDisk(restoreDir, diskPath); err != nil {
-		return nil, fmt.Errorf("while repointing actor rootfs disk in snapshot config: %w", err)
+	ctrs, err := s.buildActorContainers(ns, name, id, containers)
+	if err != nil {
+		return nil, err
+	}
+	vfsdCmd, err := s.stageOverlayLowers(ctx, rr, id, ctrs)
+	if err != nil {
+		return nil, err
 	}
+	defer func() {
+		if retErr != nil && vfsdCmd.Process != nil {
+			_ = vfsdCmd.Process.Kill()
+			_, _ = vfsdCmd.Process.Wait()
+		}
+	}()
 
 	// Networking: rebuild the per-activation veth + tap; the snapshot's virtio-net
 	// is fd-backed, so CH needs fresh tap FDs (net_fds) on restore.
@@ -157,7 +145,7 @@ func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.Restore
 	}
 
 	// Relaunch CH and restore with the tap FDs attached (SCM_RIGHTS). CH reopens
-	// /dev/vda (image) + /dev/vdb (actor rootfs) from the snapshot config paths.
+	// /dev/vda (image) and reconnects each virtio-fs device (overlay RO lower) from the snapshot config paths.
 	apiSocket := filepath.Join(kata.VMDir(id), "clh-api-restore.sock")
 	chCmd, client, err := ch.LaunchVMM(ctx, ch.LaunchVMMOptions{
 		Binary: rr.chBinary, APISocket: apiSocket, Stdout: slogWriter{ctx}, Stderr: slogWriter{ctx},
@@ -184,12 +172,13 @@ func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.Restore
 		return nil, fmt.Errorf("while resuming restored guest: %w", err)
 	}
 
-	ra := &runningActor{chCmd: chCmd, apiSocket: apiSocket, baseID: srcID, restoreSourceDir: restoreDir}
+	ra := &runningActor{chCmd: chCmd, vfsdCmd: vfsdCmd, apiSocket: apiSocket, baseID: srcID, restoreSourceDir: restoreDir}
 
-	// Re-attach stdout/stderr forwarding: the restored guest's container + kata-agent
-	// are alive, so a fresh dial over this actor's vsock resumes ReadStdout/ReadStderr
-	// (same containerID==execID==id as the cold run). Best-effort — a failed dial must
-	// not fail the restore (the actor is already running); forwarding is just skipped.
+	// Re-attach stdout/stderr forwarding for each container: the restored guest's
+	// containers + kata-agent are alive, so a fresh dial over this actor's vsock
+	// resumes ReadStdout/ReadStderr. The overlay workload's container/exec id is
+	// <name>_ovl (same as the cold run). Best-effort — a failed dial must not fail the
+	// restore (the actor is already running); forwarding is just skipped.
 	vsockPath := kata.VsockSocketPath(id)
 	logAC, dialErr := dialAgentRetry(ctx, vsockPath, 15*time.Second)
 	if dialErr != nil {
@@ -197,22 +186,24 @@ func (s *AteomService) RestoreWorkload(ctx context.Context, req *ateompb.Restore
 			slog.String("id", id), slog.Any("err", dialErr))
 	} else {
 		ra.logAgent = logAC
-		s.startActorLogForwarding(logAC, id, name, ns, containers[0].GetName())
+		for _, c := range containers {
+			s.startActorLogForwarding(logAC, id, overlayWorkloadID(c.GetName()), c.GetName(), name, ns)
+		}
 	}
 
 	s.running[id] = ra
 	s.actorLogger.EmitLifecycleLog("Actor restored", id, name, ns)
-	slog.InfoContext(ctx, "Actor restored (owned-boot, virtio-blk rootfs)",
+	slog.InfoContext(ctx, "Actor restored (overlay rootfs)",
 		slog.String("id", id), slog.Duration("total", time.Since(tStart)))
 	return &ateompb.RestoreWorkloadResponse{}, nil
 }
 
-// rewriteSnapshotSocketPaths repoints the snapshot config.json's per-sandbox
-// hybrid-vsock socket from the source actor's VMDir to the restoring actor's
-// VMDir, so the socket we create is the one CH reopens. The kernel and /dev/vda
-// kata image are content-addressed static files with identical paths on every
-// node, so they need no rewrite; the writable /dev/vdb actor rootfs disk is
-// per-actor and is repointed separately (see repointActorRootfsDisk).
+// rewriteSnapshotSocketPaths repoints the snapshot config.json's per-VMDir paths from
+// the source actor's VMDir to the restoring actor's: the hybrid-vsock socket, the
+// File serial console, and each virtio-fs (overlay RO lower) socket, so the sockets/
+// files we create are the ones CH reopens. The kernel and /dev/vda kata image are
+// content-addressed static files with identical paths on every node, so they need no
+// rewrite, and the overlay has no per-actor disk to repoint.
 func rewriteSnapshotSocketPaths(snapshotDir, id string) error {
 	cfgPath := filepath.Join(snapshotDir, "config.json")
 	b, err := os.ReadFile(cfgPath)
@@ -226,8 +217,8 @@ func rewriteSnapshotSocketPaths(snapshotDir, id string) error {
 	if vsock, ok := cfg["vsock"].(map[string]any); ok {
 		vsock["socket"] = kata.VsockSocketPath(id)
 	}
-	// The owned-boot path captures the guest serial console to a file under the
-	// source actor's VMDir (Serial{Mode:"File"}). On restore that path is stale
+	// ateom captures the guest serial console to a file under the source actor's
+	// VMDir (Serial{Mode:"File"}). On restore that path is stale
 	// (points at the golden/source pod's VMDir), so CH's CreateConsoleDevice fails
 	// (No such file or directory). Repoint it at this actor's VMDir.
 	if serial, ok := cfg["serial"].(map[string]any); ok {
@@ -235,6 +226,15 @@ func rewriteSnapshotSocketPaths(snapshotDir, id string) error {
 			serial["file"] = filepath.Join(kata.VMDir(id), "serial.log")
 		}
 	}
+	// The overlay RO lower is served by a per-VMDir virtiofsd socket; the snapshot
+	// recorded the golden actor's, so repoint each fs device at this actor's VMDir.
+	if fss, ok := cfg["fs"].([]any); ok {
+		for _, f := range fss {
+			if fm, ok := f.(map[string]any); ok {
+				fm["socket"] = kata.VirtiofsdSocketPath(id)
+			}
+		}
+	}
 	out, err := json.Marshal(cfg)
 	if err != nil {
 		return err
@@ -244,45 +244,3 @@ func rewriteSnapshotSocketPaths(snapshotDir, id string) error {
 	}
 	return nil
 }
-
-// repointActorRootfsDisk rewrites the snapshot config.json so the writable
-// /dev/vdb actor rootfs disk points at this actor's reconstructed backing file
-// (diskPath). The actor rootfs disk lives under the actor's per-actor directory
-// (keyed by actor id), so the golden snapshot's recorded path is the GOLDEN
-// actor's — stale on any pod restoring a different actor, and absent on any node
-// that never ran the golden. (This is the disk analogue of the serial.file
-// repoint in rewriteSnapshotSocketPaths.) The disk is identified by basename so
-// the read-only /dev/vda kata image (a content-addressed static file) is left
-// untouched; it is an error if no actor rootfs disk is present to repoint.
-func repointActorRootfsDisk(snapshotDir, diskPath string) error {
-	cfgPath := filepath.Join(snapshotDir, "config.json")
-	b, err := os.ReadFile(cfgPath)
-	if err != nil {
-		return err
-	}
-	var cfg map[string]any
-	if err := json.Unmarshal(b, &cfg); err != nil {
-		return fmt.Errorf("parsing %q: %w", cfgPath, err)
-	}
-	rewrote := false
-	if disks, ok := cfg["disks"].([]any); ok {
-		for _, d := range disks {
-			dm, ok := d.(map[string]any)
-			if !ok {
-				continue
-			}
-			if p, _ := dm["path"].(string); filepath.Base(p) == actorRootfsDiskName {
-				dm["path"] = diskPath
-				rewrote = true
-			}
-		}
-	}
-	if !rewrote {
-		return fmt.Errorf("no %q disk found in %q to repoint", actorRootfsDiskName, cfgPath)
-	}
-	out, err := json.Marshal(cfg)
-	if err != nil {
-		return err
-	}
-	return os.WriteFile(cfgPath, out, 0o600)
-}
diff --git a/cmd/ateom-microvm/run.go b/cmd/ateom-microvm/run.go
index 2acdd57bd..d66abc2bf 100644
--- a/cmd/ateom-microvm/run.go
+++ b/cmd/ateom-microvm/run.go
@@ -41,8 +41,6 @@ import (
 // cloud-hypervisor process directly (booted by RunWorkload or relaunched by
 // RestoreWorkload), so it tracks that process and its api-socket for teardown.
 type runningActor struct {
-	containerName string
-
 	// baseID is the FROZEN base sandbox id propagated across this actor's restore
 	// lineage. For a cold-run actor this is the actor's own id; for a restored
 	// actor it is the id read from the snapshot's base-id file (the golden id,
@@ -52,15 +50,18 @@ type runningActor struct {
 
 	// ateom owns this CH process (booted at Run or relaunched at Restore).
 	chCmd *exec.Cmd
+	// vfsdCmd is the virtiofsd serving the overlay RO lower (the CH fs device
+	// demand-pages from it for the actor's lifetime). ateom owns it; teardownActor
+	// kills it after the CH process.
+	vfsdCmd *exec.Cmd
 	// apiSocket is the CH api-socket for this ateom-owned VMM.
 	apiSocket string
 
 	// restoreSourceDir is the snapshot dir this actor was OnDemand-restored from
-	// (the base CH is demand-paging from). Set only on the owned-boot virtio-blk
-	// path when restored via OnDemand. CheckpointWorkload overlays CH's new (sparse,
-	// faulted-only) snapshot onto this base to produce a COMPLETE snapshot (CH's
-	// OnDemand snapshot alone drops the un-faulted pages). Empty for cold-run actors
-	// (their snapshot is already complete).
+	// (CH demand-pages its guest RAM from it). Set when restored via OnDemand.
+	// CheckpointWorkload overlays CH's new (sparse, faulted-only) snapshot onto this
+	// base to produce a COMPLETE snapshot (CH's OnDemand snapshot alone drops the
+	// un-faulted pages). Empty for cold-run actors (their snapshot is already complete).
 	restoreSourceDir string
 
 	// logAgent is the kata-agent ttrpc client kept open for the lifetime of the
@@ -86,42 +87,37 @@ const baseIDFile = "base-id"
 // Asset names in RunWorkloadRequest.runtime_asset_paths (set by atelet's
 // fetchRuntimeAssets, keyed by the ActorTemplate runtime asset names).
 const (
-	assetCH     = "cloud-hypervisor"
-	assetKernel = "kata-kernel"
-	assetImage  = "kata-image"
-	assetConfig = "kata-config"
+	assetCH        = "cloud-hypervisor"
+	assetKernel    = "kata-kernel"
+	assetImage     = "kata-image"
+	assetConfig    = "kata-config"
+	assetVirtiofsd = "virtiofsd"
 )
 
-// actorRootfsDiskName is the actor's writable rootfs disk file under the actor
-// dir; it is the /dev/vdb backing path recorded in the snapshot config.json and
-// reopened verbatim on restore.
-const actorRootfsDiskName = "actor-rootfs.ext4"
-
-// goldenRootfsDiskName is the verbatim copy of the actor's /dev/vdb disk AS-OF the
-// golden snapshot, kept under the actor dir. reset-to-golden recreates /dev/vdb
-// from it on restore (byte-identical to what the snapshot's guest RAM/ext4 cache
-// expects), discarding the actor's later rootfs writes — gVisor semantics.
-const goldenRootfsDiskName = "golden-rootfs.ext4"
-
-// fileMissing reports whether path does not exist.
-func fileMissing(path string) bool {
-	_, err := os.Stat(path)
-	return os.IsNotExist(err)
-}
+// maxActorContainers is a sanity cap on containers per actor (all share the one
+// micro-VM + virtiofsd). 25 is far above any real pod.
+const maxActorContainers = 25
 
-// copyDiskFile copies a (sparse) disk image verbatim, preserving holes so the
-// (mostly-empty) ext4 image doesn't materialize its scratch blocks. Used to
-// save/restore the golden rootfs disk template.
-func copyDiskFile(ctx context.Context, src, dst string) error {
-	tmp := dst + ".tmp"
-	_ = os.Remove(tmp)
-	if out, err := exec.CommandContext(ctx, "cp", "--sparse=always", src, tmp).CombinedOutput(); err != nil {
-		return fmt.Errorf("cp %s -> %s: %w: %s", src, tmp, err, out)
-	}
-	if err := os.Rename(tmp, dst); err != nil {
-		return fmt.Errorf("rename %s -> %s: %w", tmp, dst, err)
-	}
-	return nil
+// overlayWorkloadID is the kata containerID of a container's overlay WORKLOAD,
+// distinct from its carrier container (the carrier keeps the bare container name so
+// the agent binds the RO base to /run/kata-containers/<name>/rootfs; the workload
+// overlays on top). Stable across the restore lineage (container names don't change).
+//
+// The "_" in the "_ovl" suffix is deliberately a character that is invalid in a Kubernetes
+// container name (DNS-1123 labels are [a-z0-9-]): the carrier id is the bare name, so a
+// workload id can never equal a carrier id (a bare name has no "_") nor another workload
+// id (names are unique within an actor) — even for containers named "x" and "x-ovl". A
+// "-ovl" suffix would let "x"'s workload id collide with the "x-ovl" carrier id.
+func overlayWorkloadID(name string) string { return name + "_ovl" }
+
+// actorContainer is one of the actor's containers prepared for the shared micro-VM:
+// its name (also the carrier's kata containerID + the overlay lower's find-paths subdir), the
+// host OCI bundle rootfs that backs the RO lower, and its OCI spec. The writable
+// overlay upper is a guest tmpfs (OverlayUpperBase(name)), so there is no host disk.
+type actorContainer struct {
+	name         string
+	bundleRootfs string
+	spec         *specs.Spec
 }
 
 // resolvedRuntime holds the concrete binary/config paths for a request, taken
@@ -129,6 +125,7 @@ func copyDiskFile(ctx context.Context, src, dst string) error {
 type resolvedRuntime struct {
 	chBinary   string // path to the cloud-hypervisor binary
 	configFile string // path to the kata configuration.toml
+	virtiofsd  string // path to virtiofsd (overlay RO lower); "" => "virtiofsd" on PATH
 }
 
 // firstNonEmpty returns the first non-empty string, or "" if all are empty.
@@ -147,13 +144,14 @@ func (s *AteomService) resolveRuntime(paths map[string]string) resolvedRuntime {
 	return resolvedRuntime{
 		chBinary:   firstNonEmpty(paths[assetCH], s.chBinary),
 		configFile: firstNonEmpty(paths[assetConfig], s.kataConfig),
+		virtiofsd:  paths[assetVirtiofsd],
 	}
 }
 
-// writeGuestResolvConf copies the worker pod's /etc/resolv.conf into the bundle
-// rootfs (before it's packed into the ext4 disk) so the guest gets cluster DNS:
-// ateom drops atelet's resolv.conf bind and sends no CreateSandbox.Dns, so the
-// guest can otherwise reach IPs but not resolve names.
+// writeGuestResolvConf copies the worker pod's /etc/resolv.conf into a container's
+// bundle rootfs (the overlay RO lower) so the guest gets cluster DNS: ateom drops
+// atelet's resolv.conf bind and sends no CreateSandbox.Dns, so the guest can
+// otherwise reach IPs but not resolve names.
 func writeGuestResolvConf(rootfs string) error {
 	content, err := os.ReadFile("/etc/resolv.conf")
 	if err != nil {
@@ -172,20 +170,18 @@ func writeGuestResolvConf(rootfs string) error {
 	return nil
 }
 
-// RunWorkload boots the actor as a cloud-hypervisor micro-VM that ateom owns.
+// RunWorkload boots the actor as a cloud-hypervisor micro-VM and starts its containers.
 //
-// ateom boots cloud-hypervisor itself — no kata shim — and gives the actor a
-// writable boot-time virtio-blk disk (/dev/vdb, built from the OCI bundle rootfs)
-// as its container rootfs. Rootfs data lives on that host-backed disk rather than
-// a guest tmpfs overlay-upper, so the CH snapshot is memory-only with no balloon
-// needed to reclaim a RAM-backed upper. It replicates the kata clh boot (vm.create
-// kernel+image, add-net, vm.boot) and the shim's post-boot work (agent
-// CreateSandbox + guest network config) before driving the kata-agent to start the
-// blk-rootfs container.
+// ateom boots cloud-hypervisor directly (no kata shim) and gives each container an
+// overlay rootfs: its OCI image read-only over virtio-fs (the lower) plus a guest
+// tmpfs (the writable upper). It drives the kata clh boot (vm.create kernel+image+fs,
+// add-net, vm.boot) and the post-boot setup the shim would otherwise do (agent
+// CreateSandbox + guest network config) before having the kata-agent assemble and
+// start each container.
 //
-// Contract with atelet (mirrors ateom-gvisor):
-//   - The runtime assets (guest kernel, guest OS image, cloud-hypervisor, base
-//     kata config) are on disk and passed as runtime asset paths.
+// Contract with atelet:
+//   - The runtime assets (guest kernel, guest OS image, cloud-hypervisor, virtiofsd,
+//     base kata config) are on disk and passed as runtime asset paths.
 //   - The OCI bundle (config.json + populated rootfs/) is prepared per container.
 func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkloadRequest) (resp *ateompb.RunWorkloadResponse, retErr error) {
 	s.lock.Lock()
@@ -197,24 +193,25 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload
 
 	s.actorLogger.EmitLifecycleLog("Actor starting", id, name, ns)
 
-	// KNOWN GAP vs the gVisor runtime: it runs multiple containers per actor; this
-	// runtime is single-container for now. Multi-container is a mechanical extension
-	// (one boot-time virtio-blk rootfs disk + agent CreateContainer per container,
-	// sharing the one guest/sandbox) and is tracked as follow-up work.
+	// All of the actor's containers share the one micro-VM (which is the pod
+	// sandbox): each gets its own overlay rootfs and its own kata-agent
+	// CreateContainer/StartContainer, driven below after the shared boot +
+	// CreateSandbox + guest networking.
 	containers := req.GetSpec().GetContainers()
-	if len(containers) != 1 {
-		return nil, status.Errorf(codes.Unimplemented, "ateom-microvm supports exactly one container, got %d", len(containers))
+	if len(containers) == 0 {
+		return nil, status.Error(codes.InvalidArgument, "actor spec has no containers")
+	}
+	if len(containers) > maxActorContainers {
+		return nil, status.Errorf(codes.Unimplemented, "ateom-microvm supports at most %d containers, got %d", maxActorContainers, len(containers))
 	}
-	containerName := containers[0].GetName()
 
-	// Owned-boot builds the CH vm.create itself, so it needs the guest kernel +
-	// image paths directly.
+	// ateom builds the CH vm.create itself, so it needs the guest kernel + image
+	// paths directly.
 	paths := req.GetRuntimeAssetPaths()
 	kernel, image := paths[assetKernel], paths[assetImage]
 	if kernel == "" || image == "" {
-		return nil, fmt.Errorf("owned-boot requires %q and %q asset paths", assetKernel, assetImage)
+		return nil, fmt.Errorf("ateom-microvm requires %q and %q asset paths", assetKernel, assetImage)
 	}
-	actorDir := ateompath.ActorPath(ns, name, id)
 	rr := s.resolveRuntime(paths)
 
 	// Networking (host side): per-activation veth into the interior netns. The
@@ -230,20 +227,11 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload
 		}
 	}()
 
-	bundle := ateompath.OCIBundlePath(ns, name, id, containerName)
-	spec, err := ensureKataCompatibleSpec(bundle, id, ateompath.AteomNetNSPath(s.podUID))
+	// Prepare each container's OCI spec + record its bundle rootfs (the overlay RO
+	// lower). No host disk — the rootfs is overlay(virtio-fs lower + guest-tmpfs upper).
+	ctrs, err := s.buildActorContainers(ns, name, id, containers)
 	if err != nil {
-		return nil, fmt.Errorf("while preparing kata OCI spec: %w", err)
-	}
-
-	// Build the actor's writable rootfs as a raw ext4 virtio-blk disk from the
-	// atelet-populated OCI bundle rootfs. This becomes /dev/vdb.
-	diskPath := filepath.Join(actorDir, actorRootfsDiskName)
-	if err := writeGuestResolvConf(filepath.Join(bundle, "rootfs")); err != nil {
-		return nil, fmt.Errorf("while writing guest resolv.conf: %w", err)
-	}
-	if err := kata.BuildExt4Image(ctx, filepath.Join(bundle, "rootfs"), diskPath); err != nil {
-		return nil, fmt.Errorf("while building actor rootfs disk: %w", err)
+		return nil, err
 	}
 
 	// Guest sizing + agent kernel params from the kata config.
@@ -258,6 +246,20 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload
 		return nil, fmt.Errorf("while creating VM dir: %w", err)
 	}
 
+	// Stage the overlay RO lowers (bind each image into the shared dir) + start the
+	// virtiofsd that serves them. CH connects to it at vm.create and demand-pages for
+	// the actor's lifetime, so ateom owns the process (killed in teardownActor).
+	vfsdCmd, err := s.stageOverlayLowers(ctx, rr, id, ctrs)
+	if err != nil {
+		return nil, err
+	}
+	defer func() {
+		if retErr != nil && vfsdCmd.Process != nil {
+			_ = vfsdCmd.Process.Kill()
+			_, _ = vfsdCmd.Process.Wait()
+		}
+	}()
+
 	// Launch a bare VMM (CH + api-socket); ateom owns this process for teardown.
 	apiSocket := filepath.Join(kata.VMDir(id), "clh-api.sock")
 	chCmd, client, err := ch.LaunchVMM(ctx, ch.LaunchVMMOptions{
@@ -276,11 +278,12 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload
 		}
 	}()
 
-	// Assemble the CH VmConfig (kata-compatible cmdline, RO image on /dev/vda +
-	// writable rootfs on /dev/vdb). serialLog is also read on a failed agent dial
+	// Assemble the CH VmConfig (kata-compatible cmdline, RO kata image on /dev/vda +
+	// the virtio-fs device for the overlay RO lower; no actor virtio-blk disks — the
+	// writable upper is a guest tmpfs). serialLog is also read on a failed agent dial
 	// below, so keep it here.
 	serialLog := filepath.Join(kata.VMDir(id), "serial.log")
-	vmCfg := buildVMConfig(id, kernel, image, diskPath, kparams, serialLog, memMiB, vcpus)
+	vmCfg := buildVMConfig(id, kernel, image, kparams, serialLog, memMiB, vcpus)
 	if err := client.CreateVM(ctx, vmCfg); err != nil {
 		return nil, fmt.Errorf("while creating VM: %w", err)
 	}
@@ -308,7 +311,7 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload
 	if err := client.BootVM(ctx); err != nil {
 		return nil, fmt.Errorf("while booting VM: %w", err)
 	}
-	slog.InfoContext(ctx, "Micro-VM booted (owned-boot)", slog.String("id", id), slog.String("api", apiSocket))
+	slog.InfoContext(ctx, "Micro-VM booted", slog.String("id", id), slog.String("api", apiSocket))
 
 	// Dial the kata-agent over hybrid-vsock. The agent only starts listening once
 	// the guest's init reaches kata-containers.target — well after CH creates the
@@ -335,25 +338,81 @@ func (s *AteomService) RunWorkload(ctx context.Context, req *ateompb.RunWorkload
 		}
 	}()
 
-	// Post-boot kata-agent setup: sandbox, guest networking, start the container.
-	if err := s.startActorContainer(ctx, ac, id, vsockPath, spec); err != nil {
+	// Post-boot kata-agent setup: sandbox, guest networking, start each container.
+	if err := s.startActorContainers(ctx, ac, id, vsockPath, ctrs); err != nil {
 		return nil, err
 	}
 
-	ra := &runningActor{chCmd: chCmd, apiSocket: apiSocket, containerName: containerName, baseID: id, logAgent: ac}
+	ra := &runningActor{chCmd: chCmd, vfsdCmd: vfsdCmd, apiSocket: apiSocket, baseID: id, logAgent: ac}
 	s.running[id] = ra
 
-	// Forward the actor container's stdout/stderr into the pod logs (parity with
-	// ateom-gvisor). StartBlkWorkload uses containerID==execID==id, so the agent
-	// keys the streams by id. The goroutines read over ac for the actor's lifetime
-	// and exit (io.EOF) when teardownActor closes ac.
-	s.startActorLogForwarding(ac, id, name, ns, containerName)
+	// Forward each container's stdout/stderr into the pod logs. The overlay workload's
+	// container/exec id is <name>_ovl (see startOverlayContainer), so key the streams by
+	// that and tag with the display container name. The goroutines read over ac for the
+	// actor's lifetime and exit (io.EOF) when teardownActor closes ac.
+	for _, c := range ctrs {
+		s.startActorLogForwarding(ac, id, overlayWorkloadID(c.name), c.name, name, ns)
+	}
 
 	s.actorLogger.EmitLifecycleLog("Actor started", id, name, ns)
-	slog.InfoContext(ctx, "Actor started (owned-boot, virtio-blk rootfs)", slog.String("id", id))
+	slog.InfoContext(ctx, "Actor started (overlay rootfs)", slog.String("id", id))
 	return &ateompb.RunWorkloadResponse{}, nil
 }
 
+// buildActorContainers prepares each of the actor's containers for the shared
+// micro-VM: it loads the OCI spec from the per-container bundle, injects guest DNS,
+// and records the bundle rootfs that backs the overlay's RO lower. No host disk is
+// built — the rootfs is overlay(virtio-fs RO lower + guest-tmpfs upper); the lowers
+// are bound into virtiofsd's shared dir in stageOverlayLowers after the sandbox state
+// is clean. Both RunWorkload and RestoreWorkload go through here.
+func (s *AteomService) buildActorContainers(ns, name, id string, containers []*ateompb.Container) ([]actorContainer, error) {
+	netnsPath := ateompath.AteomNetNSPath(s.podUID)
+	ctrs := make([]actorContainer, len(containers))
+	for i, c := range containers {
+		cn := c.GetName()
+		bundle := ateompath.OCIBundlePath(ns, name, id, cn)
+		spec, err := ensureKataCompatibleSpec(bundle, id, netnsPath)
+		if err != nil {
+			return nil, fmt.Errorf("while preparing kata OCI spec for %q: %w", cn, err)
+		}
+		bundleRootfs := filepath.Join(bundle, "rootfs")
+		// Write cluster DNS into the lower before it's served over virtio-fs: ateom
+		// drops atelet's resolv.conf bind and sends no CreateSandbox.Dns, so without
+		// this the guest can reach IPs but not resolve names. Doing it here covers both
+		// run and restore (both reconstruct the lower from the bundle).
+		if err := writeGuestResolvConf(bundleRootfs); err != nil {
+			return nil, fmt.Errorf("while writing guest resolv.conf for %q: %w", cn, err)
+		}
+		ctrs[i] = actorContainer{name: cn, bundleRootfs: bundleRootfs, spec: spec}
+	}
+	return ctrs, nil
+}
+
+// stageOverlayLowers makes each container's RO lower available to virtiofsd by
+// bind-mounting its OCI image rootfs into virtiofsd's find-paths location
+// (SharedDir(id)/<cid>/rootfs), then starts the one virtiofsd that serves them all.
+// Must run AFTER CleanupSandboxState (which wipes SharedDir) and after the VM dir exists.
+// The returned virtiofsd cmd outlives this call (CH demand-pages from it); the caller
+// owns it (tracked on runningActor, killed in teardownActor).
+func (s *AteomService) stageOverlayLowers(ctx context.Context, rr resolvedRuntime, id string, ctrs []actorContainer) (*exec.Cmd, error) {
+	for _, c := range ctrs {
+		if err := kata.ReconstructSharedDirFromImage(ctx, c.bundleRootfs, id, c.name); err != nil {
+			return nil, fmt.Errorf("while staging overlay lower for %q: %w", c.name, err)
+		}
+	}
+	vfsdLog, _ := os.OpenFile(filepath.Join(kata.VMDir(id), "virtiofsd.log"), os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600)
+	vfsdCmd, err := kata.StartVirtiofsd(ctx, kata.VirtiofsdOptions{
+		Binary:     rr.virtiofsd,
+		SocketPath: kata.VirtiofsdSocketPath(id),
+		SharedDir:  kata.SharedDir(id),
+		Log:        vfsdLog,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("while starting virtiofsd: %w", err)
+	}
+	return vfsdCmd, nil
+}
+
 // guestConfig reads guest sizing + agent kernel params from the resolved kata
 // config, enabling the debug console (vsock 1026) for in-guest diagnostics and,
 // with kataDebug, raising the agent log level.
@@ -373,16 +432,13 @@ func (s *AteomService) guestConfig(rr resolvedRuntime) (memMiB, vcpus int, kpara
 	return cfg.MemoryMiB, cfg.VCPUs, kparams, nil
 }
 
-// buildVMConfig assembles the cloud-hypervisor VmConfig for the owned boot. The
-// kernel cmdline replicates kata's clh boot cmdline (verified against a live kata
-// snapshot's payload.cmdline): beyond the root/clh base params it MUST include
-// systemd.unit=kata-containers.target (else systemd boots the default target and
-// powers off — the guest exits ~6s in) and mask systemd-networkd (the agent owns
-// eth0). The console is ARCH-SPECIFIC: ttyAMA0 (PL011) on arm64, ttyS0 (8250) on
-// amd64 — the wrong one => "unable to open an initial console". The config's
-// kernel_params are appended; serial is captured to serialLog for boot debugging.
-// The RO guest image is /dev/vda, the writable rootfs /dev/vdb.
-func buildVMConfig(id, kernel, image, diskPath, kparams, serialLog string, memMiB, vcpus int) ch.VmConfig {
+// buildVMConfig assembles the cloud-hypervisor VmConfig. The kernel cmdline replicates
+// kata's clh boot cmdline; beyond the base params it must set
+// systemd.unit=kata-containers.target (else the guest powers off ~6s in) and mask
+// systemd-networkd (the agent owns eth0). The console is arch-specific: ttyAMA0 on
+// arm64, ttyS0 on amd64. /dev/vda is the RO guest image; the actor rootfs's RO lower is
+// the virtio-fs device on PCI segment 1 (hence num_pci_segments=2), with no actor disks.
+func buildVMConfig(id, kernel, image, kparams, serialLog string, memMiB, vcpus int) ch.VmConfig {
 	console := "ttyS0"
 	if runtime.GOARCH == "arm64" {
 		console = "ttyAMA0"
@@ -399,22 +455,28 @@ func buildVMConfig(id, kernel, image, diskPath, kparams, serialLog string, memMi
 		Payload: ch.PayloadConfig{Kernel: kernel, Cmdline: cmdline},
 		Disks: []ch.DiskConfig{
 			{Path: image, Readonly: true, ImageType: "Raw", NumQueues: int32(vcpus), QueueSize: 1024},
-			{Path: diskPath, Readonly: false, ImageType: "Raw", NumQueues: int32(vcpus), QueueSize: 1024},
 		},
-		Rng:    &ch.RngConfig{Src: "/dev/urandom"},
-		Serial: &ch.ConsoleConfig{Mode: "File", File: serialLog},
-		Vsock:  &ch.VsockConfig{Cid: 3, Socket: kata.VsockSocketPath(id)},
+		Fs: []ch.FsConfig{{
+			Tag: kata.FsTag, Socket: kata.VirtiofsdSocketPath(id),
+			NumQueues: 1, QueueSize: 1024, PciSegment: 1,
+		}},
+		Platform: &ch.PlatformConfig{NumPciSegments: 2},
+		Rng:      &ch.RngConfig{Src: "/dev/urandom"},
+		Serial:   &ch.ConsoleConfig{Mode: "File", File: serialLog},
+		Vsock:    &ch.VsockConfig{Cid: 3, Socket: kata.VsockSocketPath(id)},
 	}
 }
 
-// startActorContainer performs the post-boot kata-agent setup the shim normally
-// does at boot: establish the sandbox, configure guest networking (eth0
-// IP/MAC/MTU + routes), and start the actor container on its /dev/vdb rootfs. On
-// failure it dumps guest diagnostics over the debug console.
-func (s *AteomService) startActorContainer(ctx context.Context, ac *kata.AgentClient, id, vsockPath string, spec *specs.Spec) error {
-	// Establish the agent sandbox (the shim normally does this at boot).
+// startActorContainers performs the post-boot kata-agent setup the shim normally
+// does at boot: establish the sandbox once (mounting the kataShared virtio-fs base),
+// configure guest networking (eth0 IP/MAC/MTU + routes) once, then start each
+// container on its own overlay rootfs. On failure it dumps guest diagnostics.
+func (s *AteomService) startActorContainers(ctx context.Context, ac *kata.AgentClient, id, vsockPath string, ctrs []actorContainer) error {
+	// Establish the agent sandbox + the kataShared virtio-fs mount (the RO base for
+	// every container's overlay lower). All containers share it, so use the first
+	// container's hostname.
 	sbCtx, sbCancel := context.WithTimeout(ctx, 20*time.Second)
-	err := ac.CreateSandbox(sbCtx, &agentpb.CreateSandboxRequest{Hostname: spec.Hostname, SandboxId: id})
+	err := ac.CreateSandboxForActor(sbCtx, id, ctrs[0].spec.Hostname)
 	sbCancel()
 	if err != nil {
 		return fmt.Errorf("while creating agent sandbox: %w", err)
@@ -431,16 +493,38 @@ func (s *AteomService) startActorContainer(ctx context.Context, ac *kata.AgentCl
 		return fmt.Errorf("while configuring guest network: %w", err)
 	}
 
-	// Start the actor with its rootfs on /dev/vdb (single blk storage).
+	for _, c := range ctrs {
+		if err := startOverlayContainer(ctx, ac, vsockPath, c); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// startOverlayContainer brings up one container's rootfs as overlay(virtio-fs RO
+// lower + guest-tmpfs upper): a carrier container (id == name) eager-binds the RO base
+// to /run/kata-containers/<name>/rootfs, then the workload (id == <name>_ovl) overlays
+// it with a tmpfs upper. On failure it dumps the guest overlay state.
+func startOverlayContainer(ctx context.Context, ac *kata.AgentClient, vsockPath string, c actorContainer) error {
+	carrierCtx, carrierCancel := context.WithTimeout(ctx, 30*time.Second)
+	err := ac.CreateCarrier(carrierCtx, c.name, c.spec)
+	carrierCancel()
+	if err != nil {
+		dump := kata.DebugConsoleDump(ctx, vsockPath, "echo '== shared/containers =='; ls -la /run/kata-containers/shared/containers/ 2>&1 | head -40")
+		slog.ErrorContext(ctx, "carrier create failed; dump", slog.String("container", c.name), slog.String("dump", dump))
+		return fmt.Errorf("while creating carrier %q: %w", c.name, err)
+	}
+
+	upperBase := kata.OverlayUpperBase(c.name)
 	wlCtx, wlCancel := context.WithTimeout(ctx, 30*time.Second)
-	err = ac.StartBlkWorkload(wlCtx, id, "/dev/vdb", spec)
+	err = ac.StartOverlayWorkload(wlCtx, c.name, overlayWorkloadID(c.name), upperBase, c.spec)
 	wlCancel()
 	if err != nil {
 		dump := kata.DebugConsoleDump(ctx, vsockPath,
-			"echo '== /dev/vdb =='; ls -l /dev/vdb 2>&1; blkid /dev/vdb 2>&1; "+
-				"echo '== mounts =='; grep kata /proc/mounts 2>&1")
-		slog.ErrorContext(ctx, "blk workload failed; dump", slog.String("dump", dump))
-		return fmt.Errorf("while starting blk workload: %w", err)
+			"echo '== upper =='; ls -la "+upperBase+" 2>&1; echo '== lower =='; ls /run/kata-containers/"+c.name+"/rootfs/ 2>&1 | head; "+
+				"echo '== mounts =='; grep -E 'kata|overlay' /proc/mounts 2>&1")
+		slog.ErrorContext(ctx, "overlay workload failed; dump", slog.String("container", c.name), slog.String("dump", dump))
+		return fmt.Errorf("while starting overlay workload %q: %w", c.name, err)
 	}
 	return nil
 }
@@ -450,17 +534,18 @@ func (s *AteomService) startActorContainer(ctx context.Context, ac *kata.AgentCl
 // ReadStdout/ReadStderr) through the shared actorlog forwarder, which annotates
 // each line with the actor's ate.dev/* labels and writes it to the pod's stdout.
 //
-// The streams are keyed by containerID==execID==id (the value StartBlkWorkload
-// passed); lines are tagged with the container name (ate.dev/container_name). The
-// reader contexts are context.Background() — the goroutines are NOT bound to the RPC
-// that started them; they terminate when ac is closed (by teardownActor), which
-// makes the in-flight ReadStdout/ReadStderr fail and the StreamReader return
-// io.EOF, ending WrapContainerLogs. This keeps the agent connection (which ttrpc
-// allows concurrent Calls on) alive for forwarding while guaranteeing no goroutine
-// outlives the connection.
-func (s *AteomService) startActorLogForwarding(ac *kata.AgentClient, id, name, ns, containerName string) {
-	go s.actorLogger.WrapContainerLogs(kata.NewStdioReader(context.Background(), ac, id, id, false), id, name, ns, containerName)
-	go s.actorLogger.WrapContainerLogs(kata.NewStdioReader(context.Background(), ac, id, id, true), id, name, ns, containerName)
+// The streams are keyed by streamID, which is passed as both the kata containerID and
+// execID (the overlay workload id); lines are tagged with actorID + containerName
+// (ate.dev/container_name) so a multi-container actor's logs can be demultiplexed.
+// The reader contexts are context.Background() — the goroutines are NOT bound to the
+// RPC that started them; they terminate when ac is closed (by teardownActor), which
+// makes the in-flight ReadStdout/ReadStderr fail and the StreamReader return io.EOF,
+// ending WrapContainerLogs. This keeps the agent connection (which ttrpc allows
+// concurrent Calls on) alive for forwarding while guaranteeing no goroutine outlives
+// the connection.
+func (s *AteomService) startActorLogForwarding(ac *kata.AgentClient, actorID, streamID, containerName, name, ns string) {
+	go s.actorLogger.WrapContainerLogs(kata.NewStdioReader(context.Background(), ac, streamID, streamID, false), actorID, name, ns, containerName)
+	go s.actorLogger.WrapContainerLogs(kata.NewStdioReader(context.Background(), ac, streamID, streamID, true), actorID, name, ns, containerName)
 }
 
 // dialAgentRetry polls DialAgent until the kata-agent answers the hybrid-vsock
diff --git a/cmd/ateom-microvm/service_integration_test.go b/cmd/ateom-microvm/service_integration_test.go
deleted file mode 100644
index ab79446ef..000000000
--- a/cmd/ateom-microvm/service_integration_test.go
+++ /dev/null
@@ -1,473 +0,0 @@
-//go:build linux
-
-// Copyright 2026 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-import (
-	"context"
-	"encoding/json"
-	"fmt"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"strings"
-	"testing"
-	"time"
-
-	"github.com/agent-substrate/substrate/cmd/ateom-microvm/internal/ch"
-	"github.com/agent-substrate/substrate/cmd/ateom-microvm/internal/kata"
-	"github.com/agent-substrate/substrate/internal/actorlog"
-	"github.com/agent-substrate/substrate/internal/ateompath"
-	"github.com/agent-substrate/substrate/internal/proto/ateompb"
-	"github.com/vishvananda/netns"
-)
-
-// TestServiceRunBlkRootfs covers the owned-boot cold-run path: ateom boots
-// cloud-hypervisor itself and gives the actor a writable boot-time virtio-blk
-// rootfs (/dev/vdb), then drives the kata-agent to start the container. It
-// exercises only run (no checkpoint/restore). Unlike TestServiceE2E it MUST pass
-// the guest kernel + image + base-config asset paths, because owned-boot builds
-// the CH vm.create itself rather than reading configuration.toml.
-//
-// Gated behind KATA_INTEGRATION=1. Required env:
-//
-//	KATA_ROOTFS_SRC=<dir>   a populated actor rootfs (becomes /dev/vdb)
-//	KATA_KERNEL=<path>      guest kernel (vmlinux.container)
-//	KATA_IMAGE=<path>       guest OS image (kata-containers.img, /dev/vda)
-//	KATA_CONFIG=<path>      a stock kata clh configuration.toml (for kernel_params + sizing)
-//
-// Optional: KATA_CH / KATA_VIRTIOFSD (defaults provided). Run as root on a host
-// with kata + /dev/kvm + mkfs.ext4 (e2fsprogs):
-//
-//	sudo KATA_INTEGRATION=1 KATA_ROOTFS_SRC=/path/to/rootfs KATA_KERNEL=... KATA_IMAGE=... \
-//	  KATA_CONFIG=... ./ateom-microvm.test -test.v -test.run BlkRootfs
-func TestServiceRunBlkRootfs(t *testing.T) {
-	if os.Getenv("KATA_INTEGRATION") != "1" {
-		t.Skip("set KATA_INTEGRATION=1 to run (requires kata + /dev/kvm + root + e2fsprogs)")
-	}
-	rootfsSrc := os.Getenv("KATA_ROOTFS_SRC")
-	if rootfsSrc == "" {
-		t.Fatal("KATA_ROOTFS_SRC is required")
-	}
-	kernel, image, cfg := os.Getenv("KATA_KERNEL"), os.Getenv("KATA_IMAGE"), os.Getenv("KATA_CONFIG")
-	if kernel == "" || image == "" || cfg == "" {
-		t.Fatal("KATA_KERNEL, KATA_IMAGE, and KATA_CONFIG are required for the owned-boot path")
-	}
-	chBin := envOrTest("KATA_CH", "/usr/local/bin/cloud-hypervisor")
-
-	ns, name := "default", "e2e-blk"
-	id := fmt.Sprintf("ateomchv-blk-%d", os.Getpid())
-	container := "app"
-
-	bundle := ateompath.OCIBundlePath(ns, name, id, container)
-	rootfs := filepath.Join(bundle, "rootfs")
-	if err := os.MkdirAll(rootfs, 0o755); err != nil {
-		t.Fatal(err)
-	}
-	if out, err := exec.Command("cp", "-a", rootfsSrc+"/.", rootfs+"/").CombinedOutput(); err != nil {
-		t.Fatalf("copying rootfs: %v: %s", err, out)
-	}
-	writeMinimalGvisorStyleSpec(t, bundle)
-
-	podUID := "testpod-blk"
-	_ = netns.DeleteNamed(ateompath.AteomNetNSName(podUID))
-	interiorNetNS, err := createNetNSWithoutSwitching(ateompath.AteomNetNSName(podUID))
-	if err != nil {
-		t.Fatalf("creating interior netns: %v", err)
-	}
-	svc := NewService(podUID, chBin, "", true, interiorNetNS, actorlog.NewActorLogger(actorlog.NewSyncedWriter(os.Stdout), false))
-	ctx, cancel := context.WithTimeout(context.Background(), 180*time.Second)
-	defer cancel()
-
-	t.Cleanup(func() {
-		cctx, c := context.WithTimeout(context.Background(), 20*time.Second)
-		svc.teardownActor(cctx, id, svc.running[id], nil)
-		c()
-		_ = os.RemoveAll(ateompath.ActorPath(ns, name, id))
-		_ = os.RemoveAll(kata.VMDir(id))
-		_ = interiorNetNS.Close()
-		_ = netns.DeleteNamed(ateompath.AteomNetNSName(podUID))
-	})
-
-	if _, err := svc.RunWorkload(ctx, &ateompb.RunWorkloadRequest{
-		ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id,
-		Spec: &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}},
-		RuntimeAssetPaths: map[string]string{
-			assetKernel: kernel,
-			assetImage:  image,
-			assetConfig: cfg,
-			assetCH:     chBin,
-		},
-	}); err != nil {
-		// Best-effort: dump the guest serial console (captured to VMDir/serial.log)
-		// so a boot failure shows the kernel/agent output.
-		if b, rerr := os.ReadFile(filepath.Join(kata.VMDir(id), "serial.log")); rerr == nil {
-			t.Logf("[serial.log tail]\n%s", lastLines(string(b), 60))
-		}
-		t.Fatalf("RunWorkload (owned-boot): %v", err)
-	}
-	t.Log("RunWorkload OK (owned-boot: CH booted by ateom, actor rootfs on /dev/vdb)")
-
-	// Liveness: the ateom-owned CH must be up and the VM Running.
-	client := ch.NewClient(filepath.Join(kata.VMDir(id), "clh-api.sock"))
-	if err := client.WaitReady(ctx, 10*time.Second); err != nil {
-		t.Fatalf("owned CH not ready: %v", err)
-	}
-	// Confirm the actor's rootfs really came from /dev/vdb (a marker visible via
-	// the guest debug console — the actor's own files live on the blk disk).
-	dump := kata.DebugConsoleDump(ctx, kata.VsockSocketPath(id),
-		"echo '== vdb =='; blkid /dev/vdb 2>&1; echo '== rootfs mount =='; grep vdb /proc/mounts 2>&1; echo '== ip =='; ip -4 addr show eth0 2>&1")
-	t.Logf("[guest] %s", dump)
-}
-
-// TestServiceCheckpointRestoreBlkRootfs exercises memory-only snapshot + restore
-// with in-RAM continuity: the owned-boot actor snapshots MEMORY-ONLY (no
-// shared-dir.tar, no balloon) and restores with its guest RAM intact. It writes a
-// sentinel into guest tmpfs (/run = RAM), checkpoints,
-// ships the snapshot dir, restores on a fresh CH process, and reads the sentinel
-// back — if RAM continuity holds it survives. Same gating/env as
-// TestServiceRunBlkRootfs.
-func TestServiceCheckpointRestoreBlkRootfs(t *testing.T) {
-	if os.Getenv("KATA_INTEGRATION") != "1" {
-		t.Skip("set KATA_INTEGRATION=1 to run (requires kata + /dev/kvm + root + e2fsprogs)")
-	}
-	rootfsSrc := os.Getenv("KATA_ROOTFS_SRC")
-	kernel, image, cfg := os.Getenv("KATA_KERNEL"), os.Getenv("KATA_IMAGE"), os.Getenv("KATA_CONFIG")
-	if rootfsSrc == "" || kernel == "" || image == "" || cfg == "" {
-		t.Fatal("KATA_ROOTFS_SRC, KATA_KERNEL, KATA_IMAGE, KATA_CONFIG are required")
-	}
-	chBin := envOrTest("KATA_CH", "/usr/local/bin/cloud-hypervisor")
-
-	ns, name := "default", "e2e-blkcr"
-	id := fmt.Sprintf("ateomchv-blkcr-%d", os.Getpid())
-	container := "app"
-
-	bundle := ateompath.OCIBundlePath(ns, name, id, container)
-	rootfs := filepath.Join(bundle, "rootfs")
-	if err := os.MkdirAll(rootfs, 0o755); err != nil {
-		t.Fatal(err)
-	}
-	if out, err := exec.Command("cp", "-a", rootfsSrc+"/.", rootfs+"/").CombinedOutput(); err != nil {
-		t.Fatalf("copying rootfs: %v: %s", err, out)
-	}
-	writeMinimalGvisorStyleSpec(t, bundle)
-
-	podUID := "testpod-blkcr"
-	_ = netns.DeleteNamed(ateompath.AteomNetNSName(podUID))
-	interiorNetNS, err := createNetNSWithoutSwitching(ateompath.AteomNetNSName(podUID))
-	if err != nil {
-		t.Fatalf("creating interior netns: %v", err)
-	}
-	svc := NewService(podUID, chBin, "", true, interiorNetNS, actorlog.NewActorLogger(actorlog.NewSyncedWriter(os.Stdout), false))
-	ctx, cancel := context.WithTimeout(context.Background(), 240*time.Second)
-	defer cancel()
-	t.Cleanup(func() {
-		cctx, c := context.WithTimeout(context.Background(), 20*time.Second)
-		svc.teardownActor(cctx, id, svc.running[id], nil)
-		c()
-		_ = os.RemoveAll(ateompath.ActorPath(ns, name, id))
-		_ = os.RemoveAll(kata.VMDir(id))
-		_ = interiorNetNS.Close()
-		_ = netns.DeleteNamed(ateompath.AteomNetNSName(podUID))
-	})
-
-	assets := map[string]string{assetKernel: kernel, assetImage: image, assetConfig: cfg, assetCH: chBin}
-	if _, err := svc.RunWorkload(ctx, &ateompb.RunWorkloadRequest{
-		ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id,
-		Spec:              &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}},
-		RuntimeAssetPaths: assets,
-	}); err != nil {
-		t.Fatalf("RunWorkload: %v", err)
-	}
-	t.Log("RunWorkload OK")
-
-	// Write an in-RAM (tmpfs /run) sentinel via the guest debug console.
-	const sentinel = "BLKROOT_CONTINUITY_OK_4242"
-	vsock := kata.VsockSocketPath(id)
-	_ = kata.DebugConsoleDump(ctx, vsock, "echo "+sentinel+" > /run/blkroot-sentinel; sync; echo wrote")
-	if got := kata.DebugConsoleDump(ctx, vsock, "cat /run/blkroot-sentinel"); !strings.Contains(got, sentinel) {
-		t.Fatalf("sentinel not readable pre-checkpoint: %q", got)
-	}
-	t.Log("wrote in-RAM sentinel")
-
-	// CheckpointWorkload — memory-only, no balloon/wipe.
-	if _, err := svc.CheckpointWorkload(ctx, &ateompb.CheckpointWorkloadRequest{
-		ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id,
-		Spec: &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}},
-	}); err != nil {
-		t.Fatalf("CheckpointWorkload: %v", err)
-	}
-	checkpointDir := ateompath.CheckpointStateDir(ns, name, id)
-	for _, f := range []string{"config.json", "state.json", "memory-ranges", "base-id"} {
-		if _, err := os.Stat(filepath.Join(checkpointDir, f)); err != nil {
-			t.Fatalf("checkpoint missing %q: %v", f, err)
-		}
-	}
-	if _, err := os.Stat(filepath.Join(checkpointDir, "shared-dir.tar")); err == nil {
-		t.Error("snapshot has shared-dir.tar — owned-boot must be MEMORY-ONLY (no virtio-fs base)")
-	}
-	t.Log("CheckpointWorkload OK (memory-only: config/state/memory-ranges/base-id, no shared-dir.tar)")
-
-	// Ship snapshot dir -> restore dir (simulating atelet object-storage round trip).
-	restoreDir := ateompath.RestoreStateDir(ns, name, id)
-	if err := os.MkdirAll(restoreDir, 0o700); err != nil {
-		t.Fatal(err)
-	}
-	if out, err := exec.Command("cp", "-a", checkpointDir+"/.", restoreDir+"/").CombinedOutput(); err != nil {
-		t.Fatalf("shipping snapshot: %v: %s", err, out)
-	}
-
-	// RestoreWorkload — reopen /dev/vdb, no virtiofsd/reconstruct.
-	if _, err := svc.RestoreWorkload(ctx, &ateompb.RestoreWorkloadRequest{
-		ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id,
-		Spec:              &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}},
-		RuntimeAssetPaths: assets,
-	}); err != nil {
-		t.Fatalf("RestoreWorkload: %v", err)
-	}
-	client := ch.NewClient(filepath.Join(kata.VMDir(id), "clh-api-restore.sock"))
-	if err := client.WaitReady(ctx, 10*time.Second); err != nil {
-		t.Fatalf("restored CH not ready: %v", err)
-	}
-	t.Log("RestoreWorkload OK")
-
-	// In-RAM continuity: the sentinel written before checkpoint must survive.
-	got := kata.DebugConsoleDump(ctx, vsock, "cat /run/blkroot-sentinel")
-	if !strings.Contains(got, sentinel) {
-		t.Fatalf("RAM continuity FAILED: sentinel gone after restore #1: %q", got)
-	}
-	t.Logf("cycle1 OK: memory-only snapshot + restore, in-RAM continuity (%q)", strings.TrimSpace(got))
-
-	// --- SECOND cycle: checkpoint-AFTER-restore. This is the OnDemand diff-snapshot
-	// case — CH writes only the faulted delta and CheckpointWorkload overlays it onto
-	// the restore source to rebuild a COMPLETE snapshot. If the merge is wrong the
-	// snapshot is incomplete and restore #2 boots a corrupt guest (sentinel gone /
-	// unreachable). Write a SECOND sentinel first so we also prove pages dirtied in
-	// THIS activation are captured by the merge. ---
-	const sentinel2 = "BLKROOT_CYCLE2_OK_8888"
-	_ = kata.DebugConsoleDump(ctx, vsock, "echo "+sentinel2+" > /run/blkroot-sentinel2; sync")
-	if _, err := svc.CheckpointWorkload(ctx, &ateompb.CheckpointWorkloadRequest{
-		ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id,
-		Spec: &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}},
-	}); err != nil {
-		t.Fatalf("CheckpointWorkload #2 (merge): %v", err)
-	}
-	// Ship the merged snapshot (overwrites restoreDir AFTER the merge read it).
-	if out, err := exec.Command("cp", "-a", checkpointDir+"/.", restoreDir+"/").CombinedOutput(); err != nil {
-		t.Fatalf("shipping snapshot #2: %v: %s", err, out)
-	}
-	if _, err := svc.RestoreWorkload(ctx, &ateompb.RestoreWorkloadRequest{
-		ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id,
-		Spec:              &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}},
-		RuntimeAssetPaths: assets,
-	}); err != nil {
-		t.Fatalf("RestoreWorkload #2: %v", err)
-	}
-	client2 := ch.NewClient(filepath.Join(kata.VMDir(id), "clh-api-restore.sock"))
-	if err := client2.WaitReady(ctx, 10*time.Second); err != nil {
-		t.Fatalf("restored CH #2 not ready: %v", err)
-	}
-	// BOTH sentinels must survive: sentinel (from cycle 1, an un-faulted source page
-	// recovered by the overlay) AND sentinel2 (dirtied this cycle, in CH's delta).
-	g1 := kata.DebugConsoleDump(ctx, vsock, "cat /run/blkroot-sentinel")
-	g2 := kata.DebugConsoleDump(ctx, vsock, "cat /run/blkroot-sentinel2")
-	if !strings.Contains(g1, sentinel) {
-		t.Fatalf("merge INCOMPLETE: cycle-1 sentinel lost after restore #2 (un-faulted source page dropped): %q", g1)
-	}
-	if !strings.Contains(g2, sentinel2) {
-		t.Fatalf("merge lost the cycle-2 delta: sentinel2 gone after restore #2: %q", g2)
-	}
-	t.Logf("OnDemand-merge OK: 2-cycle suspend/resume, both sentinels survived (%q | %q)",
-		strings.TrimSpace(g1), strings.TrimSpace(g2))
-}
-
-// TestServiceResetToGoldenBlkRootfs exercises reset-to-golden. From the
-// golden snapshot, each restore recreates /dev/vdb byte-identical to the golden
-// disk template, so an actor's rootfs writes do NOT persist into the next
-// activation, while in-RAM state from the golden snapshot DOES. Two restores from
-// the same golden snapshot: restore#1 writes a disk sentinel (runtime); restore#2
-// must NOT see it (disk reset), while the RAM sentinel survives both.
-func TestServiceResetToGoldenBlkRootfs(t *testing.T) {
-	if os.Getenv("KATA_INTEGRATION") != "1" {
-		t.Skip("set KATA_INTEGRATION=1 to run (requires kata + /dev/kvm + root + e2fsprogs)")
-	}
-	rootfsSrc := os.Getenv("KATA_ROOTFS_SRC")
-	kernel, image, cfg := os.Getenv("KATA_KERNEL"), os.Getenv("KATA_IMAGE"), os.Getenv("KATA_CONFIG")
-	if rootfsSrc == "" || kernel == "" || image == "" || cfg == "" {
-		t.Fatal("KATA_ROOTFS_SRC, KATA_KERNEL, KATA_IMAGE, KATA_CONFIG are required")
-	}
-	chBin := envOrTest("KATA_CH", "/usr/local/bin/cloud-hypervisor")
-
-	ns, name := "default", "e2e-blkrtg"
-	id := fmt.Sprintf("ateomchv-blkrtg-%d", os.Getpid())
-	container := "app"
-
-	bundle := ateompath.OCIBundlePath(ns, name, id, container)
-	if err := os.MkdirAll(filepath.Join(bundle, "rootfs"), 0o755); err != nil {
-		t.Fatal(err)
-	}
-	if out, err := exec.Command("cp", "-a", rootfsSrc+"/.", filepath.Join(bundle, "rootfs")+"/").CombinedOutput(); err != nil {
-		t.Fatalf("copying rootfs: %v: %s", err, out)
-	}
-	writeMinimalGvisorStyleSpec(t, bundle)
-
-	podUID := "testpod-blkrtg"
-	_ = netns.DeleteNamed(ateompath.AteomNetNSName(podUID))
-	interiorNetNS, err := createNetNSWithoutSwitching(ateompath.AteomNetNSName(podUID))
-	if err != nil {
-		t.Fatalf("creating interior netns: %v", err)
-	}
-	svc := NewService(podUID, chBin, "", true, interiorNetNS, actorlog.NewActorLogger(actorlog.NewSyncedWriter(os.Stdout), false))
-	ctx, cancel := context.WithTimeout(context.Background(), 300*time.Second)
-	defer cancel()
-	t.Cleanup(func() {
-		cctx, c := context.WithTimeout(context.Background(), 20*time.Second)
-		svc.teardownActor(cctx, id, svc.running[id], nil)
-		c()
-		_ = os.RemoveAll(ateompath.ActorPath(ns, name, id))
-		_ = os.RemoveAll(kata.VMDir(id))
-		_ = interiorNetNS.Close()
-		_ = netns.DeleteNamed(ateompath.AteomNetNSName(podUID))
-	})
-
-	assets := map[string]string{assetKernel: kernel, assetImage: image, assetConfig: cfg, assetCH: chBin}
-	runReq := &ateompb.RunWorkloadRequest{
-		ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id,
-		Spec:              &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}},
-		RuntimeAssetPaths: assets,
-	}
-	restoreReq := &ateompb.RestoreWorkloadRequest{
-		ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id,
-		Spec:              &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}},
-		RuntimeAssetPaths: assets,
-	}
-	vsock := kata.VsockSocketPath(id)
-	const ramSentinel = "RAM_GOLDEN_OK_7777"
-	rootfsDir := "/run/kata-containers/" + id + "/rootfs"
-	const diskSentinel = "DISK_WRITE_SHOULD_RESET_9999"
-
-	// --- Golden: run, plant an in-RAM sentinel, checkpoint (saves golden snapshot
-	// + golden disk template), tear down. ---
-	if _, err := svc.RunWorkload(ctx, runReq); err != nil {
-		t.Fatalf("RunWorkload: %v", err)
-	}
-	_ = kata.DebugConsoleDump(ctx, vsock, "echo "+ramSentinel+" > /run/ram-sentinel; sync")
-	if _, err := svc.CheckpointWorkload(ctx, &ateompb.CheckpointWorkloadRequest{
-		ActorTemplateNamespace: ns, ActorTemplateName: name, ActorId: id,
-		Spec: &ateompb.WorkloadSpec{Containers: []*ateompb.Container{{Name: container}}},
-	}); err != nil {
-		t.Fatalf("CheckpointWorkload: %v", err)
-	}
-	// golden disk template must have been saved.
-	if _, err := os.Stat(filepath.Join(ateompath.ActorPath(ns, name, id), "golden-rootfs.ext4")); err != nil {
-		t.Fatalf("golden rootfs template not saved: %v", err)
-	}
-	checkpointDir := ateompath.CheckpointStateDir(ns, name, id)
-	restoreDir := ateompath.RestoreStateDir(ns, name, id)
-	if err := os.MkdirAll(restoreDir, 0o700); err != nil {
-		t.Fatal(err)
-	}
-	if out, err := exec.Command("cp", "-a", checkpointDir+"/.", restoreDir+"/").CombinedOutput(); err != nil {
-		t.Fatalf("shipping snapshot: %v: %s", err, out)
-	}
-	t.Log("golden checkpoint OK (snapshot + golden disk template saved)")
-
-	// --- Restore #1: disk reset from golden template; write a disk sentinel at
-	// runtime, confirm it lands, then tear down (discard). ---
-	if _, err := svc.RestoreWorkload(ctx, restoreReq); err != nil {
-		t.Fatalf("RestoreWorkload #1: %v", err)
-	}
-	if got := kata.DebugConsoleDump(ctx, vsock, "cat /run/ram-sentinel"); !strings.Contains(got, ramSentinel) {
-		t.Fatalf("restore#1 RAM continuity failed: %q", got)
-	}
-	_ = kata.DebugConsoleDump(ctx, vsock, "echo "+diskSentinel+" > "+rootfsDir+"/disk-sentinel; sync")
-	if got := kata.DebugConsoleDump(ctx, vsock, "cat "+rootfsDir+"/disk-sentinel"); !strings.Contains(got, diskSentinel) {
-		t.Fatalf("restore#1 disk sentinel did not land: %q", got)
-	}
-	t.Log("restore#1 OK: RAM sentinel present, disk sentinel written")
-	tdCtx, tdCancel := context.WithTimeout(ctx, 20*time.Second)
-	svc.teardownActor(tdCtx, id, svc.running[id], ch.NewClient(filepath.Join(kata.VMDir(id), "clh-api-restore.sock")))
-	tdCancel()
-	delete(svc.running, id)
-
-	// --- Restore #2: disk reset AGAIN from golden template — the disk sentinel
-	// from restore#1 must be GONE, while the golden RAM sentinel still survives. ---
-	if _, err := svc.RestoreWorkload(ctx, restoreReq); err != nil {
-		t.Fatalf("RestoreWorkload #2: %v", err)
-	}
-	if got := kata.DebugConsoleDump(ctx, vsock, "cat /run/ram-sentinel"); !strings.Contains(got, ramSentinel) {
-		t.Fatalf("restore#2 RAM continuity failed: %q", got)
-	}
-	got := kata.DebugConsoleDump(ctx, vsock, "cat "+rootfsDir+"/disk-sentinel 2>&1; echo END")
-	if strings.Contains(got, diskSentinel) {
-		t.Fatalf("reset-to-golden FAILED: disk sentinel persisted after restore#2: %q", got)
-	}
-	t.Logf("reset-to-golden OK: discarded the rootfs write (disk sentinel gone) while RAM continuity held: %q", strings.TrimSpace(got))
-}
-
-func lastLines(s string, n int) string {
-	lines := strings.Split(strings.TrimRight(s, "\n"), "\n")
-	if len(lines) > n {
-		lines = lines[len(lines)-n:]
-	}
-	return strings.Join(lines, "\n") + "\n"
-}
-
-func envOrTest(key, def string) string {
-	if v := os.Getenv(key); v != "" {
-		return v
-	}
-	return def
-}
-
-// writeMinimalGvisorStyleSpec writes a deliberately minimal OCI spec (no
-// linux.resources / cgroupsPath) so the test exercises ensureKataCompatibleSpec.
-func writeMinimalGvisorStyleSpec(t *testing.T, bundle string) {
-	t.Helper()
-	spec := map[string]any{
-		"ociVersion": "1.0.2",
-		"process": map[string]any{
-			"user": map[string]any{"uid": 0, "gid": 0},
-			"args": []string{"sleep", "3600"},
-			"env":  []string{"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"},
-			"cwd":  "/",
-			"capabilities": map[string]any{
-				"bounding":  []string{"CAP_KILL", "CAP_AUDIT_WRITE", "CAP_NET_BIND_SERVICE"},
-				"effective": []string{"CAP_KILL", "CAP_AUDIT_WRITE", "CAP_NET_BIND_SERVICE"},
-				"permitted": []string{"CAP_KILL", "CAP_AUDIT_WRITE", "CAP_NET_BIND_SERVICE"},
-			},
-		},
-		"root":     map[string]any{"path": "rootfs", "readonly": false},
-		"hostname": "ateomchv",
-		"mounts": []map[string]any{
-			{"destination": "/proc", "type": "proc", "source": "proc"},
-			{"destination": "/dev", "type": "tmpfs", "source": "tmpfs"},
-			{"destination": "/sys", "type": "sysfs", "source": "sysfs", "options": []string{"nosuid", "noexec", "nodev", "ro"}},
-		},
-		"linux": map[string]any{
-			"namespaces": []map[string]any{
-				{"type": "pid"}, {"type": "network"}, {"type": "ipc"}, {"type": "uts"}, {"type": "mount"},
-			},
-		},
-	}
-	b, err := json.MarshalIndent(spec, "", "  ")
-	if err != nil {
-		t.Fatal(err)
-	}
-	if err := os.WriteFile(filepath.Join(bundle, "config.json"), b, 0o600); err != nil {
-		t.Fatal(err)
-	}
-}

From 59544fb19a3a4354b3345275520ee519499e0062 Mon Sep 17 00:00:00 2001
From: Benjamin Elder <bentheelder@google.com>
Date: Thu, 25 Jun 2026 18:38:37 -0700
Subject: [PATCH 4/5] demos,hack,manifests,docs: ship virtiofsd + kata 3.32
 assets for the overlay rootfs

The overlay rootfs serves the image over virtio-fs, so the asset set gains virtiofsd
and moves to kata 3.32. virtiofsd is built from a pinned source commit because the
vhost-0.16 snapshot/restore fix isn't in a release tag yet (tracking:
https://gitlab.com/virtio-fs/virtiofsd/-/work_items/236). assemble.sh builds it and
the stagers upload it to rustfs (kind) and GCS (GKE); the counter-microvm SandboxConfig
lists the virtiofsd asset for arm64 + amd64, and the sandboxconfig-assets VAP (with its
envtest) now requires virtiofsd for every microvm architecture.

The overlay formats nothing on the host, so it runs on the committed debian:stable-slim
worker base: drop the custom worker base (hack/ateom-base) and its use in
run-microvm-demo.sh. Update the asset README and architecture doc for the overlay.
---
 .ko.yaml                                      | 10 +--
 demos/counter/counter-microvm.yaml.tmpl       | 43 +++++++----
 docs/architecture.md                          |  4 +-
 hack/ateom-base/Dockerfile                    | 22 ------
 hack/microvm-assets/README.md                 | 15 ++--
 hack/microvm-assets/assemble.sh               | 52 ++++++++++---
 hack/microvm-assets/stage-to-gcs.sh           |  2 +-
 hack/microvm-assets/stage-to-rustfs.sh        |  2 +-
 hack/run-microvm-demo.sh                      | 74 +++++--------------
 .../ate-install/sandboxconfig-validation.yaml |  4 +-
 .../v1alpha1/sandboxconfig_validation_test.go | 15 +++-
 11 files changed, 120 insertions(+), 123 deletions(-)
 delete mode 100644 hack/ateom-base/Dockerfile

diff --git a/.ko.yaml b/.ko.yaml
index 8f61d60ea..4afb2e7e1 100644
--- a/.ko.yaml
+++ b/.ko.yaml
@@ -21,11 +21,7 @@ defaultPlatforms:
 baseImageOverrides:
   github.com/agent-substrate/substrate/demos/sandbox: alpine
   github.com/agent-substrate/substrate/demos/agent-secret: alpine
-  # ateom-microvm owns the cloud-hypervisor boot and builds the actor's writable
-  # virtio-blk rootfs at runtime, which needs mkfs.ext4 (e2fsprogs) plus glibc for
-  # the fetched cloud-hypervisor binary. The committed debian:stable-slim base has
-  # glibc + coreutils but NOT mkfs.ext4, so this default cannot build the rootfs on
-  # its own. hack/run-microvm-demo.sh builds hack/ateom-base (debian-slim +
-  # e2fsprogs) and overrides this base at build time via KO_CONFIG_PATH, so running
-  # the demo never edits this file. The committed default stays debian:stable-slim.
+  # ateom-microvm needs glibc (for the fetched cloud-hypervisor binary) and mount/umount
+  # (to bind the image into the virtiofsd shared dir) — both in debian:stable-slim but
+  # not in the distroless static default.
   github.com/agent-substrate/substrate/cmd/ateom-microvm: debian:stable-slim
diff --git a/demos/counter/counter-microvm.yaml.tmpl b/demos/counter/counter-microvm.yaml.tmpl
index e9f4e31e2..ee8bf6a85 100644
--- a/demos/counter/counter-microvm.yaml.tmpl
+++ b/demos/counter/counter-microvm.yaml.tmpl
@@ -16,15 +16,18 @@
 # in-RAM (atomic uint64), so a successful suspend/resume across pods shows the
 # count continuing — proving the guest memory snapshot round-tripped.
 #
-# The sandbox binaries (cloud-hypervisor, guest kernel, guest rootfs, base
-# configuration.toml) live on a cluster-scoped SandboxConfig, FETCHED at runtime
+# The sandbox binaries (cloud-hypervisor, virtiofsd, guest kernel, guest rootfs,
+# base configuration.toml) live on a cluster-scoped SandboxConfig, FETCHED at runtime
 # from the cluster object store bucket ${BUCKET_NAME} under kata-assets/ (rustfs on
-# kind, GCS on GKE) — NOT baked into the worker image. ateom owns the
-# cloud-hypervisor boot itself and gives the actor a writable virtio-blk rootfs, so
-# neither the kata shim nor virtiofsd is needed. The per-arch sha256 values below
-# are the asset sets produced by hack/microvm-assets/assemble.sh; atelet selects
-# the block matching the node's architecture, and each cluster's bucket holds that
-# arch's binaries at these paths (staged by run-microvm-demo[-kind].sh).
+# kind, GCS on GKE) — NOT baked into the worker image. ateom boots cloud-hypervisor
+# itself and gives the actor an overlay rootfs (virtio-fs RO lower + guest-tmpfs
+# upper), so it fetches virtiofsd (built from a pinned upstream commit — vhost 0.16;
+# the snapshot/restore fix is not in a release tag yet). The kata containerd shim is
+# NOT fetched (ateom drives the kata-agent directly). kata assets are 3.32.0.
+# The per-arch sha256 values below are the asset sets produced by
+# hack/microvm-assets/assemble.sh; atelet selects the block matching the node's
+# architecture, and each cluster's bucket holds that arch's binaries at these paths
+# (staged by run-microvm-demo[-kind].sh).
 
 apiVersion: v1
 kind: Namespace
@@ -44,25 +47,39 @@ spec:
       cloud-hypervisor:
         url: "gs://${BUCKET_NAME}/kata-assets/cloud-hypervisor"
         sha256: "bf004ddc1a148f47caa87ac49a783b8dbd6bf9bc27abe522ed197df7b982d3b1"
+      # virtiofsd serves the overlay RO lower (virtio-fs); built from a pinned source
+      # commit (vhost 0.16 — the snapshot/restore fix is not in a release tag yet).
+      virtiofsd:
+        url: "gs://${BUCKET_NAME}/kata-assets/virtiofsd"
+        # virtiofsd is built from source (pinned commit in assemble.sh), so its binary
+        # is not byte-reproducible across toolchains/arches and can't carry a fixed pin.
+        # run-microvm-demo.sh computes this from the freshly-staged binary at deploy.
+        sha256: "${VIRTIOFSD_SHA256}"
       kata-kernel:
         url: "gs://${BUCKET_NAME}/kata-assets/vmlinux"
-        sha256: "a44d663f4ddad20a35527a3578fadef9beb23c1e5cb720e85d6928d6de70d3a1"
+        sha256: "f437320bab94f19105d12b932aa29735f0d54d2588218872254367f312c1027c"
       kata-image:
         url: "gs://${BUCKET_NAME}/kata-assets/rootfs.img"
-        sha256: "7ebd652760c881374c0a761d34addcb76d9a650e35c10c01b780ebcdd9a1f2aa"
+        sha256: "31ffb41177571c5654d3a28a2728eaac9d6d3daed90bb993f64e0b4b3ca6b235"
       kata-config:
         url: "gs://${BUCKET_NAME}/kata-assets/configuration-clh.toml"
-        sha256: "df504d9be0ed01765fdc8a9467955e1e671eb97724443f65a524bf914ccb818b"
+        sha256: "8a09a40543a527dbdc3ff26d229bae0de9aebb655475c28d7e5482dbedefa030"
+    # amd64 assets are kata 3.32 + virtiofsd (assemble.sh ARCH=amd64), matching arm64.
     amd64:
       cloud-hypervisor:
         url: "gs://${BUCKET_NAME}/kata-assets/cloud-hypervisor"
         sha256: "829af01ff075bb96c4f183905134c453a88d68cbabdc6b87df21098842581ee9"
+      virtiofsd:
+        url: "gs://${BUCKET_NAME}/kata-assets/virtiofsd"
+        # Built from source (see the arm64 note above). run-microvm-demo.sh injects one sha,
+        # computed from the staged binary for the target ARCH, into both arch blocks.
+        sha256: "${VIRTIOFSD_SHA256}"
       kata-kernel:
         url: "gs://${BUCKET_NAME}/kata-assets/vmlinux"
-        sha256: "a5f0af5fe536cd52c3ca214d15d81c577e5c5dc672947ab7980b91ddcb7c9d71"
+        sha256: "43701715ae2885f936bbe5c66a2de7c14dc51de7d19412d04833e4bbcf205bd0"
       kata-image:
         url: "gs://${BUCKET_NAME}/kata-assets/rootfs.img"
-        sha256: "ca9e06621b7edd2e056607c04db8bcebd92ad37ad4e37d18b8247d851feb0fae"
+        sha256: "e9548ff64f51c120791d3a2d1a81ebfd275df2bf0737368bd3e6381a6e967855"
       kata-config:
         url: "gs://${BUCKET_NAME}/kata-assets/configuration-clh.toml"
         sha256: "8cce580e5abf78c05c8e9b929c24a524b9a81fc47be4e2e4f38dcae5ef052be6"
diff --git a/docs/architecture.md b/docs/architecture.md
index 1e44dc84c..a00419f57 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -336,9 +336,9 @@ The node-level subsystem manages the physical execution of sandboxes and the mov
 
 A `WorkerPool` selects a **sandbox class** (`spec.sandboxClass`), and each class has a matching `ateom` herder image. The sandbox binaries themselves are not baked into the worker image — they are fetched at runtime from a cluster-scoped [`SandboxConfig`](api-guide.md#3-sandboxconfig-sandbox-binaries) and pinned into each snapshot's manifest so restores stay reproducible across runtime upgrades.
 
-  * **gVisor** (`ateom-gvisor`, the default): runs the workload under `runsc`. Suspend/resume uses gVisor's checkpoint/restore of the sandboxed process tree.
+  * **gVisor** (`ateom-gvisor`, the default): Runs the workload under `runsc` for kernel-level sandboxing. Suspend and resume leverage gVisor's native checkpoint/restore of the sandboxed process tree.
 
-  * **micro-VM** (`ateom-microvm`): runs the workload inside a [Kata Containers](https://katacontainers.io/) guest (Kata 3.31 guest assets) on the [Cloud Hypervisor](https://www.cloudhypervisor.org/) VMM. `ateom` owns the Cloud Hypervisor boot directly — there is **no Kata shim and no containerd daemon**: it launches Cloud Hypervisor, boots the guest kernel + OS image, and then drives the Kata agent over its hybrid-vsock ttrpc API itself (creating the sandbox, configuring guest networking, and starting the container). The actor's container rootfs is a writable boot-time virtio-blk disk (`/dev/vdb`) that `ateom` builds with `mkfs.ext4` from the OCI bundle, so rootfs writes land off guest RAM on a host-backed disk. Suspend captures a Cloud Hypervisor **memory-only snapshot** of the running guest (no memory balloon); resume relaunches Cloud Hypervisor with its **OnDemand** (userfaultfd) memory restore — demand-paging from the snapshot while a diff-merge folds newly-faulted pages back in to keep the snapshot complete — so full in-RAM state comes back on any worker, including a different node. On each restore `/dev/vdb` is recreated byte-identical to the golden image (**reset-to-golden**), so rootfs writes are discarded across suspend/resume (matching gVisor's semantics) while in-RAM state persists. The actor container's stdout/stderr is forwarded to the pod log with `ate.dev/*` labels (parity with `ateom-gvisor`). Micro-VM workers require `/dev/kvm` and nested-virtualization-capable nodes; the controller adds the KVM device mount and pins these pods to nodes labeled `ate.dev/sandboxClass=microvm`. See [`hack/microvm-assets/`](../hack/microvm-assets/) for assembling the asset set.
+  * **micro-VM** (`ateom-microvm`): Runs the workload inside a [Kata Containers](https://katacontainers.io/) guest on the [Cloud Hypervisor](https://www.cloudhypervisor.org/) VMM. Suspend captures a memory-only VM snapshot; resume restores it on demand using `userfaultfd` demand-paging. Container rootfs writes go to a `tmpfs` overlay upper in guest RAM, so the snapshot captures them as well.
 
 ### Networking Stack (`atenet` + Envoy)
 
diff --git a/hack/ateom-base/Dockerfile b/hack/ateom-base/Dockerfile
deleted file mode 100644
index 9ac88ca0d..000000000
--- a/hack/ateom-base/Dockerfile
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright 2026 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-FROM debian:stable-slim
-# e2fsprogs provides mkfs.ext4, which the ateom-microvm worker uses to build the
-# actor's writable /dev/vdb ext4 rootfs from the OCI bundle. debian-slim also
-# provides coreutils (cp --sparse) and glibc for the fetched cloud-hypervisor binary.
-RUN apt-get update \
- && apt-get install -y --no-install-recommends e2fsprogs \
- && apt-get clean \
- && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
diff --git a/hack/microvm-assets/README.md b/hack/microvm-assets/README.md
index 5a43e095c..3f21a73a8 100644
--- a/hack/microvm-assets/README.md
+++ b/hack/microvm-assets/README.md
@@ -1,11 +1,14 @@
 # Micro-VM runtime assets + counter demo (kind, fetch-not-bake)
 
 The `microvm` runtime (`cmd/ateom-microvm`, kata + cloud-hypervisor) fetches its
-toolchain at runtime — nothing kata-specific is baked into the worker image. ateom owns
-the cloud-hypervisor boot and gives the actor a writable virtio-blk rootfs, so neither the
-kata shim nor virtiofsd is needed. The asset set is just four files:
+toolchain at runtime — nothing kata-specific is baked into the worker image. ateom drives
+the kata-agent directly (no kata shim, no containerd). Each actor container's rootfs is an
+overlay of a read-only lower (the OCI image, served into the guest over virtio-fs by
+`virtiofsd`) and a writable upper on a guest tmpfs, so `virtiofsd` is part of the asset
+set. The asset set is five files:
 
 - `cloud-hypervisor` — the VMM binary (fetched from its release)
+- `virtiofsd` — the virtio-fs daemon serving the RO lower (built from source; see `assemble.sh`)
 - `vmlinux` — the guest kernel (from kata-static)
 - `rootfs.img` — the guest rootfs image (from kata-static)
 - `configuration-clh.toml` — the base kata config (from kata-static)
@@ -16,9 +19,9 @@ available, `hack/create-kind-cluster.sh` mounts it into the node and labels the
 `ate.dev/sandboxClass=microvm`.
 
 > [!TIP]
-> `hack/run-microvm-demo.sh` automates the full bring-up below (ateom-base image, ko base
-> override, assets, control plane, demo apply) for kind OR GKE without editing committed
-> files. The steps here are the manual equivalent.
+> `hack/run-microvm-demo.sh` automates the full bring-up below (assets, control plane,
+> demo apply) for kind OR GKE without editing committed files. The steps here are the
+> manual equivalent.
 
 ## Steps (run on a KVM-capable Linux host matching the node arch)
 
diff --git a/hack/microvm-assets/assemble.sh b/hack/microvm-assets/assemble.sh
index e12a4d1b3..0b71469fe 100755
--- a/hack/microvm-assets/assemble.sh
+++ b/hack/microvm-assets/assemble.sh
@@ -18,14 +18,22 @@
 # ateom-microvm fetches at runtime (fetch-not-bake). Run this on a Linux
 # host of the TARGET arch.
 #
-# Produces, under $OUT, the four assets named as the SandboxConfig expects, plus
-# their sha256 sums (paste into demos/counter/counter-microvm.yaml.tmpl):
-#   cloud-hypervisor  vmlinux  rootfs.img  configuration-clh.toml
+# Produces, under $OUT, the five assets named as the SandboxConfig expects:
+#   cloud-hypervisor  virtiofsd  vmlinux  rootfs.img  configuration-clh.toml
+# The four DOWNLOADED assets are reproducible, so paste their sha256 sums into the
+# manifest (demos/counter/counter-microvm.yaml.tmpl). virtiofsd is built from source
+# (non-reproducible bytes), so its sha is NOT pinned there — run-microvm-demo.sh
+# computes it from the staged binary and injects it at deploy.
 #
-# ateom owns the cloud-hypervisor boot and gives the actor a writable virtio-blk
-# rootfs, so neither the kata shim nor virtiofsd is part of the asset set.
+# ateom drives the kata-agent directly (the kata containerd shim is NOT an asset). The
+# actor rootfs is overlay(virtio-fs RO lower + guest-tmpfs upper), so virtiofsd IS an
+# asset; it is built from source (pinned commit, see VIRTIOFSD_COMMIT below) because the
+# vhost-0.16 snapshot/restore fix (REPLY_ACK) is not in a release tag yet — the
+# kata-bundled v1.13.3 virtiofsd hangs CH's restore handshake. Tracking issue:
+# https://gitlab.com/virtio-fs/virtiofsd/-/work_items/236 — switch to a release once it
+# lands. Building it needs rust (rustup) + libcap-ng-dev libseccomp-dev pkg-config.
 #
-# Env: ARCH (arm64|amd64, default arm64), KATA_VER (3.31.0), CH_VER (v52.0),
+# Env: ARCH (arm64|amd64, default arm64), KATA_VER (3.32.0), CH_VER (v52.0),
 #      OUT (default ./bin/microvm-assets/$ARCH, under the gitignored bin/).
 
 set -o errexit -o nounset -o pipefail
@@ -33,7 +41,7 @@ set -o errexit -o nounset -o pipefail
 ROOT="$(git rev-parse --show-toplevel)"
 
 ARCH="${ARCH:-arm64}"
-KATA_VER="${KATA_VER:-3.31.0}"
+KATA_VER="${KATA_VER:-3.32.0}"
 CH_VER="${CH_VER:-v52.0}"
 OUT="${OUT:-${ROOT}/bin/microvm-assets/$ARCH}"
 WORK="$(mktemp -d)"
@@ -64,12 +72,36 @@ curl -fSL -o "${OUT}/cloud-hypervisor" \
   "https://github.com/cloud-hypervisor/cloud-hypervisor/releases/download/${CH_VER}/${CH_ASSET}"
 chmod +x "${OUT}/cloud-hypervisor"
 
+# virtiofsd pinned commit. The vhost-0.16 / vhost-user-backend-0.22 snapshot-restore
+# fix (REPLY_ACK) is upstream but not in a release tag yet (tracking issue
+# https://gitlab.com/virtio-fs/virtiofsd/-/work_items/236) — the kata-bundled v1.13.3
+# (old vhost) hangs CH's restore handshake. Pin a known-good commit until a release
+# carries the fix.
+VIRTIOFSD_COMMIT="acb3d506a9f1b256fff7327023df85570caf1e75"
+echo ">> Building virtiofsd @ ${VIRTIOFSD_COMMIT} (vhost 0.16)..."
+# Build deps (Debian): apt-get install -y git libcap-ng-dev libseccomp-dev pkg-config; rust via rustup.
+if ! command -v cargo >/dev/null 2>&1; then
+  echo "cargo not found; install rust (rustup) + libcap-ng-dev libseccomp-dev pkg-config" >&2
+  exit 1
+fi
+git clone https://gitlab.com/virtio-fs/virtiofsd.git
+(
+  cd virtiofsd
+  git checkout --quiet "${VIRTIOFSD_COMMIT}"
+  grep -E '^(vhost|vhost-user-backend) =' Cargo.toml   # expect vhost 0.16 / backend 0.22
+  cargo build --release
+)
+cp "virtiofsd/target/release/virtiofsd" "${OUT}/virtiofsd"
+chmod +x "${OUT}/virtiofsd"
+
 echo
 echo ">> Assets assembled in ${OUT}:"
 cd "${OUT}"
-for f in cloud-hypervisor vmlinux rootfs.img configuration-clh.toml; do
+for f in cloud-hypervisor virtiofsd vmlinux rootfs.img configuration-clh.toml; do
   [ -f "$f" ] || { echo "MISSING: $f" >&2; exit 1; }
 done
+"${OUT}/virtiofsd" --version 2>/dev/null | head -1 || true
 echo
-echo ">> sha256 (paste into demos/counter/counter-microvm.yaml.tmpl runtime.assets):"
-sha256sum cloud-hypervisor vmlinux rootfs.img configuration-clh.toml
+echo ">> sha256 (paste the DOWNLOADED assets into counter-microvm.yaml.tmpl;"
+echo ">> virtiofsd's sha is injected at deploy by run-microvm-demo.sh, not pinned):"
+sha256sum cloud-hypervisor virtiofsd vmlinux rootfs.img configuration-clh.toml
diff --git a/hack/microvm-assets/stage-to-gcs.sh b/hack/microvm-assets/stage-to-gcs.sh
index 6ec9e5c12..cab3bdce0 100755
--- a/hack/microvm-assets/stage-to-gcs.sh
+++ b/hack/microvm-assets/stage-to-gcs.sh
@@ -33,7 +33,7 @@ BUCKET="${BUCKET:-ate-snapshots}"
 # gcloud uses its active config project. ${PROJECT_ID:+...} elides the flag entirely
 # when unset (same idiom as KUBECTL_CONTEXT in hack/run-microvm-demo.sh).
 echo ">> Uploading assets to gs://${BUCKET}/kata-assets/ ..."
-for f in cloud-hypervisor vmlinux rootfs.img configuration-clh.toml; do
+for f in cloud-hypervisor virtiofsd vmlinux rootfs.img configuration-clh.toml; do
   echo "   $f"
   gcloud storage cp ${PROJECT_ID:+--project="${PROJECT_ID}"} "${OUT}/${f}" "gs://${BUCKET}/kata-assets/${f}"
 done
diff --git a/hack/microvm-assets/stage-to-rustfs.sh b/hack/microvm-assets/stage-to-rustfs.sh
index 3f705dcfc..48875b656 100755
--- a/hack/microvm-assets/stage-to-rustfs.sh
+++ b/hack/microvm-assets/stage-to-rustfs.sh
@@ -48,7 +48,7 @@ sleep 3
 
 ENDPOINT="http://localhost:9000"
 echo ">> Uploading assets to s3://${BUCKET}/kata-assets/ via ${ENDPOINT}..."
-for f in cloud-hypervisor vmlinux rootfs.img configuration-clh.toml; do
+for f in cloud-hypervisor virtiofsd vmlinux rootfs.img configuration-clh.toml; do
   echo "   $f"
   aws --endpoint-url "${ENDPOINT}" s3 cp "${OUT}/${f}" "s3://${BUCKET}/kata-assets/${f}"
 done
diff --git a/hack/run-microvm-demo.sh b/hack/run-microvm-demo.sh
index 2ad3009c1..fe5b19eac 100755
--- a/hack/run-microvm-demo.sh
+++ b/hack/run-microvm-demo.sh
@@ -21,12 +21,6 @@
 # Like the other hack scripts, this sources .ate-dev-env.sh for the cluster /
 # registry / bucket settings unless NO_DEV_ENV is set.
 #
-# The committed .ko.yaml base for cmd/ateom-microvm is debian:stable-slim, which
-# lacks mkfs.ext4 (e2fsprogs). The worker needs mkfs.ext4 at runtime to build the
-# actor's virtio-blk rootfs, so this script builds hack/ateom-base (debian-slim +
-# e2fsprogs) and overrides ONLY that base at build time via a throwaway ko config
-# pointed at by KO_CONFIG_PATH — the committed .ko.yaml is never touched.
-#
 # Env (most come from .ate-dev-env.sh):
 #   KO_DOCKER_REPO   (required) image registry, e.g. gcr.io/PROJECT/ate-images for
 #                    GKE or localhost:5001 for kind.
@@ -34,7 +28,6 @@
 #   KUBECTL_CONTEXT  (optional) kube context; threaded into install + ko apply + kubectl.
 #   PROJECT_ID       (optional) GCP project for the GCS asset upload (GKE path).
 #   ARCH             target arch (default: from KO_DEFAULTPLATFORMS, else host arch).
-#   ATEOM_BASE_TAG   tag for the built ateom-base image (default: e2fsprogs).
 #   OUT              asset dir (default: $PWD/bin/microvm-assets/$ARCH, gitignored).
 #   ATE_INSTALL_KIND "true" for the kind path (stage assets to rustfs + install-ate-kind.sh);
 #                    default false uploads assets to GCS + uses install-ate.sh.
@@ -54,7 +47,6 @@ fi
 KO_DOCKER_REPO="${KO_DOCKER_REPO:-}"
 KUBECTL_CONTEXT="${KUBECTL_CONTEXT:-}"
 BUCKET_NAME="${BUCKET_NAME:-ate-snapshots}"
-ATEOM_BASE_TAG="${ATEOM_BASE_TAG:-e2fsprogs}"
 ATE_INSTALL_KIND="${ATE_INSTALL_KIND:-false}"
 
 # Target arch: match the images' platform (KO_DEFAULTPLATFORMS is set by
@@ -82,47 +74,9 @@ log() {
   echo -e "${COLOR_CYAN}[run-microvm-demo]: $*${COLOR_RESET}"
 }
 
-ATEOM_BASE_IMAGE="${KO_DOCKER_REPO}/ateom-base:${ATEOM_BASE_TAG}"
-
-# --- 2. build + push ateom-base (debian-slim + e2fsprogs) for the target arch -
-# We build with buildx --load (import into the local docker daemon) and then
-# `docker push`, NOT buildx --push: the buildkit builder runs in its own container
-# and cannot reach a localhost/kind registry, whereas the docker daemon can. --load
-# imports a single-platform image fine even when ARCH != the host arch. For a real
-# remote registry (e.g. gcr.io) the same daemon `docker push` works with its creds.
-log "Building ateom-base ${ATEOM_BASE_IMAGE} (linux/${ARCH})..."
-if docker buildx version >/dev/null 2>&1; then
-  log "  using: docker buildx build --load + docker push"
-  docker buildx build --platform "linux/${ARCH}" -t "${ATEOM_BASE_IMAGE}" --load hack/ateom-base
-else
-  log "  using: docker build + docker push (buildx unavailable)"
-  docker build -t "${ATEOM_BASE_IMAGE}" hack/ateom-base
-fi
-docker push "${ATEOM_BASE_IMAGE}"
-
-# --- 3. throwaway ko config overriding ONLY the ateom-microvm base -----------
-# KO_CONFIG_PATH points at a FILE that ko parses by extension, so it MUST end in
-# .yaml (a bare mktemp file is rejected: "Unsupported Config Type"). Use a temp dir
-# with a .yaml-named copy of the repo .ko.yaml and swap the one base line.
-KO_CONFIG_DIR="$(mktemp -d)"
-KO_CONFIG_TMP="${KO_CONFIG_DIR}/ko-override.yaml"
-trap 'rm -rf "${KO_CONFIG_DIR}"' EXIT
-cp "${ROOT}/.ko.yaml" "${KO_CONFIG_TMP}"
-
-OVERRIDE_KEY="github.com/agent-substrate/substrate/cmd/ateom-microvm"
-if ! grep -q "^  ${OVERRIDE_KEY}:" "${KO_CONFIG_TMP}"; then
-  echo "Error: could not find the cmd/ateom-microvm baseImageOverride line in .ko.yaml" >&2
-  exit 1
-fi
-# Replace only the value after the key (use | as the sed delimiter; the value has /).
-sed -i.bak "s|^  ${OVERRIDE_KEY}:.*|  ${OVERRIDE_KEY}: ${ATEOM_BASE_IMAGE}|" "${KO_CONFIG_TMP}"
-rm -f "${KO_CONFIG_TMP}.bak"
-export KO_CONFIG_PATH="${KO_CONFIG_TMP}"
-log "Using throwaway KO_CONFIG_PATH=${KO_CONFIG_PATH} (ateom-microvm base -> ${ATEOM_BASE_IMAGE})"
-
-# --- 4. assets: assemble (if missing) then stage to rustfs (kind) / GCS (GKE) --
+# --- 2. assets: assemble (if missing) then stage to rustfs (kind) / GCS (GKE) --
 need_assemble=false
-for f in cloud-hypervisor vmlinux rootfs.img configuration-clh.toml; do
+for f in cloud-hypervisor virtiofsd vmlinux rootfs.img configuration-clh.toml; do
   if [[ ! -f "${OUT}/${f}" ]]; then
     need_assemble=true
     break
@@ -135,7 +89,7 @@ else
   log "Assets already present in ${OUT}; skipping assemble."
 fi
 
-# Upload the four assets under kata-assets/, where atelet fetches them: the
+# Upload the five assets under kata-assets/, where atelet fetches them: the
 # in-cluster rustfs (port-forwarded, S3 API) on kind, or the GCS bucket on GKE.
 if [[ "${ATE_INSTALL_KIND}" == "true" ]]; then
   log "Staging assets to in-cluster rustfs bucket ${BUCKET_NAME} (kata-assets/)..."
@@ -145,7 +99,7 @@ else
   OUT="${OUT}" BUCKET="${BUCKET_NAME}" hack/microvm-assets/stage-to-gcs.sh
 fi
 
-# --- 5. deploy the control plane --------------------------------------------
+# --- 3. deploy the control plane --------------------------------------------
 log "Deploying the ate control plane (--deploy-ate-system)..."
 if [[ "${ATE_INSTALL_KIND}" == "true" ]]; then
   # install-ate-kind.sh sets NO_DEV_ENV/KO_DOCKER_REPO/ARCH/ATE_INSTALL_KIND itself.
@@ -155,15 +109,23 @@ else
   KUBECTL_CONTEXT="${KUBECTL_CONTEXT}" hack/install-ate.sh --deploy-ate-system
 fi
 
-# --- 6. apply the demo ------------------------------------------------------
-# Use ./hack/run-tool.sh ko so ko honors KO_CONFIG_PATH + KO_DOCKER_REPO. Only
-# ko apply/create/delete/run accept args after `--`; thread --context there
-# (mirrors the run_ko helper in hack/install-ate.sh).
+# --- 4. apply the demo ------------------------------------------------------
+# Use ./hack/run-tool.sh ko so ko honors KO_DOCKER_REPO (the committed .ko.yaml base
+# is used as-is — no override). Only ko apply/create/delete/run accept args after
+# `--`; thread --context there (mirrors the run_ko helper in hack/install-ate.sh).
 log "Applying the counter-microvm demo manifest..."
-sed "s|\${BUCKET_NAME}|${BUCKET_NAME}|g" demos/counter/counter-microvm.yaml.tmpl \
+# virtiofsd is built from source (pinned commit in assemble.sh), so its binary bytes
+# are not reproducible across toolchains/arches and its sha can't be a fixed pin in the
+# manifest. Compute it from the freshly-staged binary and inject it, so the deployed
+# SandboxConfig always matches whatever was staged. The downloaded assets
+# (cloud-hypervisor/kernel/rootfs/config) keep their committed, reproducible per-arch shas.
+VIRTIOFSD_SHA256="$(sha256sum "${OUT}/virtiofsd" | awk '{print $1}')"
+sed -e "s|\${BUCKET_NAME}|${BUCKET_NAME}|g" \
+    -e "s|\${VIRTIOFSD_SHA256}|${VIRTIOFSD_SHA256}|g" \
+    demos/counter/counter-microvm.yaml.tmpl \
   | ./hack/run-tool.sh ko apply -f - ${KUBECTL_CONTEXT:+-- --context="${KUBECTL_CONTEXT}"}
 
-# --- 7. next steps ----------------------------------------------------------
+# --- 5. next steps ----------------------------------------------------------
 KCTX_FLAG=""
 if [[ -n "${KUBECTL_CONTEXT}" ]]; then
   KCTX_FLAG=" --context=${KUBECTL_CONTEXT}"
diff --git a/manifests/ate-install/sandboxconfig-validation.yaml b/manifests/ate-install/sandboxconfig-validation.yaml
index 7e96857f8..d350d10b4 100644
--- a/manifests/ate-install/sandboxconfig-validation.yaml
+++ b/manifests/ate-install/sandboxconfig-validation.yaml
@@ -41,9 +41,9 @@ spec:
       object.spec.sandboxClass != 'microvm' ||
       (has(object.spec.assets) && size(object.spec.assets) > 0 &&
        object.spec.assets.all(arch,
-         ['cloud-hypervisor', 'kata-kernel', 'kata-image', 'kata-config']
+         ['cloud-hypervisor', 'virtiofsd', 'kata-kernel', 'kata-image', 'kata-config']
            .all(name, name in object.spec.assets[arch])))
-    message: "a microvm SandboxConfig must define cloud-hypervisor, kata-kernel, kata-image, and kata-config assets for every architecture under spec.assets"
+    message: "a microvm SandboxConfig must define cloud-hypervisor, virtiofsd, kata-kernel, kata-image, and kata-config assets for every architecture under spec.assets"
 ---
 apiVersion: admissionregistration.k8s.io/v1
 kind: ValidatingAdmissionPolicyBinding
diff --git a/pkg/api/v1alpha1/sandboxconfig_validation_test.go b/pkg/api/v1alpha1/sandboxconfig_validation_test.go
index 3bb28dbba..20423e674 100644
--- a/pkg/api/v1alpha1/sandboxconfig_validation_test.go
+++ b/pkg/api/v1alpha1/sandboxconfig_validation_test.go
@@ -48,13 +48,13 @@ func sandboxConfig(name string, class SandboxClass, assets map[string]map[string
 func runscAsset() AssetFile { return AssetFile{URL: "gs://bucket/runsc", SHA256: validSHA256} }
 
 // microVMAssets returns a full, valid micro-VM asset set for one architecture:
-// the four assets the policy requires. ateom owns the cloud-hypervisor boot and
-// gives the actor a writable virtio-blk rootfs, so the set has no kata-shim or
-// virtiofsd.
+// the five assets the policy requires. The overlay rootfs serves the OCI image
+// over virtio-fs, so virtiofsd is part of the set.
 func microVMAssets() map[string]AssetFile {
 	a := AssetFile{URL: "gs://bucket/asset", SHA256: validSHA256}
 	return map[string]AssetFile{
 		"cloud-hypervisor": a,
+		"virtiofsd":        a,
 		"kata-kernel":      a,
 		"kata-image":       a,
 		"kata-config":      a,
@@ -133,6 +133,15 @@ func TestSandboxConfigValidation(t *testing.T) {
 		}()}),
 		wantErr: true,
 		errMsg:  "microvm SandboxConfig must define",
+	}, {
+		name: "microvm missing virtiofsd",
+		sc: sandboxConfig("bad-microvm-novfsd", "microvm", map[string]map[string]AssetFile{"amd64": func() map[string]AssetFile {
+			m := microVMAssets()
+			delete(m, "virtiofsd")
+			return m
+		}()}),
+		wantErr: true,
+		errMsg:  "microvm SandboxConfig must define",
 	}, {
 		name:    "gvisor arch missing runsc",
 		sc:      sandboxConfig("bad-no-runsc", SandboxClassGvisor, map[string]map[string]AssetFile{"amd64": {"notrunsc": runscAsset()}}),

From 31bd53b19672bb87cbaf042a4451270f57b4b486 Mon Sep 17 00:00:00 2001
From: Benjamin Elder <bentheelder@google.com>
Date: Thu, 25 Jun 2026 18:38:37 -0700
Subject: [PATCH 5/5] ateom-microvm: drop stale owned-boot / reset-to-golden
 comments

Terminology and accuracy fixup in files the overlay change doesn't otherwise touch:
the runtime no longer resets the rootfs to golden (the overlay's tmpfs upper persists
in the memory snapshot), and "owned-boot" was local jargon for ateom booting
cloud-hypervisor itself. Comments only.
---
 cmd/ateom-microvm/internal/kata/config.go      |  6 +++---
 cmd/ateom-microvm/internal/kata/config_test.go |  2 +-
 cmd/ateom-microvm/net.go                       | 10 +++++-----
 cmd/ateom-microvm/spec.go                      |  9 ++++-----
 4 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/cmd/ateom-microvm/internal/kata/config.go b/cmd/ateom-microvm/internal/kata/config.go
index 553059d49..7591e402f 100644
--- a/cmd/ateom-microvm/internal/kata/config.go
+++ b/cmd/ateom-microvm/internal/kata/config.go
@@ -32,8 +32,8 @@ type KataConfig struct {
 	VCPUs int
 	// KernelParams is the guest kernel command line ([hypervisor.clh]
 	// kernel_params): the kata-agent parameters (agent.log, the systemd target,
-	// etc.). The owned boot appends these to the cloud-hypervisor payload cmdline,
-	// since there is no kata shim to inject them.
+	// etc.). ateom appends these to the cloud-hypervisor payload cmdline, since
+	// there is no kata shim to inject them.
 	KernelParams string
 }
 
@@ -52,7 +52,7 @@ type clhConfigTOML struct {
 // ParseConfig reads the guest sizing and kernel_params from a kata
 // configuration.toml. memDefault/vcpuDefault are substituted when the key is
 // absent or non-positive (kata also accepts default_vcpus = -1 meaning "all host
-// CPUs", which the owned boot does not support).
+// CPUs", which ateom does not support).
 func ParseConfig(base []byte, memDefault, vcpuDefault int) (KataConfig, error) {
 	var c clhConfigTOML
 	if err := toml.Unmarshal(base, &c); err != nil {
diff --git a/cmd/ateom-microvm/internal/kata/config_test.go b/cmd/ateom-microvm/internal/kata/config_test.go
index 776aa7e00..dbced02ff 100644
--- a/cmd/ateom-microvm/internal/kata/config_test.go
+++ b/cmd/ateom-microvm/internal/kata/config_test.go
@@ -49,7 +49,7 @@ func TestParseConfig(t *testing.T) {
 
 // TestParseConfigDefaults asserts the mem/vcpu defaults kick in when the keys are
 // absent or non-positive (kata also accepts default_vcpus = -1 meaning "all host
-// CPUs", which the owned boot does not support).
+// CPUs", which ateom does not support).
 func TestParseConfigDefaults(t *testing.T) {
 	for _, tc := range []struct {
 		name string
diff --git a/cmd/ateom-microvm/net.go b/cmd/ateom-microvm/net.go
index e74db2bd3..cc047d4f2 100644
--- a/cmd/ateom-microvm/net.go
+++ b/cmd/ateom-microvm/net.go
@@ -69,9 +69,9 @@ const (
 	// gateway MAC keeps the frozen entry valid on every pod.
 	hostVethMAC = "02:a8:1e:00:00:01"
 
-	// actorGuestMAC is the FIXED MAC for the guest's eth0 (the CH virtio-net) on
-	// the ateom-owned-boot path. Fixed for the same reason as hostVethMAC: a cold
-	// boot freezes this MAC into the guest+snapshot, and restore re-adds the
+	// actorGuestMAC is the FIXED MAC for the guest's eth0 (the CH virtio-net).
+	// Fixed for the same reason as hostVethMAC: a cold boot freezes this MAC into
+	// the guest+snapshot, and restore re-adds the
 	// virtio-net under the same MAC (SnapshotNetDevices reads it back), so the
 	// guest's frozen interface config stays valid across pods. Distinct from the
 	// gateway MAC (…:01).
@@ -599,8 +599,8 @@ func (s *AteomService) setupRestoreTap(ctx context.Context, name string, queuePa
 	return fds, nil
 }
 
-// actorVethMTU reads the MTU of the actor veth (eth0 in the interior netns) so the
-// owned-boot path can configure the guest eth0 with a matching MTU via the agent
+// actorVethMTU reads the MTU of the actor veth (eth0 in the interior netns) so
+// ateom can configure the guest eth0 with a matching MTU via the agent
 // (UpdateInterface). Defaults to 1500 if the link can't be read.
 func (s *AteomService) actorVethMTU(ctx context.Context) int {
 	mtu := 1500
diff --git a/cmd/ateom-microvm/spec.go b/cmd/ateom-microvm/spec.go
index 8b9d5ca08..7962bc5aa 100644
--- a/cmd/ateom-microvm/spec.go
+++ b/cmd/ateom-microvm/spec.go
@@ -90,11 +90,10 @@ func ensureKataCompatibleSpec(bundle, id, netnsPath string) (*specs.Spec, error)
 	//
 	// KNOWN GAP vs the gVisor runtime: this also drops atelet's read-only actor
 	// identity bind mount (/run/ate/actor-id). The micro-VM guest can't see host
-	// paths (the rootfs is a virtio-blk disk, not a shared filesystem), and
-	// reset-to-golden restores guest RAM + rootfs from the golden snapshot, so a
-	// per-actor file written into the rootfs would be shadowed/incorrect on restore.
-	// Exposing the identity needs a per-actor volume injected from OUTSIDE the golden
-	// state; not yet implemented. No micro-VM workload depends on it today.
+	// paths (the rootfs is an overlay of a read-only virtio-fs base + a guest-RAM
+	// tmpfs upper, not a host bind mount), so atelet's host-path identity mount has
+	// nothing to bind to. Exposing the identity needs a per-actor volume plumbed into
+	// the guest; not yet implemented. No micro-VM workload depends on it today.
 	spec.Mounts = defaultKataMounts()
 
 	out, err := json.MarshalIndent(&spec, "", "  ")