Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions .ko.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,7 @@ defaultPlatforms:
baseImageOverrides:
github.com/agent-substrate/substrate/demos/sandbox: alpine
github.com/agent-substrate/substrate/demos/agent-secret: alpine
# ateom-microvm owns the cloud-hypervisor boot and builds the actor's writable
# virtio-blk rootfs at runtime, which needs mkfs.ext4 (e2fsprogs) plus glibc for
# the fetched cloud-hypervisor binary. The committed debian:stable-slim base has
# glibc + coreutils but NOT mkfs.ext4, so this default cannot build the rootfs on
# its own. hack/run-microvm-demo.sh builds hack/ateom-base (debian-slim +
# e2fsprogs) and overrides this base at build time via KO_CONFIG_PATH, so running
# the demo never edits this file. The committed default stays debian:stable-slim.
# ateom-microvm needs glibc (for the fetched cloud-hypervisor binary) and mount/umount
# (to bind the image into the virtiofsd shared dir) — both in debian:stable-slim but
# not in the distroless static default.
github.com/agent-substrate/substrate/cmd/ateom-microvm: debian:stable-slim
51 changes: 22 additions & 29 deletions cmd/ateom-microvm/checkpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,15 @@ import (

// CheckpointWorkload suspends the actor and writes a portable CH snapshot.
//
// Contract with atelet (mirrors ateom-gvisor): after we return, atelet uploads
// the checkpoint dir to object storage, then tears down bundles and resets the
// actor dir.
// Contract with atelet: after we return, atelet uploads the checkpoint dir to object
// storage, then tears down bundles and resets the actor dir.
//
// ateom drives the ateom-owned CH's REST api-socket: pause -> snapshot
// file://<CheckpointStateDir> (config.json + state.json + sparse memory-ranges) ->
// tear the VMM down. The actor's rootfs lives on the host-backed /dev/vdb, not a
// guest tmpfs overlay-upper, so the snapshot is naturally memory-only and small —
// no RAM-backed upper to wipe and no balloon to inflate before snapshot.
// ateom drives the CH REST api-socket: pause -> snapshot file://<CheckpointStateDir>
// (config.json + state.json + sparse memory-ranges) -> tear the VMM down. Each
// container's rootfs is overlay(virtio-fs RO lower + guest-tmpfs upper), so the
// writable upper lives in guest RAM and is captured by the memory snapshot — process
// memory and rootfs writes both persist across suspend/resume. The RO lower is
// reconstructed from the OCI image at restore, so nothing rootfs-related ships here.
func (s *AteomService) CheckpointWorkload(ctx context.Context, req *ateompb.CheckpointWorkloadRequest) (*ateompb.CheckpointWorkloadResponse, error) {
s.lock.Lock()
defer s.lock.Unlock()
Expand Down Expand Up @@ -79,9 +79,9 @@ func (s *AteomService) CheckpointWorkload(ctx context.Context, req *ateompb.Chec
}

// Record the FROZEN base id (the id the guest's virtio-fs find-paths are pinned
// to, <baseID>/rootfs). For a cold (owned-boot) actor this is its own id; for a
// restored actor it is the golden id propagated via ra.baseID (set from the
// snapshot we restored from). RestoreWorkload reads this to lay the
// to, <baseID>/rootfs). For a cold-run actor this is its own id; for a restored
// actor it is the golden id propagated via ra.baseID (set from the snapshot we
// restored from). RestoreWorkload reads this to lay the
// reconstructed-from-image base at the path the guest expects. We can NOT derive
// it from config.json (its socket paths get rewritten to the current id on every
// restore, losing the invariant golden id).
Expand Down Expand Up @@ -120,25 +120,13 @@ func (s *AteomService) CheckpointWorkload(ctx context.Context, req *ateompb.Chec
slog.String("id", id), slog.Duration("merge", time.Since(tMerge)))
}

// reset-to-golden support: save the actor's /dev/vdb AS-OF this (paused,
// consistent) snapshot as a verbatim golden template, so future restores can
// recreate the disk byte-identical to what the snapshot's guest RAM expects
// while discarding the actor's later rootfs writes. Saved once (the first/golden
// checkpoint) and kept; best-effort (without it, restore reopens the live disk =
// continuity). TODO: ship the template with the snapshot for cross-node restore
// (it's golden, shipped once per template, like the OCI base).
actorDir := ateompath.ActorPath(ns, name, id)
if tmpl := filepath.Join(actorDir, goldenRootfsDiskName); fileMissing(tmpl) {
if cerr := copyDiskFile(ctx, filepath.Join(actorDir, actorRootfsDiskName), tmpl); cerr != nil {
slog.WarnContext(ctx, "Failed to save golden rootfs template; restore will reopen live disk", slog.Any("err", cerr))
} else {
slog.InfoContext(ctx, "Saved golden rootfs disk template", slog.String("id", id))
}
}
// Nothing rootfs-related ships: the overlay's writable upper is a guest tmpfs, so
// the actor's rootfs writes are already in the memory snapshot above, and the RO
// lower is reconstructed from the OCI image at restore (it never changes).

// Report exactly the files we wrote so atelet ships precisely the CH snapshot
// (config.json + state.json + memory-ranges + base-id), not gVisor's fixed set.
// Memory-only: the RO base is reconstructed from the OCI image at restore.
// (config.json + state.json + memory-ranges + base-id). The RO base is
// reconstructed from the OCI image at restore.
snapshotFiles, err := listFiles(checkpointDir)
if err != nil {
return nil, fmt.Errorf("while listing snapshot files: %w", err)
Expand All @@ -151,7 +139,7 @@ func (s *AteomService) CheckpointWorkload(ctx context.Context, req *ateompb.Chec
dTeardown := time.Since(tTeardown)
delete(s.running, id)

// Tear down the per-activation actor network (mirrors gVisor).
// Tear down the per-activation actor network.
if err := s.cleanupActorNetwork(ctx); err != nil {
slog.WarnContext(ctx, "Failed to clean up actor network after checkpoint", slog.Any("err", err))
}
Expand Down Expand Up @@ -207,6 +195,11 @@ func (s *AteomService) teardownActor(ctx context.Context, id string, ra *running
_ = ra.chCmd.Process.Kill()
_, _ = ra.chCmd.Process.Wait()
}
// Kill the virtiofsd serving the overlay RO lower (after CH, its only client).
if ra.vfsdCmd != nil && ra.vfsdCmd.Process != nil {
_ = ra.vfsdCmd.Process.Kill()
_, _ = ra.vfsdCmd.Process.Wait()
}
}

// Sweep any leftover per-sandbox host-side state + orphaned per-sandbox
Expand Down
47 changes: 31 additions & 16 deletions cmd/ateom-microvm/internal/ch/createvm.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,35 @@ import (
)

// VmConfig is the body of /api/v1/vm.create — the subset of cloud-hypervisor's
// VmConfig ateom sets to boot the kata guest. Modeled on kata's clh driver
// (src/runtime/virtcontainers/clh.go). vm.create + vm.boot are issued with PUT.
//
// Every field is declared exactly once: a repeated field name is a compile
// error in Go. Optional devices carry omitempty so an unset device is left out
// of the request body.
type VmConfig struct {
	// Cpus sets the boot/max vCPU counts.
	Cpus CpusConfig `json:"cpus"`
	// Memory is the guest memory configuration.
	Memory MemoryConfig `json:"memory"`
	// Payload describes what the VM boots, including the kernel cmdline.
	Payload PayloadConfig `json:"payload"`
	// Disks lists the virtio-blk disks.
	Disks []DiskConfig `json:"disks,omitempty"`
	// Fs lists the virtio-fs devices, each backed by a vhost-user (virtiofsd)
	// socket.
	Fs      []FsConfig     `json:"fs,omitempty"`
	Rng     *RngConfig     `json:"rng,omitempty"`
	Serial  *ConsoleConfig `json:"serial,omitempty"`
	Console *ConsoleConfig `json:"console,omitempty"`
	Vsock   *VsockConfig   `json:"vsock,omitempty"`
	// Platform sets VM-wide platform options (e.g. the number of PCI segments,
	// which must be >1 when a virtio-fs device sits on a non-zero segment).
	Platform *PlatformConfig `json:"platform,omitempty"`
}

// FsConfig describes one virtio-fs device whose backend is a vhost-user
// (virtiofsd) socket. The overlay rootfs path uses the device as its read-only
// lower layer, and the guest mounts it by its tag.
type FsConfig struct {
	// Tag is the name the guest uses to mount this device.
	Tag string `json:"tag"`
	// Socket is the path of the vhost-user (virtiofsd) socket backing the device.
	Socket string `json:"socket"`
	// NumQueues is the queue count; left out of the JSON body when zero.
	NumQueues int32 `json:"num_queues,omitempty"`
	// QueueSize is the per-queue size; left out of the JSON body when zero.
	QueueSize int32 `json:"queue_size,omitempty"`
	// PciSegment is the PCI segment the device sits on; left out of the JSON
	// body when zero.
	PciSegment int32 `json:"pci_segment,omitempty"`
}

// PlatformConfig carries the VM-wide platform options.
type PlatformConfig struct {
	// NumPciSegments is the number of PCI segments. It has to be greater than 1
	// whenever a virtio-fs device is placed on a non-zero PCI segment (kata puts
	// fs on segment 1). Left out of the JSON body when zero.
	NumPciSegments int32 `json:"num_pci_segments,omitempty"`
}

// CpusConfig sets the boot/max vCPU counts.
Expand All @@ -56,10 +72,9 @@ type PayloadConfig struct {
Cmdline string `json:"cmdline"`
}

// DiskConfig is one virtio-blk disk. The kata guest image is disk 0 (/dev/vda,
// readonly); ateom appends the actor rootfs as disk 1 (/dev/vdb, writable). The
// guest sees disks in config order. NumQueues/QueueSize mirror kata's clh
// (num_queues = vcpus, queue_size = 1024).
// DiskConfig is one virtio-blk disk. The only disk is the kata guest image
// (/dev/vda, read-only); the actor rootfs is an overlay served over virtio-fs, not a
// disk. NumQueues/QueueSize mirror kata's clh (num_queues = vcpus, queue_size = 1024).
type DiskConfig struct {
Path string `json:"path"`
Readonly bool `json:"readonly"`
Expand Down
20 changes: 10 additions & 10 deletions cmd/ateom-microvm/internal/kata/agentclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,9 @@ func (a *AgentClient) StartContainer(ctx context.Context, containerID string) er

// CreateSandbox establishes the agent's sandbox context (sandbox id, hostname,
// sandbox pidns) before any container is created. The kata shim normally issues
// this once at VM boot; on the ateom-owned-boot path (no shim) ateom must call it
// itself so the agent has a sandbox to attach containers to. Storages is empty —
// the actor rootfs arrives as a per-container "blk" storage, not a sandbox mount.
// this once at VM boot; ateom (no shim) must call it itself so the agent has a
// sandbox to attach containers to. Storages carries the shared virtio-fs mount
// (the overlay lowers); each container's rootfs is assembled per-container.
// Mirrors grpc.AgentService/CreateSandbox (returns google.protobuf.Empty).
func (a *AgentClient) CreateSandbox(ctx context.Context, req *agentpb.CreateSandboxRequest) error {
if err := a.client.Call(ctx, "grpc.AgentService", "CreateSandbox", req, &emptypb.Empty{}); err != nil {
Expand All @@ -169,10 +169,10 @@ func (a *AgentClient) CreateSandbox(ctx context.Context, req *agentpb.CreateSand
return nil
}

// UpdateInterface configures a guest network interface (the kata shim's job; on
// the owned-boot path ateom does it). The agent matches the link by HwAddr, then
// applies the name/IP/MTU. Mirrors grpc.AgentService/UpdateInterface (returns the
// resulting Interface).
// UpdateInterface configures a guest network interface (the kata shim's job, which
// ateom does itself). The agent matches the link by HwAddr, then applies the
// name/IP/MTU. Mirrors grpc.AgentService/UpdateInterface (returns the resulting
// Interface).
func (a *AgentClient) UpdateInterface(ctx context.Context, iface *agentpb.Interface) error {
req := &agentpb.UpdateInterfaceRequest{Interface: iface}
if err := a.client.Call(ctx, "grpc.AgentService", "UpdateInterface", req, &agentpb.Interface{}); err != nil {
Expand Down Expand Up @@ -208,7 +208,7 @@ func (a *AgentClient) AddARPNeighbors(ctx context.Context, neighbors []*agentpb.
// buffered (up to max), so callers loop until it returns an error — the agent
// returns an error/EOF-like status once the stream ends (container exit / connection
// close). Mirrors grpc.AgentService/ReadStdout. The kata-agent keys the stream by
// ExecId, which the owned-boot path sets equal to ContainerId (see StartBlkWorkload).
// ExecId, which ateom sets equal to ContainerId.
func (a *AgentClient) ReadStdout(ctx context.Context, containerID, execID string, max uint32) ([]byte, error) {
resp := &agentpb.ReadStreamResponse{}
req := &agentpb.ReadStreamRequest{ContainerId: containerID, ExecId: execID, Len: max}
Expand Down Expand Up @@ -246,8 +246,8 @@ type StreamReader struct {
}

// NewStdioReader builds a StreamReader over one of the container's output
// streams: stdout when stderr is false, stderr when it is true. execID equals
// containerID (ateom sets ExecId == ContainerId when it creates the container).
// The arguments are stored on the reader as given.
func NewStdioReader(ctx context.Context, ac *AgentClient, containerID, execID string, stderr bool) *StreamReader {
	reader := new(StreamReader)
	reader.ctx = ctx
	reader.ac = ac
	reader.containerID = containerID
	reader.execID = execID
	reader.stderr = stderr
	return reader
}
Expand Down
6 changes: 3 additions & 3 deletions cmd/ateom-microvm/internal/kata/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ type KataConfig struct {
VCPUs int
// KernelParams is the guest kernel command line ([hypervisor.clh]
// kernel_params): the kata-agent parameters (agent.log, the systemd target,
// etc.). The owned boot appends these to the cloud-hypervisor payload cmdline,
// since there is no kata shim to inject them.
// etc.). ateom appends these to the cloud-hypervisor payload cmdline, since
// there is no kata shim to inject them.
KernelParams string
}

Expand All @@ -52,7 +52,7 @@ type clhConfigTOML struct {
// ParseConfig reads the guest sizing and kernel_params from a kata
// configuration.toml. memDefault/vcpuDefault are substituted when the key is
// absent or non-positive (kata also accepts default_vcpus = -1 meaning "all host
// CPUs", which the owned boot does not support).
// CPUs", which ateom does not support).
func ParseConfig(base []byte, memDefault, vcpuDefault int) (KataConfig, error) {
var c clhConfigTOML
if err := toml.Unmarshal(base, &c); err != nil {
Expand Down
2 changes: 1 addition & 1 deletion cmd/ateom-microvm/internal/kata/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ func TestParseConfig(t *testing.T) {

// TestParseConfigDefaults asserts the mem/vcpu defaults kick in when the keys are
// absent or non-positive (kata also accepts default_vcpus = -1 meaning "all host
// CPUs", which the owned boot does not support).
// CPUs", which ateom does not support).
func TestParseConfigDefaults(t *testing.T) {
for _, tc := range []struct {
name string
Expand Down
140 changes: 0 additions & 140 deletions cmd/ateom-microvm/internal/kata/disk.go

This file was deleted.

Loading
Loading