diff --git a/lib/forkvm/README.md b/lib/forkvm/README.md index 6e956fec..a90c6e63 100644 --- a/lib/forkvm/README.md +++ b/lib/forkvm/README.md @@ -59,14 +59,22 @@ instead of reusing the source identity. - Network override fields are supplied at snapshot load to bind the fork to its own TAP device. - Vsock CID remains stable for snapshot-based flows. +- Standby forks hardlink the source's snapshot mem-file instead of copying it: + fanout costs no memory I/O and every fork of a snapshot faults against one + inode, so the kernel page cache (and the UFFD pager cache, keyed by the + inherited snapshot cache key) is shared across siblings. Deleting the source + is still safe immediately — unlink only drops a name; forks keep the inode + alive via its link count. +- Sharing an inode is safe because Firecracker mmaps the mem-file MAP_PRIVATE + (guest writes never reach the file) and the only file writer — the in-place + diff-snapshot merge on standby — first replaces any mem-file with `nlink > 1` + with a private copy (reflink-cloned where the filesystem supports FICLONE, + sparse-copied otherwise). A fork therefore becomes fully independent at its + first standby, and a source that standbys while forks still share its base + unshares the same way instead of mutating memory a fork reads. - When the Firecracker snapshot memory backend is configured as UFFD, UFFD is used as a one-shot acceleration for the first restore of a newly forked - standby snapshot. The fork initially reuses the source snapshot memory as the - pager backing file instead of cloning the large memory file during fanout. -- That deferred memory clone is paid when the fork later enters standby. Before - Firecracker writes the fork's diff snapshot, Hypeman materializes the fork's - own `snapshot-latest/memory` file from the original backing memory. After that - point the fork has a normal on-disk snapshot base, independent from the source. + standby snapshot. - Subsequent direct restores of that same fork use Firecracker's normal file-backed memory backend. If that standby fork is itself forked again, the new child gets its own one-shot UFFD restore. diff --git a/lib/guestmemory/controller_test.go b/lib/guestmemory/controller_test.go index 0b7ea7e9..4720ec7a 100644 --- a/lib/guestmemory/controller_test.go +++ b/lib/guestmemory/controller_test.go @@ -47,11 +47,9 @@ func (s *stubHypervisor) Shutdown(ctx context.Context) error { return nil } func (s *stubHypervisor) GetVMInfo(ctx context.Context) (*hypervisor.VMInfo, error) { return &hypervisor.VMInfo{State: hypervisor.StateRunning}, nil } -func (s *stubHypervisor) Pause(ctx context.Context) error { return nil } -func (s *stubHypervisor) Resume(ctx context.Context) error { return nil } -func (s *stubHypervisor) Snapshot(ctx context.Context, destPath string, _ hypervisor.SnapshotOptions) error { - return nil -} +func (s *stubHypervisor) Pause(ctx context.Context) error { return nil } +func (s *stubHypervisor) Resume(ctx context.Context) error { return nil } +func (s *stubHypervisor) Snapshot(ctx context.Context, destPath string) error { return nil } func (s *stubHypervisor) ResizeMemory(ctx context.Context, bytes int64) error { return nil } func (s *stubHypervisor) ResizeMemoryAndWait(ctx context.Context, bytes int64, timeout time.Duration) error { return nil diff --git a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go index 8f6f2484..1d3a7f14 100644 --- a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go +++ b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go @@ -160,7 +160,7 @@ func (c *CloudHypervisor) Resume(ctx context.Context) error { } // Snapshot creates a VM snapshot. -func (c *CloudHypervisor) Snapshot(ctx context.Context, destPath string, _ hypervisor.SnapshotOptions) error { +func (c *CloudHypervisor) Snapshot(ctx context.Context, destPath string) error { snapshotURL := "file://" + destPath snapshotConfig := vmm.VmSnapshotConfig{DestinationUrl: &snapshotURL} resp, err := c.client.PutVmSnapshotWithResponse(ctx, snapshotConfig) diff --git a/lib/hypervisor/firecracker/config_test.go b/lib/hypervisor/firecracker/config_test.go index fcbe7c6b..c5e8dc6c 100644 --- a/lib/hypervisor/firecracker/config_test.go +++ b/lib/hypervisor/firecracker/config_test.go @@ -101,37 +101,6 @@ func TestSnapshotLoadParamsSupportsUFFDBackend(t *testing.T) { assert.Equal(t, "/tmp/pager.sock", load.MemBackend.BackendPath) } -func TestMaterializeDeferredSnapshotMemory(t *testing.T) { - t.Parallel() - - sourcePath := filepath.Join(t.TempDir(), "source-memory") - snapshotDir := filepath.Join(t.TempDir(), "snapshot-latest") - require.NoError(t, os.WriteFile(sourcePath, []byte("memory"), 0644)) - - require.NoError(t, materializeDeferredSnapshotMemory(snapshotDir, sourcePath)) - - got, err := os.ReadFile(filepath.Join(snapshotDir, "memory")) - require.NoError(t, err) - assert.Equal(t, []byte("memory"), got) -} - -func TestMaterializeDeferredSnapshotMemoryUsesRetainedSnapshotAlternate(t *testing.T) { - t.Parallel() - - root := t.TempDir() - sourcePath := filepath.Join(root, "snapshots", "snapshot-base", "memory") - alternatePath := filepath.Join(root, "snapshots", "snapshot-latest", "memory") - destPath := filepath.Join(t.TempDir(), "snapshot-latest") - require.NoError(t, os.MkdirAll(filepath.Dir(alternatePath), 0755)) - require.NoError(t, os.WriteFile(alternatePath, []byte("memory"), 0644)) - - require.NoError(t, materializeDeferredSnapshotMemory(destPath, sourcePath)) - - got, err := os.ReadFile(filepath.Join(destPath, "memory")) - require.NoError(t, err) - assert.Equal(t, []byte("memory"), got) -} - func TestToBalloonConfig(t *testing.T) { cfg := hypervisor.VMConfig{ GuestMemory: hypervisor.GuestMemoryConfig{ diff --git a/lib/hypervisor/firecracker/firecracker.go b/lib/hypervisor/firecracker/firecracker.go index c2bb7db3..3512f653 100644 --- a/lib/hypervisor/firecracker/firecracker.go +++ b/lib/hypervisor/firecracker/firecracker.go @@ -14,7 +14,6 @@ import ( "strings" "time" - "github.com/kernel/hypeman/lib/forkvm" "github.com/kernel/hypeman/lib/hypervisor" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" @@ -113,13 +112,10 @@ func (f *Firecracker) Resume(ctx context.Context) error { return nil } -func (f *Firecracker) Snapshot(ctx context.Context, destPath string, opts hypervisor.SnapshotOptions) error { +func (f *Firecracker) Snapshot(ctx context.Context, destPath string) error { if err := os.MkdirAll(destPath, 0755); err != nil { return fmt.Errorf("create snapshot directory: %w", err) } - if err := materializeDeferredSnapshotMemory(destPath, opts.DeferredMemoryBackingPath); err != nil { - return err - } params := toSnapshotCreateParams(destPath) if _, err := f.do(ctx, http.MethodPut, "/snapshot/create", params, http.StatusNoContent); err != nil { return fmt.Errorf("create snapshot: %w", err) @@ -127,62 +123,6 @@ func (f *Firecracker) Snapshot(ctx context.Context, destPath string, opts hyperv return nil } -func materializeDeferredSnapshotMemory(destPath, sourcePath string) error { - sourcePath = strings.TrimSpace(sourcePath) - if sourcePath == "" { - return nil - } - targetPath := filepath.Join(destPath, "memory") - if _, err := os.Stat(targetPath); err == nil { - return nil - } else if !os.IsNotExist(err) { - return fmt.Errorf("stat deferred snapshot memory target: %w", err) - } - resolvedSourcePath, err := resolveDeferredSnapshotMemorySourcePath(sourcePath) - if err != nil { - return err - } - if err := forkvm.CopyRegularFile(resolvedSourcePath, targetPath); err != nil { - return fmt.Errorf("materialize deferred snapshot memory: %w", err) - } - return nil -} - -func resolveDeferredSnapshotMemorySourcePath(sourcePath string) (string, error) { - if _, err := os.Stat(sourcePath); err == nil { - return sourcePath, nil - } else if !os.IsNotExist(err) { - return "", fmt.Errorf("stat deferred snapshot memory source: %w", err) - } - - alternatePath := alternateRetainedSnapshotMemoryPath(sourcePath) - if alternatePath == "" { - return sourcePath, nil - } - if _, err := os.Stat(alternatePath); err == nil { - return alternatePath, nil - } else if !os.IsNotExist(err) { - return "", fmt.Errorf("stat alternate deferred snapshot memory source: %w", err) - } - return sourcePath, nil -} - -func alternateRetainedSnapshotMemoryPath(sourcePath string) string { - if filepath.Base(sourcePath) != "memory" { - return "" - } - snapshotDir := filepath.Dir(sourcePath) - snapshotsDir := filepath.Dir(snapshotDir) - switch filepath.Base(snapshotDir) { - case "snapshot-base": - return filepath.Join(snapshotsDir, "snapshot-latest", "memory") - case "snapshot-latest": - return filepath.Join(snapshotsDir, "snapshot-base", "memory") - default: - return "" - } -} - func (f *Firecracker) ResizeMemory(ctx context.Context, bytes int64) error { return hypervisor.ErrNotSupported } diff --git a/lib/hypervisor/hypervisor.go b/lib/hypervisor/hypervisor.go index ce96fa85..24736258 100644 --- a/lib/hypervisor/hypervisor.go +++ b/lib/hypervisor/hypervisor.go @@ -138,10 +138,6 @@ type RestoreOptions struct { SnapshotMemorySessionID string } -type SnapshotOptions struct { - DeferredMemoryBackingPath string -} - // ForkNetworkConfig contains network identity fields for fork preparation. type ForkNetworkConfig struct { TAPDevice string @@ -199,7 +195,7 @@ type Hypervisor interface { // Snapshot creates a VM snapshot at the given path. // Check Capabilities().SupportsSnapshot before calling. - Snapshot(ctx context.Context, destPath string, opts SnapshotOptions) error + Snapshot(ctx context.Context, destPath string) error // ResizeMemory changes the VM's memory allocation. // Check Capabilities().SupportsHotplugMemory before calling. diff --git a/lib/hypervisor/qemu/qemu.go b/lib/hypervisor/qemu/qemu.go index 70809f90..1da70965 100644 --- a/lib/hypervisor/qemu/qemu.go +++ b/lib/hypervisor/qemu/qemu.go @@ -140,7 +140,7 @@ func (q *QEMU) Resume(ctx context.Context) error { // Snapshot creates a VM snapshot using QEMU's migrate-to-file mechanism. // The VM state is saved to destPath/memory file. // The VM config is copied to destPath for restore (QEMU requires exact arg match). -func (q *QEMU) Snapshot(ctx context.Context, destPath string, _ hypervisor.SnapshotOptions) error { +func (q *QEMU) Snapshot(ctx context.Context, destPath string) error { // QEMU uses migrate to file for snapshots // The "file:" protocol is deprecated in QEMU 7.2+, use "exec:cat > path" instead memoryFile := destPath + "/memory" diff --git a/lib/hypervisor/tracing.go b/lib/hypervisor/tracing.go index f15a25fc..0ee93ae0 100644 --- a/lib/hypervisor/tracing.go +++ b/lib/hypervisor/tracing.go @@ -217,14 +217,14 @@ func (h *tracingHypervisor) Resume(ctx context.Context) (err error) { return h.next.Resume(ctx) } -func (h *tracingHypervisor) Snapshot(ctx context.Context, destPath string, opts SnapshotOptions) (err error) { +func (h *tracingHypervisor) Snapshot(ctx context.Context, destPath string) (err error) { ctx, span := startTraceSpan(ctx, h.tracer, "hypervisor.snapshot", h.spanAttrs( attribute.String("operation", "snapshot"), )..., ) defer func() { finishTraceSpan(span, err) }() - return h.next.Snapshot(ctx, destPath, opts) + return h.next.Snapshot(ctx, destPath) } func (h *tracingHypervisor) ResizeMemory(ctx context.Context, bytes int64) (err error) { diff --git a/lib/hypervisor/tracing_test.go b/lib/hypervisor/tracing_test.go index eb8162ed..409e1bd0 100644 --- a/lib/hypervisor/tracing_test.go +++ b/lib/hypervisor/tracing_test.go @@ -24,11 +24,9 @@ func (fakeHypervisor) Shutdown(context.Context) error { return nil } func (fakeHypervisor) GetVMInfo(context.Context) (*VMInfo, error) { return &VMInfo{State: StateRunning}, nil } -func (fakeHypervisor) Pause(context.Context) error { return nil } -func (fakeHypervisor) Resume(context.Context) error { return nil } -func (fakeHypervisor) Snapshot(context.Context, string, SnapshotOptions) error { - return nil -} +func (fakeHypervisor) Pause(context.Context) error { return nil } +func (fakeHypervisor) Resume(context.Context) error { return nil } +func (fakeHypervisor) Snapshot(context.Context, string) error { return nil } func (fakeHypervisor) ResizeMemory(context.Context, int64) error { return nil } func (fakeHypervisor) ResizeMemoryAndWait(context.Context, int64, time.Duration) error { return nil @@ -43,11 +41,9 @@ func (fakeHypervisorGetVMInfoError) Shutdown(context.Context) error { return nil func (fakeHypervisorGetVMInfoError) GetVMInfo(context.Context) (*VMInfo, error) { return nil, errors.New("vm info failed") } -func (fakeHypervisorGetVMInfoError) Pause(context.Context) error { return nil } -func (fakeHypervisorGetVMInfoError) Resume(context.Context) error { return nil } -func (fakeHypervisorGetVMInfoError) Snapshot(context.Context, string, SnapshotOptions) error { - return nil -} +func (fakeHypervisorGetVMInfoError) Pause(context.Context) error { return nil } +func (fakeHypervisorGetVMInfoError) Resume(context.Context) error { return nil } +func (fakeHypervisorGetVMInfoError) Snapshot(context.Context, string) error { return nil } func (fakeHypervisorGetVMInfoError) ResizeMemory(context.Context, int64) error { return nil } diff --git a/lib/hypervisor/vz/client.go b/lib/hypervisor/vz/client.go index 81953d30..2fb0a62a 100644 --- a/lib/hypervisor/vz/client.go +++ b/lib/hypervisor/vz/client.go @@ -301,7 +301,7 @@ func (c *Client) rawVMState(ctx context.Context) (string, error) { return info.State, nil } -func (c *Client) Snapshot(ctx context.Context, destPath string, _ hypervisor.SnapshotOptions) error { +func (c *Client) Snapshot(ctx context.Context, destPath string) error { req := snapshotRequest{DestinationPath: destPath} body, err := json.Marshal(req) if err != nil { diff --git a/lib/instances/firecracker_memfile.go b/lib/instances/firecracker_memfile.go new file mode 100644 index 00000000..665b8624 --- /dev/null +++ b/lib/instances/firecracker_memfile.go @@ -0,0 +1,72 @@ +package instances + +import ( + "context" + "fmt" + "os" + "path/filepath" + "syscall" + "time" + + "github.com/kernel/hypeman/lib/forkvm" + "github.com/kernel/hypeman/lib/logger" +) + +// linkForkFirecrackerMemFile hardlinks the source's snapshot mem-file into the +// fork's guest dir so all forks of a snapshot share one inode: fanout costs no +// copy I/O, and the pager's backing reads hit the kernel page cache warmed by +// sibling forks. Falls back to a reflink/sparse copy when linking fails. +// Sharing an inode is safe because Firecracker mmaps the mem-file MAP_PRIVATE +// and the only file writer, the standby diff snapshot, unshares first via +// ensureExclusiveSnapshotMemoryOwnership. +func linkForkFirecrackerMemFile(ctx context.Context, srcGuestDir, dstGuestDir string) error { + srcMem := firecrackerSnapshotMemoryPathInGuestDir(srcGuestDir) + dstMem := firecrackerSnapshotMemoryPathInGuestDir(dstGuestDir) + if err := os.MkdirAll(filepath.Dir(dstMem), 0755); err != nil { + return fmt.Errorf("create fork snapshot dir: %w", err) + } + err := os.Link(srcMem, dstMem) + if err == nil { + return nil + } + logger.FromContext(ctx).WarnContext(ctx, "hardlink of fork snapshot memory failed; falling back to copy", + "source", srcMem, "target", dstMem, "error", err) + if err := forkvm.CopyRegularFile(srcMem, dstMem); err != nil { + return fmt.Errorf("copy fork snapshot memory: %w", err) + } + return nil +} + +// ensureExclusiveSnapshotMemoryOwnership replaces the snapshot mem-file with a +// private copy when other hardlinks to its inode exist (fanout forks). +// Firecracker merges diff snapshots by writing dirty pages into this file in +// place, which must never mutate memory another instance still reads. +func ensureExclusiveSnapshotMemoryOwnership(ctx context.Context, snapshotDir string) error { + memPath := filepath.Join(snapshotDir, "memory") + info, err := os.Stat(memPath) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return fmt.Errorf("stat snapshot memory: %w", err) + } + stat, ok := info.Sys().(*syscall.Stat_t) + if !ok || stat.Nlink <= 1 { + return nil + } + + start := time.Now() + tmpPath := memPath + ".unshare.tmp" + _ = os.Remove(tmpPath) + if err := forkvm.CopyRegularFile(memPath, tmpPath); err != nil { + _ = os.Remove(tmpPath) + return fmt.Errorf("copy shared snapshot memory: %w", err) + } + if err := os.Rename(tmpPath, memPath); err != nil { + _ = os.Remove(tmpPath) + return fmt.Errorf("replace shared snapshot memory: %w", err) + } + logger.FromContext(ctx).InfoContext(ctx, "unshared snapshot memory before diff snapshot", + "path", memPath, "links", stat.Nlink, "duration", time.Since(start).String()) + return nil +} diff --git a/lib/instances/firecracker_test.go b/lib/instances/firecracker_test.go index 20fe7983..2456f5ff 100644 --- a/lib/instances/firecracker_test.go +++ b/lib/instances/firecracker_test.go @@ -762,21 +762,24 @@ func TestFCUFFDOneShotLifecycle(t *testing.T) { writeGuestFile(t, ctx, parent, "/root/uffd-lifecycle/parent-after-uffd", "parent-disk") writeGuestFile(t, ctx, parent, "/dev/shm/uffd-lifecycle/parent-after-uffd", "parent-memory") parentSnapshotMemoryPath := filepath.Join(p.InstanceSnapshotLatest(parentID), "memory") - require.NoFileExists(t, parentSnapshotMemoryPath, "UFFD snapshot fanout should defer the memory clone while running") + parentBaseMemoryPath := filepath.Join(p.InstanceSnapshotBase(parentID), "memory") + require.FileExists(t, parentBaseMemoryPath, "UFFD snapshot fanout should hardlink the mem-file (retained as diff base after restore)") + requireSameInode(t, snapshotMemoryPath, parentBaseMemoryPath) parentMeta, err := mgr.loadMetadata(parentID) require.NoError(t, err) require.False(t, parentMeta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) - require.Equal(t, snapshotMemoryPath, parentMeta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) require.NotEmpty(t, parentMeta.StoredMetadata.FirecrackerUFFDSessionID) + require.NoError(t, mgr.DeleteSnapshot(ctx, snapshot.Id), "source snapshot must be deletable while a fork is running from it") + snapshotDeleted = true + parent, err = mgr.StandbyInstance(ctx, parentID, StandbyInstanceRequest{}) require.NoError(t, err) require.Equal(t, StateStandby, parent.State) - require.FileExists(t, parentSnapshotMemoryPath, "standby should materialize a normal file-backed snapshot from the deferred UFFD backing") + require.FileExists(t, parentSnapshotMemoryPath, "standby should produce a file-backed snapshot") parentMeta, err = mgr.loadMetadata(parentID) require.NoError(t, err) require.False(t, parentMeta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) - require.Empty(t, parentMeta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) require.Empty(t, parentMeta.StoredMetadata.FirecrackerUFFDSessionID) parent, err = mgr.RestoreInstance(ctx, parentID) @@ -789,7 +792,6 @@ func TestFCUFFDOneShotLifecycle(t *testing.T) { parentMeta, err = mgr.loadMetadata(parentID) require.NoError(t, err) require.False(t, parentMeta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) - require.Empty(t, parentMeta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) require.Empty(t, parentMeta.StoredMetadata.FirecrackerUFFDSessionID) require.FileExists(t, filepath.Join(p.InstanceSnapshotBase(parentID), "memory"), "file-backed resume should retain the standby snapshot as the next diff base") @@ -821,12 +823,12 @@ func TestFCUFFDOneShotLifecycle(t *testing.T) { assertGuestFileAbsent(t, ctx, parent, "/dev/shm/uffd-lifecycle/child-only") childSnapshotMemoryPath := filepath.Join(p.InstanceSnapshotLatest(childID), "memory") - require.NoFileExists(t, childSnapshotMemoryPath, "running-source child should defer the memory clone while running") + childBaseMemoryPath := filepath.Join(p.InstanceSnapshotBase(childID), "memory") + require.FileExists(t, childBaseMemoryPath, "running-source child should hardlink the mem-file (retained as diff base after restore)") + requireSameInode(t, filepath.Join(p.InstanceSnapshotBase(parentID), "memory"), childBaseMemoryPath) childMeta, err := mgr.loadMetadata(childID) require.NoError(t, err) require.False(t, childMeta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) - require.NotEmpty(t, childMeta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) - require.FileExists(t, childMeta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) require.NotEmpty(t, childMeta.StoredMetadata.FirecrackerUFFDSessionID) parent, err = mgr.StandbyInstance(ctx, parentID, StandbyInstanceRequest{}) @@ -836,17 +838,16 @@ func TestFCUFFDOneShotLifecycle(t *testing.T) { require.FileExists(t, parentSnapshotMemoryPath) parentMeta, err = mgr.loadMetadata(parentID) require.NoError(t, err) - require.Empty(t, parentMeta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) require.Empty(t, parentMeta.StoredMetadata.FirecrackerUFFDSessionID) child, err = mgr.StandbyInstance(ctx, childID, StandbyInstanceRequest{}) require.NoError(t, err) require.Equal(t, StateStandby, child.State) - require.FileExists(t, childSnapshotMemoryPath, "child standby should materialize from its deferred UFFD backing") + require.FileExists(t, childSnapshotMemoryPath, "child standby should produce a file-backed snapshot") + requireDifferentInode(t, filepath.Join(p.InstanceSnapshotLatest(parentID), "memory"), childSnapshotMemoryPath) childMeta, err = mgr.loadMetadata(childID) require.NoError(t, err) require.False(t, childMeta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) - require.Empty(t, childMeta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) require.Empty(t, childMeta.StoredMetadata.FirecrackerUFFDSessionID) child, err = mgr.RestoreInstance(ctx, childID) @@ -911,6 +912,24 @@ func requireGuestTmpfs(t *testing.T, ctx context.Context, inst *Instance) { require.Equal(t, 0, exitCode, output) } +func requireDifferentInode(t *testing.T, pathA, pathB string) { + t.Helper() + infoA, err := os.Stat(pathA) + require.NoError(t, err) + infoB, err := os.Stat(pathB) + require.NoError(t, err) + require.False(t, os.SameFile(infoA, infoB), "%s and %s must not share an inode", pathA, pathB) +} + +func requireSameInode(t *testing.T, pathA, pathB string) { + t.Helper() + infoA, err := os.Stat(pathA) + require.NoError(t, err) + infoB, err := os.Stat(pathB) + require.NoError(t, err) + require.True(t, os.SameFile(infoA, infoB), "%s and %s must share an inode", pathA, pathB) +} + func writeGuestFile(t *testing.T, ctx context.Context, inst *Instance, path, contents string) { t.Helper() output, exitCode, err := execCommand(ctx, inst, "sh", "-c", "mkdir -p \"$1\" && printf '%s' \"$2\" > \"$3\" && sync", "sh", filepath.Dir(path), contents, path) @@ -1025,14 +1044,15 @@ func TestFirecrackerForkIsolation(t *testing.T) { }) require.Equal(t, StateStandby, fork.State) - // Fork's mem-file must be a separate inode from the source's. Hardlinking - // or symlinking would share the inode and allow later writes to corrupt - // the source. + // Fork creation hardlinks the mem-file: the fork shares the source's inode + // (zero-copy fanout, shared page cache) and the standby path unshares it + // before any diff-snapshot write. The byte-identity assertions below are + // the real isolation guard. forkMemPath := filepath.Join(p.InstanceSnapshotLatest(forkID), "memory") forkAfterCreate, err := fingerprintFile(forkMemPath) require.NoError(t, err, "fingerprint fork mem-file after fork") - require.NotEqual(t, sourceBefore.inode, forkAfterCreate.inode, - "fork mem-file must not share an inode with the source") + require.Equal(t, sourceBefore.inode, forkAfterCreate.inode, + "fork mem-file should hardlink the source's inode at creation") sourceAfterFork, err := fingerprintFile(sourceMemPath) require.NoError(t, err) @@ -1068,6 +1088,12 @@ func TestFirecrackerForkIsolation(t *testing.T) { require.NoError(t, err) require.Equal(t, StateStandby, fork.State) + // The fork's standby must have unshared the hardlink before diff-writing. + forkAfterStandby, err := fingerprintFile(forkMemPath) + require.NoError(t, err, "fingerprint fork mem-file after fork standby") + require.NotEqual(t, sourceBefore.inode, forkAfterStandby.inode, + "fork standby must unshare the mem-file before writing its diff snapshot") + // Source mem-file must STILL be byte-identical after the fork's full // lifecycle (restore + write + standby/diff-snapshot). sourceAfterForkStandby, err := fingerprintFile(sourceMemPath) diff --git a/lib/instances/firecracker_uffd.go b/lib/instances/firecracker_uffd.go index 18ceb137..667e4e5d 100644 --- a/lib/instances/firecracker_uffd.go +++ b/lib/instances/firecracker_uffd.go @@ -8,7 +8,6 @@ import ( "os" "path/filepath" "strings" - "sync" "syscall" "time" @@ -29,10 +28,6 @@ func useFirecrackerUFFDOnNextRestore(hvType hypervisor.Type, sourceIsStandby boo return hvType == hypervisor.TypeFirecracker && sourceIsStandby && targetState != StateStopped } -func (m *manager) shouldDeferFirecrackerSnapshotMemoryCopy(stored *StoredMetadata, sourceIsStandby bool, targetState State) bool { - return m.useFirecrackerUFFD(stored) && useFirecrackerUFFDOnNextRestore(stored.HypervisorType, sourceIsStandby, targetState) -} - func clearFirecrackerUFFDRestoreState(stored *StoredMetadata) { if stored == nil { return @@ -40,87 +35,12 @@ func clearFirecrackerUFFDRestoreState(stored *StoredMetadata) { stored.FirecrackerUseUFFDOnNextRestore = false stored.FirecrackerUFFDSessionID = "" stored.FirecrackerUFFDPagerVersion = "" - stored.FirecrackerDeferredSnapshotMemoryPath = "" } func firecrackerSnapshotMemoryPathInGuestDir(guestDir string) string { return filepath.Join(guestDir, firecrackerSnapshotMemoryRelPath) } -func (m *manager) lockFirecrackerSnapshotSource(path string) func() { - key := firecrackerSnapshotSourceLockKey(path) - if key == "" { - return func() {} - } - value, _ := m.snapshotSourceLocks.LoadOrStore(key, &sync.Mutex{}) - mu := value.(*sync.Mutex) - mu.Lock() - return mu.Unlock -} - -func firecrackerSnapshotSourceLockKey(path string) string { - path = filepath.Clean(strings.TrimSpace(path)) - if path == "." || path == "" { - return "" - } - if filepath.Base(path) != "memory" { - return path - } - snapshotDir := filepath.Dir(path) - if base := filepath.Base(snapshotDir); base != "snapshot-latest" && base != "snapshot-base" { - return path - } - snapshotsDir := filepath.Dir(snapshotDir) - if filepath.Base(snapshotsDir) != "snapshots" { - return path - } - return filepath.Dir(snapshotsDir) -} - -func resolveFirecrackerSnapshotMemoryBackingPath(memoryPath string) (string, error) { - if _, err := os.Stat(memoryPath); err == nil { - return memoryPath, nil - } else if !os.IsNotExist(err) { - return "", fmt.Errorf("stat firecracker snapshot memory backing: %w", err) - } - - alternatePath := alternateFirecrackerRetainedSnapshotMemoryPath(memoryPath) - if alternatePath == "" { - return memoryPath, nil - } - if _, err := os.Stat(alternatePath); err == nil { - return alternatePath, nil - } else if !os.IsNotExist(err) { - return "", fmt.Errorf("stat alternate firecracker snapshot memory backing: %w", err) - } - return memoryPath, nil -} - -func alternateFirecrackerRetainedSnapshotMemoryPath(memoryPath string) string { - if filepath.Base(memoryPath) != "memory" { - return "" - } - snapshotDir := filepath.Dir(memoryPath) - snapshotsDir := filepath.Dir(snapshotDir) - switch filepath.Base(snapshotDir) { - case "snapshot-base": - return filepath.Join(snapshotsDir, "snapshot-latest", "memory") - case "snapshot-latest": - return filepath.Join(snapshotsDir, "snapshot-base", "memory") - default: - return "" - } -} - -func firecrackerDeferredSnapshotMemoryPath(stored *StoredMetadata, guestDir string) string { - if stored != nil { - if path := strings.TrimSpace(stored.FirecrackerDeferredSnapshotMemoryPath); path != "" { - return path - } - } - return firecrackerSnapshotMemoryPathInGuestDir(guestDir) -} - func (m *manager) firecrackerSnapshotRestoreOptions(stored *StoredMetadata, snapshotDir string) (hypervisor.RestoreOptions, error) { opts := hypervisor.RestoreOptions{SnapshotMemoryBackend: hypervisor.SnapshotMemoryBackendFile} if stored == nil || stored.HypervisorType != hypervisor.TypeFirecracker { @@ -134,14 +54,7 @@ func (m *manager) firecrackerSnapshotRestoreOptions(stored *StoredMetadata, snap return opts, fmt.Errorf("firecracker uffd snapshot restore is enabled but the pager is not configured") } - backingMemoryPath := strings.TrimSpace(stored.FirecrackerDeferredSnapshotMemoryPath) - if backingMemoryPath == "" { - backingMemoryPath = filepath.Join(snapshotDir, "memory") - } - backingMemoryPath, err := resolveFirecrackerSnapshotMemoryBackingPath(backingMemoryPath) - if err != nil { - return opts, err - } + backingMemoryPath := filepath.Join(snapshotDir, "memory") cacheKey := strings.TrimSpace(stored.FirecrackerSnapshotCacheKey) if cacheKey == "" { var err error diff --git a/lib/instances/firecracker_uffd_test.go b/lib/instances/firecracker_uffd_test.go index dc5ed57d..cd7720b6 100644 --- a/lib/instances/firecracker_uffd_test.go +++ b/lib/instances/firecracker_uffd_test.go @@ -101,10 +101,10 @@ func TestForkInstanceFromStandbyArmsOneShotUFFDForFirecrackerRestore(t *testing. require.NoError(t, err) assert.True(t, meta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) assert.Equal(t, "shared-template-cache", meta.StoredMetadata.FirecrackerSnapshotCacheKey) - assert.Equal(t, sourceMemoryPath, meta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) assert.Empty(t, meta.StoredMetadata.FirecrackerUFFDSessionID) assert.Empty(t, meta.StoredMetadata.FirecrackerUFFDPagerVersion) - assert.NoFileExists(t, firecrackerSnapshotMemoryPathInGuestDir(mgr.paths.InstanceDir(forked.Id))) + forkMemoryPath := firecrackerSnapshotMemoryPathInGuestDir(mgr.paths.InstanceDir(forked.Id)) + assertSameInode(t, sourceMemoryPath, forkMemoryPath) assert.FileExists(t, filepath.Join(mgr.paths.InstanceSnapshotLatest(forked.Id), "state")) } @@ -135,7 +135,6 @@ func TestForkInstanceFromStandbyDoesNotArmOneShotUFFDForStoppedTarget(t *testing require.NoError(t, err) assert.False(t, meta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) assert.Equal(t, "shared-template-cache", meta.StoredMetadata.FirecrackerSnapshotCacheKey) - assert.Empty(t, meta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) assert.FileExists(t, firecrackerSnapshotMemoryPathInGuestDir(mgr.paths.InstanceDir(forked.Id))) } @@ -171,10 +170,10 @@ func TestForkSnapshotFromStandbyArmsOneShotUFFDForFirecrackerRestore(t *testing. require.NoError(t, err) assert.True(t, meta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) assert.Equal(t, "shared-snapshot-cache", meta.StoredMetadata.FirecrackerSnapshotCacheKey) - assert.Equal(t, firecrackerSnapshotMemoryPathInGuestDir(mgr.paths.SnapshotGuestDir(snap.Id)), meta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) assert.Empty(t, meta.StoredMetadata.FirecrackerUFFDSessionID) assert.Empty(t, meta.StoredMetadata.FirecrackerUFFDPagerVersion) - assert.NoFileExists(t, firecrackerSnapshotMemoryPathInGuestDir(mgr.paths.InstanceDir(forked.Id))) + forkMemoryPath := firecrackerSnapshotMemoryPathInGuestDir(mgr.paths.InstanceDir(forked.Id)) + assertSameInode(t, firecrackerSnapshotMemoryPathInGuestDir(mgr.paths.SnapshotGuestDir(snap.Id)), forkMemoryPath) assert.FileExists(t, filepath.Join(mgr.paths.InstanceSnapshotLatest(forked.Id), "state")) } @@ -235,26 +234,63 @@ func TestRestoreInstanceClearsOneShotUFFDFlagAfterSuccessfulRestore(t *testing.T assert.False(t, updated.StoredMetadata.FirecrackerUseUFFDOnNextRestore) } -func TestResolveFirecrackerSnapshotMemoryBackingPathUsesRetainedBaseAlternate(t *testing.T) { +func assertDifferentInode(t *testing.T, pathA, pathB string) { + t.Helper() + infoA, err := os.Stat(pathA) + require.NoError(t, err) + infoB, err := os.Stat(pathB) + require.NoError(t, err) + assert.False(t, os.SameFile(infoA, infoB), "%s and %s must not share an inode", pathA, pathB) +} + +func assertSameInode(t *testing.T, pathA, pathB string) { + t.Helper() + infoA, err := os.Stat(pathA) + require.NoError(t, err) + infoB, err := os.Stat(pathB) + require.NoError(t, err) + assert.True(t, os.SameFile(infoA, infoB), "%s and %s must share an inode", pathA, pathB) +} + +func TestEnsureExclusiveSnapshotMemoryOwnershipUnsharesHardlinkedMemory(t *testing.T) { t.Parallel() root := t.TempDir() - latestMemoryPath := filepath.Join(root, "snapshots", "snapshot-latest", "memory") - baseMemoryPath := filepath.Join(root, "snapshots", "snapshot-base", "memory") - require.NoError(t, os.MkdirAll(filepath.Dir(baseMemoryPath), 0755)) - require.NoError(t, os.WriteFile(baseMemoryPath, []byte("base memory"), 0644)) + snapshotDir := filepath.Join(root, "snapshots", "snapshot-latest") + require.NoError(t, os.MkdirAll(snapshotDir, 0755)) + memPath := filepath.Join(snapshotDir, "memory") + require.NoError(t, os.WriteFile(memPath, []byte("shared memory"), 0644)) + forkLinkPath := filepath.Join(root, "fork-memory") + require.NoError(t, os.Link(memPath, forkLinkPath)) + + require.NoError(t, ensureExclusiveSnapshotMemoryOwnership(context.Background(), snapshotDir)) - resolved, err := resolveFirecrackerSnapshotMemoryBackingPath(latestMemoryPath) + assertDifferentInode(t, memPath, forkLinkPath) + unshared, err := os.ReadFile(memPath) require.NoError(t, err) - assert.Equal(t, baseMemoryPath, resolved) + assert.Equal(t, []byte("shared memory"), unshared) + forkContent, err := os.ReadFile(forkLinkPath) + require.NoError(t, err) + assert.Equal(t, []byte("shared memory"), forkContent) } -func TestFirecrackerSnapshotSourceLockKeyUsesGuestDirectory(t *testing.T) { +func TestEnsureExclusiveSnapshotMemoryOwnershipSkipsPrivateMemory(t *testing.T) { t.Parallel() - guestDir := filepath.Join(t.TempDir(), "guests", "source") - assert.Equal(t, guestDir, firecrackerSnapshotSourceLockKey(filepath.Join(guestDir, "snapshots", "snapshot-latest", "memory"))) - assert.Equal(t, guestDir, firecrackerSnapshotSourceLockKey(filepath.Join(guestDir, "snapshots", "snapshot-base", "memory"))) + snapshotDir := filepath.Join(t.TempDir(), "snapshot-latest") + require.NoError(t, os.MkdirAll(snapshotDir, 0755)) + memPath := filepath.Join(snapshotDir, "memory") + require.NoError(t, os.WriteFile(memPath, []byte("private memory"), 0644)) + before, err := os.Stat(memPath) + require.NoError(t, err) + + require.NoError(t, ensureExclusiveSnapshotMemoryOwnership(context.Background(), snapshotDir)) + + after, err := os.Stat(memPath) + require.NoError(t, err) + assert.True(t, os.SameFile(before, after), "private mem-file must not be rewritten") + + require.NoError(t, ensureExclusiveSnapshotMemoryOwnership(context.Background(), filepath.Join(t.TempDir(), "missing"))) } func installOneShotFirecrackerStarter(t *testing.T, mgr *manager) { @@ -318,7 +354,7 @@ func (oneShotFirecrackerTestHypervisor) Resume(context.Context) error { return nil } -func (oneShotFirecrackerTestHypervisor) Snapshot(context.Context, string, hypervisor.SnapshotOptions) error { +func (oneShotFirecrackerTestHypervisor) Snapshot(context.Context, string) error { return nil } diff --git a/lib/instances/fork.go b/lib/instances/fork.go index 30b0d116..ea6d3a4c 100644 --- a/lib/instances/fork.go +++ b/lib/instances/fork.go @@ -7,7 +7,6 @@ import ( "fmt" "hash/crc32" "os" - "path/filepath" "strings" "time" @@ -102,14 +101,6 @@ func (m *manager) forkInstance(ctx context.Context, id string, req ForkInstanceR } return nil, "", false, fmt.Errorf("restore source instance after fork: %w", restoreErr) } - if forkErr == nil && targetState != StateStopped { - if err := m.repointForkDeferredSnapshotMemoryToSourceBase(forked.Id, id); err != nil { - forkErr = fmt.Errorf("update fork deferred snapshot memory backing: %w", err) - if cleanupErr := m.cleanupForkInstanceOnError(ctx, forked.Id); cleanupErr != nil { - forkErr = fmt.Errorf("%v; additionally failed to cleanup forked instance %s: %v", forkErr, forked.Id, cleanupErr) - } - } - } if restoredSource != nil && !restoredSource.NetworkEnabled && (restoredSource.State == StateRunning || restoredSource.State == StateInitializing) { if err := ensureGuestAgentReadyForForkPhase(ctx, &restoredSource.StoredMetadata, "after restoring running fork source"); err != nil { if forkErr != nil { @@ -211,29 +202,6 @@ func generateForkSourceVsockCID(sourceID, forkID string, current int64) int64 { return cid } -func (m *manager) repointForkDeferredSnapshotMemoryToSourceBase(forkID, sourceID string) error { - meta, err := m.loadMetadata(forkID) - if err != nil { - return err - } - stored := &meta.StoredMetadata - if stored.HypervisorType != hypervisor.TypeFirecracker || stored.FirecrackerDeferredSnapshotMemoryPath == "" { - return nil - } - - sourceLatestMemoryPath := firecrackerSnapshotMemoryPathInGuestDir(m.paths.InstanceDir(sourceID)) - if stored.FirecrackerDeferredSnapshotMemoryPath != sourceLatestMemoryPath { - return nil - } - - sourceBaseMemoryPath := filepath.Join(m.paths.InstanceSnapshotBase(sourceID), "memory") - if _, err := os.Stat(sourceBaseMemoryPath); err != nil { - return fmt.Errorf("stat source retained base memory %q: %w", sourceBaseMemoryPath, err) - } - stored.FirecrackerDeferredSnapshotMemoryPath = sourceBaseMemoryPath - return m.saveMetadata(meta) -} - func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id string, req ForkInstanceRequest, supportValidated bool) (*Instance, bool, error) { log := logger.FromContext(ctx) @@ -289,28 +257,14 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin srcDir := m.paths.InstanceDir(id) dstDir := m.paths.InstanceDir(forkID) - deferredSnapshotMemoryPath := "" - if m.shouldDeferFirecrackerSnapshotMemoryCopy(stored, source.State == StateStandby, targetState) { - deferredSnapshotMemoryPath = firecrackerDeferredSnapshotMemoryPath(stored, srcDir) - } - unlockSnapshotSource := func() {} - snapshotSourceLocked := false - if deferredSnapshotMemoryPath != "" { - unlockSnapshotSource = m.lockFirecrackerSnapshotSource(deferredSnapshotMemoryPath) - snapshotSourceLocked = true - } - defer func() { - if snapshotSourceLocked { - unlockSnapshotSource() - } - }() + shareMemFile := stored.HypervisorType == hypervisor.TypeFirecracker && source.State == StateStandby cu := cleanup.Make(func() { _ = os.RemoveAll(dstDir) }) defer cu.Clean() - if err := m.copyForkSourceGuestDirectory(ctx, source.State, id, stored, srcDir, dstDir, deferredSnapshotMemoryPath); err != nil { + if err := m.copyForkSourceGuestDirectory(ctx, source.State, id, stored, srcDir, dstDir, shareMemFile); err != nil { return nil, false, err } @@ -336,10 +290,8 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin forkMeta.FirecrackerUFFDSessionID = "" forkMeta.FirecrackerUFFDPagerVersion = "" forkMeta.FirecrackerUseUFFDOnNextRestore = useFirecrackerUFFDOnNextRestore(forkMeta.HypervisorType, source.State == StateStandby, targetState) - forkMeta.FirecrackerDeferredSnapshotMemoryPath = deferredSnapshotMemoryPath if source.State != StateStandby { forkMeta.FirecrackerSnapshotCacheKey = "" - forkMeta.FirecrackerDeferredSnapshotMemoryPath = "" } // Forks are new instances; phase accounting must not inherit the source's // cumulative durations. The first transition into the fork's runtime @@ -402,10 +354,6 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin } restoreNeedsSourceLock = prepareResult.RequiresSnapshotSourceAlias } - if snapshotSourceLocked { - unlockSnapshotSource() - snapshotSourceLocked = false - } newMeta := &metadata{StoredMetadata: forkMeta} if err := m.saveForkMetadata(ctx, newMeta); err != nil { diff --git a/lib/instances/lifecycle_noop_test.go b/lib/instances/lifecycle_noop_test.go index b430b567..5ca7515f 100644 --- a/lib/instances/lifecycle_noop_test.go +++ b/lib/instances/lifecycle_noop_test.go @@ -40,7 +40,7 @@ func (h lifecycleNoopHypervisor) GetVMInfo(context.Context) (*hypervisor.VMInfo, } func (h lifecycleNoopHypervisor) Pause(context.Context) error { return nil } func (h lifecycleNoopHypervisor) Resume(context.Context) error { return nil } -func (h lifecycleNoopHypervisor) Snapshot(context.Context, string, hypervisor.SnapshotOptions) error { +func (h lifecycleNoopHypervisor) Snapshot(context.Context, string) error { return nil } func (h lifecycleNoopHypervisor) ResizeMemory(context.Context, int64) error { diff --git a/lib/instances/manager.go b/lib/instances/manager.go index b1bf7578..8e8e25f3 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -171,7 +171,6 @@ type manager struct { resourceValidator ResourceValidator // Optional validator for aggregate resource limits instanceLocks sync.Map // map[string]*sync.RWMutex - per-instance locks forkMetadataMu sync.Mutex - snapshotSourceLocks sync.Map // map[string]*sync.Mutex - Firecracker snapshot source alias locks, retained for process lifetime bootMarkerScans sync.Map // map[string]time.Time next allowed boot-marker rescan hypervisorStateCache sync.Map // map[string]hypervisorStateCacheEntry - last observed hypervisor state per instance hostTopology *HostTopology // Cached host CPU topology diff --git a/lib/instances/restore.go b/lib/instances/restore.go index f41c7725..dfe5e6f5 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -380,8 +380,8 @@ func (m *manager) restoreInstance( // before markers ever hydrated we resume in Initializing. resumePhase, _ := runningPhaseFromMarkers(stored) stored.Phases.Record(resumePhase, time.Now().UTC()) - // The one-shot restore has been consumed, but a UFFD-backed VM may still - // need its pager session until the next standby materializes snapshot memory. + // The one-shot restore has been consumed, but a UFFD-backed VM keeps its + // pager session serving faults until the next standby or stop closes it. stored.FirecrackerUseUFFDOnNextRestore = false meta = &metadata{StoredMetadata: *stored} if err := m.saveMetadata(meta); err != nil { diff --git a/lib/instances/snapshot.go b/lib/instances/snapshot.go index 8a97ace8..97d2ca88 100644 --- a/lib/instances/snapshot.go +++ b/lib/instances/snapshot.go @@ -417,25 +417,8 @@ func (m *manager) forkSnapshot(ctx context.Context, snapshotID string, req ForkS if target != nil && target.State == compressionJobStateRunning { m.recordSnapshotCompressionPreemption(ctx, snapshotCompressionPreemptionForkSnapshot, target.Target) } - snapshotGuestDir := m.paths.SnapshotGuestDir(snapshotID) - deferredSource := rec.StoredMetadata - deferredSource.HypervisorType = targetHypervisor - deferredSnapshotMemoryPath := "" - if m.shouldDeferFirecrackerSnapshotMemoryCopy(&deferredSource, rec.Snapshot.Kind == SnapshotKindStandby, targetState) { - deferredSnapshotMemoryPath = firecrackerDeferredSnapshotMemoryPath(&rec.StoredMetadata, snapshotGuestDir) - } - unlockSnapshotSource := func() {} - snapshotSourceLocked := false - if deferredSnapshotMemoryPath != "" { - unlockSnapshotSource = m.lockFirecrackerSnapshotSource(deferredSnapshotMemoryPath) - snapshotSourceLocked = true - } - defer func() { - if snapshotSourceLocked { - unlockSnapshotSource() - } - }() - if err := m.copySnapshotGuestDirectoryForFork(ctx, snapshotID, rec.StoredMetadata.HypervisorType, dstDir, deferredSnapshotMemoryPath); err != nil { + shareMemFile := rec.StoredMetadata.HypervisorType == hypervisor.TypeFirecracker && rec.Snapshot.Kind == SnapshotKindStandby + if err := m.copySnapshotGuestDirectoryForFork(ctx, snapshotID, rec.StoredMetadata.HypervisorType, dstDir, shareMemFile); err != nil { return nil, err } @@ -469,10 +452,8 @@ func (m *manager) forkSnapshot(ctx context.Context, snapshotID string, req ForkS forkMeta.FirecrackerUFFDSessionID = "" forkMeta.FirecrackerUFFDPagerVersion = "" forkMeta.FirecrackerUseUFFDOnNextRestore = useFirecrackerUFFDOnNextRestore(targetHypervisor, rec.Snapshot.Kind == SnapshotKindStandby, targetState) - forkMeta.FirecrackerDeferredSnapshotMemoryPath = deferredSnapshotMemoryPath if rec.Snapshot.Kind != SnapshotKindStandby { forkMeta.FirecrackerSnapshotCacheKey = "" - forkMeta.FirecrackerDeferredSnapshotMemoryPath = "" } if rec.Snapshot.Kind == SnapshotKindStandby { forkMeta.VsockCID = rec.StoredMetadata.VsockCID @@ -513,10 +494,6 @@ func (m *manager) forkSnapshot(ctx context.Context, snapshotID string, req ForkS "target_data_dir", forkMeta.DataDir) } } - if snapshotSourceLocked { - unlockSnapshotSource() - snapshotSourceLocked = false - } if err := m.saveMetadata(&metadata{StoredMetadata: forkMeta}); err != nil { return nil, fmt.Errorf("save fork metadata: %w", err) diff --git a/lib/instances/snapshot_alias_lock.go b/lib/instances/snapshot_alias_lock.go index 00180e59..87420e88 100644 --- a/lib/instances/snapshot_alias_lock.go +++ b/lib/instances/snapshot_alias_lock.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "os" "github.com/kernel/hypeman/lib/forkvm" "github.com/kernel/hypeman/lib/hypervisor" @@ -40,13 +41,13 @@ func copyGuestDirectoryWithAliasReadLock(srcDir, dstDir string) error { }) } -func (m *manager) copyForkSourceGuestDirectory(ctx context.Context, sourceState State, sourceID string, stored *StoredMetadata, srcDir, dstDir, deferredSnapshotMemoryPath string) error { +func (m *manager) copyForkSourceGuestDirectory(ctx context.Context, sourceState State, sourceID string, stored *StoredMetadata, srcDir, dstDir string, shareMemFile bool) error { ctx, span := m.tracerOrDefault().Start(ctx, "instances.fork.copy_guest_directory", trace.WithAttributes( attribute.String("operation", "fork_copy_guest_directory"), attribute.String("instance_id", sourceID), attribute.String("source_state", string(sourceState)), - attribute.Bool("deferred_snapshot_memory", deferredSnapshotMemoryPath != ""), + attribute.Bool("share_mem_file", shareMemFile), ), ) var retErr error @@ -66,18 +67,13 @@ func (m *manager) copyForkSourceGuestDirectory(ctx context.Context, sourceState readyDone(nil) } - copyOptions := forkvm.CopyOptions{} - if deferredSnapshotMemoryPath != "" { - copyOptions.SkipRelativePaths = map[string]struct{}{firecrackerSnapshotMemoryRelPath: {}} - } - _, cloneDone := m.startLifecycleStep(ctx, "instances.fork.copy_guest_directory.clone", attribute.String("operation", "fork_copy_guest_directory_clone"), attribute.String("instance_id", sourceID), - attribute.Bool("deferred_snapshot_memory", deferredSnapshotMemoryPath != ""), + attribute.Bool("share_mem_file", shareMemFile), ) retErr = withSnapshotSourceAliasReadLock(func() error { - if err := forkvm.CopyGuestDirectoryWithOptions(srcDir, dstDir, copyOptions); err != nil { + if err := cloneGuestDirectoryForFork(ctx, srcDir, dstDir, shareMemFile); err != nil { if errors.Is(err, forkvm.ErrSparseCopyUnsupported) { return fmt.Errorf("fork requires sparse-capable filesystem (SEEK_DATA/SEEK_HOLE unsupported): %w", err) } @@ -89,13 +85,13 @@ func (m *manager) copyForkSourceGuestDirectory(ctx context.Context, sourceState return retErr } -func (m *manager) copySnapshotGuestDirectoryForFork(ctx context.Context, snapshotID string, hvType hypervisor.Type, dstDir, deferredSnapshotMemoryPath string) error { +func (m *manager) copySnapshotGuestDirectoryForFork(ctx context.Context, snapshotID string, hvType hypervisor.Type, dstDir string, shareMemFile bool) error { ctx, span := m.tracerOrDefault().Start(ctx, "instances.snapshot.copy_guest_directory", trace.WithAttributes( attribute.String("operation", "snapshot_copy_guest_directory"), attribute.String("snapshot_id", snapshotID), attribute.String("hypervisor", string(hvType)), - attribute.Bool("deferred_snapshot_memory", deferredSnapshotMemoryPath != ""), + attribute.Bool("share_mem_file", shareMemFile), ), ) var retErr error @@ -113,18 +109,13 @@ func (m *manager) copySnapshotGuestDirectoryForFork(ctx context.Context, snapsho } readyDone(nil) - copyOptions := forkvm.CopyOptions{} - if deferredSnapshotMemoryPath != "" { - copyOptions.SkipRelativePaths = map[string]struct{}{firecrackerSnapshotMemoryRelPath: {}} - } - _, cloneDone := m.startLifecycleStep(ctx, "instances.snapshot.copy_guest_directory.clone", attribute.String("operation", "snapshot_copy_guest_directory_clone"), attribute.String("snapshot_id", snapshotID), - attribute.Bool("deferred_snapshot_memory", deferredSnapshotMemoryPath != ""), + attribute.Bool("share_mem_file", shareMemFile), ) retErr = withSnapshotSourceAliasReadLock(func() error { - if err := forkvm.CopyGuestDirectoryWithOptions(m.paths.SnapshotGuestDir(snapshotID), dstDir, copyOptions); err != nil { + if err := cloneGuestDirectoryForFork(ctx, m.paths.SnapshotGuestDir(snapshotID), dstDir, shareMemFile); err != nil { if errors.Is(err, forkvm.ErrSparseCopyUnsupported) { return fmt.Errorf("fork from snapshot requires sparse-capable filesystem (SEEK_DATA/SEEK_HOLE unsupported): %w", err) } @@ -135,3 +126,27 @@ func (m *manager) copySnapshotGuestDirectoryForFork(ctx context.Context, snapsho cloneDone(retErr) return retErr } + +// cloneGuestDirectoryForFork copies a guest directory for a fork. When +// shareMemFile is set and the source has a raw snapshot mem-file, the mem-file +// is skipped from the copy walk and hardlinked into place instead. +func cloneGuestDirectoryForFork(ctx context.Context, srcDir, dstDir string, shareMemFile bool) error { + srcMem := firecrackerSnapshotMemoryPathInGuestDir(srcDir) + if shareMemFile { + if _, err := os.Stat(srcMem); err != nil { + shareMemFile = false + } + } + + copyOptions := forkvm.CopyOptions{} + if shareMemFile { + copyOptions.SkipRelativePaths = map[string]struct{}{firecrackerSnapshotMemoryRelPath: {}} + } + if err := forkvm.CopyGuestDirectoryWithOptions(srcDir, dstDir, copyOptions); err != nil { + return err + } + if shareMemFile { + return linkForkFirecrackerMemFile(ctx, srcDir, dstDir) + } + return nil +} diff --git a/lib/instances/standby.go b/lib/instances/standby.go index af683a3e..6913a989 100644 --- a/lib/instances/standby.go +++ b/lib/instances/standby.go @@ -138,6 +138,20 @@ func (m *manager) standbyInstance( } return nil, fmt.Errorf("prepare retained snapshot target: %w", err) } + // The diff snapshot below writes dirty pages into the mem-file in + // place; if fanout forks still hardlink its inode, replace it with a + // private copy first so their memory is never mutated. + if err := ensureExclusiveSnapshotMemoryOwnership(ctx, snapshotDir); err != nil { + if resumeErr := hv.Resume(ctx); resumeErr != nil { + log.ErrorContext(ctx, "failed to resume VM after snapshot memory unshare error", "instance_id", id, "error", resumeErr) + } + if promotedExistingBase { + if rollbackErr := discardPromotedRetainedSnapshotTarget(snapshotDir); rollbackErr != nil { + log.WarnContext(ctx, "failed to discard promoted snapshot target after unshare error", "instance_id", id, "error", rollbackErr) + } + } + return nil, fmt.Errorf("unshare snapshot memory: %w", err) + } } log.DebugContext(ctx, "creating snapshot", "instance_id", id, "snapshot_dir", snapshotDir) snapshotCtx, snapshotSpanEnd := m.startLifecycleStep(ctx, "create_snapshot", @@ -146,11 +160,7 @@ func (m *manager) standbyInstance( attribute.String("operation", "create_snapshot"), attribute.Bool("reuse_snapshot_base", reuseSnapshotBase), ) - snapshotOptions := hypervisor.SnapshotOptions{} - if stored.HypervisorType == hypervisor.TypeFirecracker { - snapshotOptions.DeferredMemoryBackingPath = stored.FirecrackerDeferredSnapshotMemoryPath - } - if err := createSnapshot(snapshotCtx, hv, snapshotDir, reuseSnapshotBase, snapshotOptions); err != nil { + if err := createSnapshot(snapshotCtx, hv, snapshotDir, reuseSnapshotBase); err != nil { snapshotSpanEnd(err) // Snapshot failed - try to resume VM log.ErrorContext(ctx, "snapshot failed, attempting to resume VM", "instance_id", id, "error", err) @@ -279,7 +289,7 @@ func (m *manager) standbyInstance( } // createSnapshot creates a snapshot using the hypervisor interface -func createSnapshot(ctx context.Context, hv hypervisor.Hypervisor, snapshotDir string, reuseSnapshotBase bool, opts hypervisor.SnapshotOptions) error { +func createSnapshot(ctx context.Context, hv hypervisor.Hypervisor, snapshotDir string, reuseSnapshotBase bool) error { log := logger.FromContext(ctx) // Remove old snapshot if the hypervisor does not support reusing snapshots @@ -295,7 +305,7 @@ func createSnapshot(ctx context.Context, hv hypervisor.Hypervisor, snapshotDir s // Create snapshot via hypervisor API log.DebugContext(ctx, "invoking hypervisor snapshot API", "snapshot_dir", snapshotDir) - if err := hv.Snapshot(ctx, snapshotDir, opts); err != nil { + if err := hv.Snapshot(ctx, snapshotDir); err != nil { return fmt.Errorf("snapshot: %w", err) } diff --git a/lib/instances/types.go b/lib/instances/types.go index 6ff42406..1ff97575 100644 --- a/lib/instances/types.go +++ b/lib/instances/types.go @@ -131,11 +131,10 @@ type StoredMetadata struct { HypervisorPID *int // Hypervisor process ID (may be stale after host restart) // Firecracker UFFD snapshot restore metadata. - FirecrackerSnapshotCacheKey string - FirecrackerUseUFFDOnNextRestore bool - FirecrackerUFFDSessionID string - FirecrackerUFFDPagerVersion string - FirecrackerDeferredSnapshotMemoryPath string + FirecrackerSnapshotCacheKey string + FirecrackerUseUFFDOnNextRestore bool + FirecrackerUFFDSessionID string + FirecrackerUFFDPagerVersion string // Paths SocketPath string // Path to API socket diff --git a/lib/uffdpager/VERSION b/lib/uffdpager/VERSION index b1e80bb2..845639ee 100644 --- a/lib/uffdpager/VERSION +++ b/lib/uffdpager/VERSION @@ -1 +1 @@ -0.1.3 +0.1.4