From f26dc2446a628abe224749d02a9e9990e87dd4e5 Mon Sep 17 00:00:00 2001 From: sjmiller609 <7516283+sjmiller609@users.noreply.github.com> Date: Wed, 1 Jul 2026 21:53:50 +0000 Subject: [PATCH 1/2] fork: give Firecracker forks their own mem-file instead of deferring the copy Standby forks previously skipped the mem-file copy and stored a path to the source's memory file, serving UFFD faults from it and copying it only at the fork's next standby. That left forks silently dependent on the source snapshot or instance: deleting or stopping the source stranded standby forks, and the source's next diff snapshot mutated the shared file in place under running forks. Fork copies now always include the mem-file via the existing reflink-first directory copy, so a fork owns its memory from creation and the source can be deleted immediately. UFFD one-shot restore is unchanged except that the pager serves from the fork-local file; the shared page cache still works because forks inherit the source's cache key and the cloned files are byte-identical. Removes the deferred-path machinery: FirecrackerDeferredSnapshotMemoryPath, materialize-on-standby, base/latest alternate path resolution, snapshot source locks, the repoint step after running-source forks, and SnapshotOptions. --- lib/forkvm/README.md | 15 ++-- lib/forkvm/copy.go | 13 --- lib/forkvm/copy_test.go | 20 ----- lib/guestmemory/controller_test.go | 8 +- .../cloudhypervisor/cloudhypervisor.go | 2 +- lib/hypervisor/firecracker/config_test.go | 31 ------- lib/hypervisor/firecracker/firecracker.go | 62 +------------ lib/hypervisor/hypervisor.go | 6 +- lib/hypervisor/qemu/qemu.go | 2 +- lib/hypervisor/tracing.go | 4 +- lib/hypervisor/tracing_test.go | 16 ++-- lib/hypervisor/vz/client.go | 2 +- lib/instances/firecracker_test.go | 31 ++++--- lib/instances/firecracker_uffd.go | 89 +------------------ lib/instances/firecracker_uffd_test.go | 42 ++++----- lib/instances/fork.go | 55 +----------- lib/instances/lifecycle_noop_test.go | 2 +- lib/instances/manager.go | 1 - lib/instances/restore.go | 4 +- lib/instances/snapshot.go | 26 +----- lib/instances/snapshot_alias_lock.go | 22 +---- lib/instances/standby.go | 10 +-- lib/instances/types.go | 9 +- 23 files changed, 79 insertions(+), 393 deletions(-) diff --git a/lib/forkvm/README.md b/lib/forkvm/README.md index 6e956fec..1796401d 100644 --- a/lib/forkvm/README.md +++ b/lib/forkvm/README.md @@ -59,14 +59,17 @@ instead of reusing the source identity. - Network override fields are supplied at snapshot load to bind the fork to its own TAP device. - Vsock CID remains stable for snapshot-based flows. +- Fork copies always give the fork its own mem-file (reflink-cloned where the + filesystem supports FICLONE, sparse-copied otherwise), so a fork never + depends on the source snapshot or source instance after creation. Deleting + the source is safe immediately, and the source's later diff snapshots cannot + mutate memory a fork reads. - When the Firecracker snapshot memory backend is configured as UFFD, UFFD is used as a one-shot acceleration for the first restore of a newly forked - standby snapshot. The fork initially reuses the source snapshot memory as the - pager backing file instead of cloning the large memory file during fanout. -- That deferred memory clone is paid when the fork later enters standby. Before - Firecracker writes the fork's diff snapshot, Hypeman materializes the fork's - own `snapshot-latest/memory` file from the original backing memory. After that - point the fork has a normal on-disk snapshot base, independent from the source. + standby snapshot. The pager serves pages from the fork's own mem-file; + because forks inherit the source's snapshot cache key and their mem-files are + byte-identical clones, the pager's page cache is shared across all forks of + the same snapshot. - Subsequent direct restores of that same fork use Firecracker's normal file-backed memory backend. If that standby fork is itself forked again, the new child gets its own one-shot UFFD restore. diff --git a/lib/forkvm/copy.go b/lib/forkvm/copy.go index 26a35802..4a17d01d 100644 --- a/lib/forkvm/copy.go +++ b/lib/forkvm/copy.go @@ -33,22 +33,12 @@ type copyState struct { reflinkDead bool } -// CopyOptions controls which guest-directory files are copied. -type CopyOptions struct { - SkipRelativePaths map[string]struct{} -} - // CopyGuestDirectory recursively copies a guest directory to a new destination. // Regular files are cloned via reflink (FICLONE) when the underlying filesystem // supports it; otherwise we fall back to a sparse extent copy // (SEEK_DATA/SEEK_HOLE). Runtime sockets and logs are skipped because they are // host-runtime artifacts. func CopyGuestDirectory(srcDir, dstDir string) error { - return CopyGuestDirectoryWithOptions(srcDir, dstDir, CopyOptions{}) -} - -// CopyGuestDirectoryWithOptions is CopyGuestDirectory with optional path skips. -func CopyGuestDirectoryWithOptions(srcDir, dstDir string, opts CopyOptions) error { srcInfo, err := os.Stat(srcDir) if err != nil { return fmt.Errorf("stat source directory: %w", err) @@ -78,9 +68,6 @@ func CopyGuestDirectoryWithOptions(srcDir, dstDir string, opts CopyOptions) erro if relPath == "." { return nil } - if _, ok := opts.SkipRelativePaths[filepath.Clean(relPath)]; ok { - return nil - } if d.IsDir() && shouldSkipDirectory(relPath) { return filepath.SkipDir } diff --git a/lib/forkvm/copy_test.go b/lib/forkvm/copy_test.go index 00634b26..5dc44c8e 100644 --- a/lib/forkvm/copy_test.go +++ b/lib/forkvm/copy_test.go @@ -44,26 +44,6 @@ func TestCopyGuestDirectory(t *testing.T) { assert.Equal(t, "metadata.json", linkTarget) } -func TestCopyGuestDirectoryWithOptionsSkipsRelativePaths(t *testing.T) { - src := filepath.Join(t.TempDir(), "src") - dst := filepath.Join(t.TempDir(), "dst") - - require.NoError(t, os.MkdirAll(filepath.Join(src, "snapshots", "snapshot-latest"), 0755)) - require.NoError(t, os.WriteFile(filepath.Join(src, "snapshots", "snapshot-latest", "memory"), []byte("memory"), 0644)) - require.NoError(t, os.WriteFile(filepath.Join(src, "snapshots", "snapshot-latest", "state"), []byte("state"), 0644)) - require.NoError(t, os.WriteFile(filepath.Join(src, "overlay.raw"), []byte("overlay"), 0644)) - - require.NoError(t, CopyGuestDirectoryWithOptions(src, dst, CopyOptions{ - SkipRelativePaths: map[string]struct{}{ - filepath.Join("snapshots", "snapshot-latest", "memory"): {}, - }, - })) - - assert.NoFileExists(t, filepath.Join(dst, "snapshots", "snapshot-latest", "memory")) - assert.FileExists(t, filepath.Join(dst, "snapshots", "snapshot-latest", "state")) - assert.FileExists(t, filepath.Join(dst, "overlay.raw")) -} - func TestCopyRegularFile(t *testing.T) { src := filepath.Join(t.TempDir(), "src", "memory") dst := filepath.Join(t.TempDir(), "dst", "snapshots", "snapshot-latest", "memory") diff --git a/lib/guestmemory/controller_test.go b/lib/guestmemory/controller_test.go index 0b7ea7e9..4720ec7a 100644 --- a/lib/guestmemory/controller_test.go +++ b/lib/guestmemory/controller_test.go @@ -47,11 +47,9 @@ func (s *stubHypervisor) Shutdown(ctx context.Context) error { return nil } func (s *stubHypervisor) GetVMInfo(ctx context.Context) (*hypervisor.VMInfo, error) { return &hypervisor.VMInfo{State: hypervisor.StateRunning}, nil } -func (s *stubHypervisor) Pause(ctx context.Context) error { return nil } -func (s *stubHypervisor) Resume(ctx context.Context) error { return nil } -func (s *stubHypervisor) Snapshot(ctx context.Context, destPath string, _ hypervisor.SnapshotOptions) error { - return nil -} +func (s *stubHypervisor) Pause(ctx context.Context) error { return nil } +func (s *stubHypervisor) Resume(ctx context.Context) error { return nil } +func (s *stubHypervisor) Snapshot(ctx context.Context, destPath string) error { return nil } func (s *stubHypervisor) ResizeMemory(ctx context.Context, bytes int64) error { return nil } func (s *stubHypervisor) ResizeMemoryAndWait(ctx context.Context, bytes int64, timeout time.Duration) error { return nil diff --git a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go index 8f6f2484..1d3a7f14 100644 --- a/lib/hypervisor/cloudhypervisor/cloudhypervisor.go +++ b/lib/hypervisor/cloudhypervisor/cloudhypervisor.go @@ -160,7 +160,7 @@ func (c *CloudHypervisor) Resume(ctx context.Context) error { } // Snapshot creates a VM snapshot. -func (c *CloudHypervisor) Snapshot(ctx context.Context, destPath string, _ hypervisor.SnapshotOptions) error { +func (c *CloudHypervisor) Snapshot(ctx context.Context, destPath string) error { snapshotURL := "file://" + destPath snapshotConfig := vmm.VmSnapshotConfig{DestinationUrl: &snapshotURL} resp, err := c.client.PutVmSnapshotWithResponse(ctx, snapshotConfig) diff --git a/lib/hypervisor/firecracker/config_test.go b/lib/hypervisor/firecracker/config_test.go index fcbe7c6b..c5e8dc6c 100644 --- a/lib/hypervisor/firecracker/config_test.go +++ b/lib/hypervisor/firecracker/config_test.go @@ -101,37 +101,6 @@ func TestSnapshotLoadParamsSupportsUFFDBackend(t *testing.T) { assert.Equal(t, "/tmp/pager.sock", load.MemBackend.BackendPath) } -func TestMaterializeDeferredSnapshotMemory(t *testing.T) { - t.Parallel() - - sourcePath := filepath.Join(t.TempDir(), "source-memory") - snapshotDir := filepath.Join(t.TempDir(), "snapshot-latest") - require.NoError(t, os.WriteFile(sourcePath, []byte("memory"), 0644)) - - require.NoError(t, materializeDeferredSnapshotMemory(snapshotDir, sourcePath)) - - got, err := os.ReadFile(filepath.Join(snapshotDir, "memory")) - require.NoError(t, err) - assert.Equal(t, []byte("memory"), got) -} - -func TestMaterializeDeferredSnapshotMemoryUsesRetainedSnapshotAlternate(t *testing.T) { - t.Parallel() - - root := t.TempDir() - sourcePath := filepath.Join(root, "snapshots", "snapshot-base", "memory") - alternatePath := filepath.Join(root, "snapshots", "snapshot-latest", "memory") - destPath := filepath.Join(t.TempDir(), "snapshot-latest") - require.NoError(t, os.MkdirAll(filepath.Dir(alternatePath), 0755)) - require.NoError(t, os.WriteFile(alternatePath, []byte("memory"), 0644)) - - require.NoError(t, materializeDeferredSnapshotMemory(destPath, sourcePath)) - - got, err := os.ReadFile(filepath.Join(destPath, "memory")) - require.NoError(t, err) - assert.Equal(t, []byte("memory"), got) -} - func TestToBalloonConfig(t *testing.T) { cfg := hypervisor.VMConfig{ GuestMemory: hypervisor.GuestMemoryConfig{ diff --git a/lib/hypervisor/firecracker/firecracker.go b/lib/hypervisor/firecracker/firecracker.go index c2bb7db3..3512f653 100644 --- a/lib/hypervisor/firecracker/firecracker.go +++ b/lib/hypervisor/firecracker/firecracker.go @@ -14,7 +14,6 @@ import ( "strings" "time" - "github.com/kernel/hypeman/lib/forkvm" "github.com/kernel/hypeman/lib/hypervisor" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" @@ -113,13 +112,10 @@ func (f *Firecracker) Resume(ctx context.Context) error { return nil } -func (f *Firecracker) Snapshot(ctx context.Context, destPath string, opts hypervisor.SnapshotOptions) error { +func (f *Firecracker) Snapshot(ctx context.Context, destPath string) error { if err := os.MkdirAll(destPath, 0755); err != nil { return fmt.Errorf("create snapshot directory: %w", err) } - if err := materializeDeferredSnapshotMemory(destPath, opts.DeferredMemoryBackingPath); err != nil { - return err - } params := toSnapshotCreateParams(destPath) if _, err := f.do(ctx, http.MethodPut, "/snapshot/create", params, http.StatusNoContent); err != nil { return fmt.Errorf("create snapshot: %w", err) @@ -127,62 +123,6 @@ func (f *Firecracker) Snapshot(ctx context.Context, destPath string, opts hyperv return nil } -func materializeDeferredSnapshotMemory(destPath, sourcePath string) error { - sourcePath = strings.TrimSpace(sourcePath) - if sourcePath == "" { - return nil - } - targetPath := filepath.Join(destPath, "memory") - if _, err := os.Stat(targetPath); err == nil { - return nil - } else if !os.IsNotExist(err) { - return fmt.Errorf("stat deferred snapshot memory target: %w", err) - } - resolvedSourcePath, err := resolveDeferredSnapshotMemorySourcePath(sourcePath) - if err != nil { - return err - } - if err := forkvm.CopyRegularFile(resolvedSourcePath, targetPath); err != nil { - return fmt.Errorf("materialize deferred snapshot memory: %w", err) - } - return nil -} - -func resolveDeferredSnapshotMemorySourcePath(sourcePath string) (string, error) { - if _, err := os.Stat(sourcePath); err == nil { - return sourcePath, nil - } else if !os.IsNotExist(err) { - return "", fmt.Errorf("stat deferred snapshot memory source: %w", err) - } - - alternatePath := alternateRetainedSnapshotMemoryPath(sourcePath) - if alternatePath == "" { - return sourcePath, nil - } - if _, err := os.Stat(alternatePath); err == nil { - return alternatePath, nil - } else if !os.IsNotExist(err) { - return "", fmt.Errorf("stat alternate deferred snapshot memory source: %w", err) - } - return sourcePath, nil -} - -func alternateRetainedSnapshotMemoryPath(sourcePath string) string { - if filepath.Base(sourcePath) != "memory" { - return "" - } - snapshotDir := filepath.Dir(sourcePath) - snapshotsDir := filepath.Dir(snapshotDir) - switch filepath.Base(snapshotDir) { - case "snapshot-base": - return filepath.Join(snapshotsDir, "snapshot-latest", "memory") - case "snapshot-latest": - return filepath.Join(snapshotsDir, "snapshot-base", "memory") - default: - return "" - } -} - func (f *Firecracker) ResizeMemory(ctx context.Context, bytes int64) error { return hypervisor.ErrNotSupported } diff --git a/lib/hypervisor/hypervisor.go b/lib/hypervisor/hypervisor.go index ce96fa85..24736258 100644 --- a/lib/hypervisor/hypervisor.go +++ b/lib/hypervisor/hypervisor.go @@ -138,10 +138,6 @@ type RestoreOptions struct { SnapshotMemorySessionID string } -type SnapshotOptions struct { - DeferredMemoryBackingPath string -} - // ForkNetworkConfig contains network identity fields for fork preparation. type ForkNetworkConfig struct { TAPDevice string @@ -199,7 +195,7 @@ type Hypervisor interface { // Snapshot creates a VM snapshot at the given path. // Check Capabilities().SupportsSnapshot before calling. - Snapshot(ctx context.Context, destPath string, opts SnapshotOptions) error + Snapshot(ctx context.Context, destPath string) error // ResizeMemory changes the VM's memory allocation. // Check Capabilities().SupportsHotplugMemory before calling. diff --git a/lib/hypervisor/qemu/qemu.go b/lib/hypervisor/qemu/qemu.go index 70809f90..1da70965 100644 --- a/lib/hypervisor/qemu/qemu.go +++ b/lib/hypervisor/qemu/qemu.go @@ -140,7 +140,7 @@ func (q *QEMU) Resume(ctx context.Context) error { // Snapshot creates a VM snapshot using QEMU's migrate-to-file mechanism. // The VM state is saved to destPath/memory file. // The VM config is copied to destPath for restore (QEMU requires exact arg match). -func (q *QEMU) Snapshot(ctx context.Context, destPath string, _ hypervisor.SnapshotOptions) error { +func (q *QEMU) Snapshot(ctx context.Context, destPath string) error { // QEMU uses migrate to file for snapshots // The "file:" protocol is deprecated in QEMU 7.2+, use "exec:cat > path" instead memoryFile := destPath + "/memory" diff --git a/lib/hypervisor/tracing.go b/lib/hypervisor/tracing.go index f15a25fc..0ee93ae0 100644 --- a/lib/hypervisor/tracing.go +++ b/lib/hypervisor/tracing.go @@ -217,14 +217,14 @@ func (h *tracingHypervisor) Resume(ctx context.Context) (err error) { return h.next.Resume(ctx) } -func (h *tracingHypervisor) Snapshot(ctx context.Context, destPath string, opts SnapshotOptions) (err error) { +func (h *tracingHypervisor) Snapshot(ctx context.Context, destPath string) (err error) { ctx, span := startTraceSpan(ctx, h.tracer, "hypervisor.snapshot", h.spanAttrs( attribute.String("operation", "snapshot"), )..., ) defer func() { finishTraceSpan(span, err) }() - return h.next.Snapshot(ctx, destPath, opts) + return h.next.Snapshot(ctx, destPath) } func (h *tracingHypervisor) ResizeMemory(ctx context.Context, bytes int64) (err error) { diff --git a/lib/hypervisor/tracing_test.go b/lib/hypervisor/tracing_test.go index eb8162ed..409e1bd0 100644 --- a/lib/hypervisor/tracing_test.go +++ b/lib/hypervisor/tracing_test.go @@ -24,11 +24,9 @@ func (fakeHypervisor) Shutdown(context.Context) error { return nil } func (fakeHypervisor) GetVMInfo(context.Context) (*VMInfo, error) { return &VMInfo{State: StateRunning}, nil } -func (fakeHypervisor) Pause(context.Context) error { return nil } -func (fakeHypervisor) Resume(context.Context) error { return nil } -func (fakeHypervisor) Snapshot(context.Context, string, SnapshotOptions) error { - return nil -} +func (fakeHypervisor) Pause(context.Context) error { return nil } +func (fakeHypervisor) Resume(context.Context) error { return nil } +func (fakeHypervisor) Snapshot(context.Context, string) error { return nil } func (fakeHypervisor) ResizeMemory(context.Context, int64) error { return nil } func (fakeHypervisor) ResizeMemoryAndWait(context.Context, int64, time.Duration) error { return nil @@ -43,11 +41,9 @@ func (fakeHypervisorGetVMInfoError) Shutdown(context.Context) error { return nil func (fakeHypervisorGetVMInfoError) GetVMInfo(context.Context) (*VMInfo, error) { return nil, errors.New("vm info failed") } -func (fakeHypervisorGetVMInfoError) Pause(context.Context) error { return nil } -func (fakeHypervisorGetVMInfoError) Resume(context.Context) error { return nil } -func (fakeHypervisorGetVMInfoError) Snapshot(context.Context, string, SnapshotOptions) error { - return nil -} +func (fakeHypervisorGetVMInfoError) Pause(context.Context) error { return nil } +func (fakeHypervisorGetVMInfoError) Resume(context.Context) error { return nil } +func (fakeHypervisorGetVMInfoError) Snapshot(context.Context, string) error { return nil } func (fakeHypervisorGetVMInfoError) ResizeMemory(context.Context, int64) error { return nil } diff --git a/lib/hypervisor/vz/client.go b/lib/hypervisor/vz/client.go index 81953d30..2fb0a62a 100644 --- a/lib/hypervisor/vz/client.go +++ b/lib/hypervisor/vz/client.go @@ -301,7 +301,7 @@ func (c *Client) rawVMState(ctx context.Context) (string, error) { return info.State, nil } -func (c *Client) Snapshot(ctx context.Context, destPath string, _ hypervisor.SnapshotOptions) error { +func (c *Client) Snapshot(ctx context.Context, destPath string) error { req := snapshotRequest{DestinationPath: destPath} body, err := json.Marshal(req) if err != nil { diff --git a/lib/instances/firecracker_test.go b/lib/instances/firecracker_test.go index 20fe7983..6df2e9bc 100644 --- a/lib/instances/firecracker_test.go +++ b/lib/instances/firecracker_test.go @@ -762,21 +762,24 @@ func TestFCUFFDOneShotLifecycle(t *testing.T) { writeGuestFile(t, ctx, parent, "/root/uffd-lifecycle/parent-after-uffd", "parent-disk") writeGuestFile(t, ctx, parent, "/dev/shm/uffd-lifecycle/parent-after-uffd", "parent-memory") parentSnapshotMemoryPath := filepath.Join(p.InstanceSnapshotLatest(parentID), "memory") - require.NoFileExists(t, parentSnapshotMemoryPath, "UFFD snapshot fanout should defer the memory clone while running") + parentBaseMemoryPath := filepath.Join(p.InstanceSnapshotBase(parentID), "memory") + require.FileExists(t, parentBaseMemoryPath, "UFFD snapshot fanout should clone a fork-local mem-file (retained as diff base after restore)") + requireDifferentInode(t, snapshotMemoryPath, parentBaseMemoryPath) parentMeta, err := mgr.loadMetadata(parentID) require.NoError(t, err) require.False(t, parentMeta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) - require.Equal(t, snapshotMemoryPath, parentMeta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) require.NotEmpty(t, parentMeta.StoredMetadata.FirecrackerUFFDSessionID) + require.NoError(t, mgr.DeleteSnapshot(ctx, snapshot.Id), "source snapshot must be deletable while a fork is running from it") + snapshotDeleted = true + parent, err = mgr.StandbyInstance(ctx, parentID, StandbyInstanceRequest{}) require.NoError(t, err) require.Equal(t, StateStandby, parent.State) - require.FileExists(t, parentSnapshotMemoryPath, "standby should materialize a normal file-backed snapshot from the deferred UFFD backing") + require.FileExists(t, parentSnapshotMemoryPath, "standby should write a file-backed snapshot onto the fork-local mem-file") parentMeta, err = mgr.loadMetadata(parentID) require.NoError(t, err) require.False(t, parentMeta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) - require.Empty(t, parentMeta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) require.Empty(t, parentMeta.StoredMetadata.FirecrackerUFFDSessionID) parent, err = mgr.RestoreInstance(ctx, parentID) @@ -789,7 +792,6 @@ func TestFCUFFDOneShotLifecycle(t *testing.T) { parentMeta, err = mgr.loadMetadata(parentID) require.NoError(t, err) require.False(t, parentMeta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) - require.Empty(t, parentMeta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) require.Empty(t, parentMeta.StoredMetadata.FirecrackerUFFDSessionID) require.FileExists(t, filepath.Join(p.InstanceSnapshotBase(parentID), "memory"), "file-backed resume should retain the standby snapshot as the next diff base") @@ -821,12 +823,12 @@ func TestFCUFFDOneShotLifecycle(t *testing.T) { assertGuestFileAbsent(t, ctx, parent, "/dev/shm/uffd-lifecycle/child-only") childSnapshotMemoryPath := filepath.Join(p.InstanceSnapshotLatest(childID), "memory") - require.NoFileExists(t, childSnapshotMemoryPath, "running-source child should defer the memory clone while running") + childBaseMemoryPath := filepath.Join(p.InstanceSnapshotBase(childID), "memory") + require.FileExists(t, childBaseMemoryPath, "running-source child should clone a fork-local mem-file (retained as diff base after restore)") + requireDifferentInode(t, filepath.Join(p.InstanceSnapshotBase(parentID), "memory"), childBaseMemoryPath) childMeta, err := mgr.loadMetadata(childID) require.NoError(t, err) require.False(t, childMeta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) - require.NotEmpty(t, childMeta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) - require.FileExists(t, childMeta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) require.NotEmpty(t, childMeta.StoredMetadata.FirecrackerUFFDSessionID) parent, err = mgr.StandbyInstance(ctx, parentID, StandbyInstanceRequest{}) @@ -836,17 +838,15 @@ func TestFCUFFDOneShotLifecycle(t *testing.T) { require.FileExists(t, parentSnapshotMemoryPath) parentMeta, err = mgr.loadMetadata(parentID) require.NoError(t, err) - require.Empty(t, parentMeta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) require.Empty(t, parentMeta.StoredMetadata.FirecrackerUFFDSessionID) child, err = mgr.StandbyInstance(ctx, childID, StandbyInstanceRequest{}) require.NoError(t, err) require.Equal(t, StateStandby, child.State) - require.FileExists(t, childSnapshotMemoryPath, "child standby should materialize from its deferred UFFD backing") + require.FileExists(t, childSnapshotMemoryPath, "child standby should write a file-backed snapshot onto the fork-local mem-file") childMeta, err = mgr.loadMetadata(childID) require.NoError(t, err) require.False(t, childMeta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) - require.Empty(t, childMeta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) require.Empty(t, childMeta.StoredMetadata.FirecrackerUFFDSessionID) child, err = mgr.RestoreInstance(ctx, childID) @@ -911,6 +911,15 @@ func requireGuestTmpfs(t *testing.T, ctx context.Context, inst *Instance) { require.Equal(t, 0, exitCode, output) } +func requireDifferentInode(t *testing.T, pathA, pathB string) { + t.Helper() + infoA, err := os.Stat(pathA) + require.NoError(t, err) + infoB, err := os.Stat(pathB) + require.NoError(t, err) + require.False(t, os.SameFile(infoA, infoB), "%s and %s must not share an inode", pathA, pathB) +} + func writeGuestFile(t *testing.T, ctx context.Context, inst *Instance, path, contents string) { t.Helper() output, exitCode, err := execCommand(ctx, inst, "sh", "-c", "mkdir -p \"$1\" && printf '%s' \"$2\" > \"$3\" && sync", "sh", filepath.Dir(path), contents, path) diff --git a/lib/instances/firecracker_uffd.go b/lib/instances/firecracker_uffd.go index 18ceb137..667e4e5d 100644 --- a/lib/instances/firecracker_uffd.go +++ b/lib/instances/firecracker_uffd.go @@ -8,7 +8,6 @@ import ( "os" "path/filepath" "strings" - "sync" "syscall" "time" @@ -29,10 +28,6 @@ func useFirecrackerUFFDOnNextRestore(hvType hypervisor.Type, sourceIsStandby boo return hvType == hypervisor.TypeFirecracker && sourceIsStandby && targetState != StateStopped } -func (m *manager) shouldDeferFirecrackerSnapshotMemoryCopy(stored *StoredMetadata, sourceIsStandby bool, targetState State) bool { - return m.useFirecrackerUFFD(stored) && useFirecrackerUFFDOnNextRestore(stored.HypervisorType, sourceIsStandby, targetState) -} - func clearFirecrackerUFFDRestoreState(stored *StoredMetadata) { if stored == nil { return @@ -40,87 +35,12 @@ func clearFirecrackerUFFDRestoreState(stored *StoredMetadata) { stored.FirecrackerUseUFFDOnNextRestore = false stored.FirecrackerUFFDSessionID = "" stored.FirecrackerUFFDPagerVersion = "" - stored.FirecrackerDeferredSnapshotMemoryPath = "" } func firecrackerSnapshotMemoryPathInGuestDir(guestDir string) string { return filepath.Join(guestDir, firecrackerSnapshotMemoryRelPath) } -func (m *manager) lockFirecrackerSnapshotSource(path string) func() { - key := firecrackerSnapshotSourceLockKey(path) - if key == "" { - return func() {} - } - value, _ := m.snapshotSourceLocks.LoadOrStore(key, &sync.Mutex{}) - mu := value.(*sync.Mutex) - mu.Lock() - return mu.Unlock -} - -func firecrackerSnapshotSourceLockKey(path string) string { - path = filepath.Clean(strings.TrimSpace(path)) - if path == "." || path == "" { - return "" - } - if filepath.Base(path) != "memory" { - return path - } - snapshotDir := filepath.Dir(path) - if base := filepath.Base(snapshotDir); base != "snapshot-latest" && base != "snapshot-base" { - return path - } - snapshotsDir := filepath.Dir(snapshotDir) - if filepath.Base(snapshotsDir) != "snapshots" { - return path - } - return filepath.Dir(snapshotsDir) -} - -func resolveFirecrackerSnapshotMemoryBackingPath(memoryPath string) (string, error) { - if _, err := os.Stat(memoryPath); err == nil { - return memoryPath, nil - } else if !os.IsNotExist(err) { - return "", fmt.Errorf("stat firecracker snapshot memory backing: %w", err) - } - - alternatePath := alternateFirecrackerRetainedSnapshotMemoryPath(memoryPath) - if alternatePath == "" { - return memoryPath, nil - } - if _, err := os.Stat(alternatePath); err == nil { - return alternatePath, nil - } else if !os.IsNotExist(err) { - return "", fmt.Errorf("stat alternate firecracker snapshot memory backing: %w", err) - } - return memoryPath, nil -} - -func alternateFirecrackerRetainedSnapshotMemoryPath(memoryPath string) string { - if filepath.Base(memoryPath) != "memory" { - return "" - } - snapshotDir := filepath.Dir(memoryPath) - snapshotsDir := filepath.Dir(snapshotDir) - switch filepath.Base(snapshotDir) { - case "snapshot-base": - return filepath.Join(snapshotsDir, "snapshot-latest", "memory") - case "snapshot-latest": - return filepath.Join(snapshotsDir, "snapshot-base", "memory") - default: - return "" - } -} - -func firecrackerDeferredSnapshotMemoryPath(stored *StoredMetadata, guestDir string) string { - if stored != nil { - if path := strings.TrimSpace(stored.FirecrackerDeferredSnapshotMemoryPath); path != "" { - return path - } - } - return firecrackerSnapshotMemoryPathInGuestDir(guestDir) -} - func (m *manager) firecrackerSnapshotRestoreOptions(stored *StoredMetadata, snapshotDir string) (hypervisor.RestoreOptions, error) { opts := hypervisor.RestoreOptions{SnapshotMemoryBackend: hypervisor.SnapshotMemoryBackendFile} if stored == nil || stored.HypervisorType != hypervisor.TypeFirecracker { @@ -134,14 +54,7 @@ func (m *manager) firecrackerSnapshotRestoreOptions(stored *StoredMetadata, snap return opts, fmt.Errorf("firecracker uffd snapshot restore is enabled but the pager is not configured") } - backingMemoryPath := strings.TrimSpace(stored.FirecrackerDeferredSnapshotMemoryPath) - if backingMemoryPath == "" { - backingMemoryPath = filepath.Join(snapshotDir, "memory") - } - backingMemoryPath, err := resolveFirecrackerSnapshotMemoryBackingPath(backingMemoryPath) - if err != nil { - return opts, err - } + backingMemoryPath := filepath.Join(snapshotDir, "memory") cacheKey := strings.TrimSpace(stored.FirecrackerSnapshotCacheKey) if cacheKey == "" { var err error diff --git a/lib/instances/firecracker_uffd_test.go b/lib/instances/firecracker_uffd_test.go index dc5ed57d..47d59371 100644 --- a/lib/instances/firecracker_uffd_test.go +++ b/lib/instances/firecracker_uffd_test.go @@ -101,10 +101,13 @@ func TestForkInstanceFromStandbyArmsOneShotUFFDForFirecrackerRestore(t *testing. require.NoError(t, err) assert.True(t, meta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) assert.Equal(t, "shared-template-cache", meta.StoredMetadata.FirecrackerSnapshotCacheKey) - assert.Equal(t, sourceMemoryPath, meta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) assert.Empty(t, meta.StoredMetadata.FirecrackerUFFDSessionID) assert.Empty(t, meta.StoredMetadata.FirecrackerUFFDPagerVersion) - assert.NoFileExists(t, firecrackerSnapshotMemoryPathInGuestDir(mgr.paths.InstanceDir(forked.Id))) + forkMemoryPath := firecrackerSnapshotMemoryPathInGuestDir(mgr.paths.InstanceDir(forked.Id)) + forkMemory, err := os.ReadFile(forkMemoryPath) + require.NoError(t, err) + assert.Equal(t, []byte("source memory"), forkMemory, "fork must own a local copy of the source mem-file") + assertDifferentInode(t, sourceMemoryPath, forkMemoryPath) assert.FileExists(t, filepath.Join(mgr.paths.InstanceSnapshotLatest(forked.Id), "state")) } @@ -135,7 +138,6 @@ func TestForkInstanceFromStandbyDoesNotArmOneShotUFFDForStoppedTarget(t *testing require.NoError(t, err) assert.False(t, meta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) assert.Equal(t, "shared-template-cache", meta.StoredMetadata.FirecrackerSnapshotCacheKey) - assert.Empty(t, meta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) assert.FileExists(t, firecrackerSnapshotMemoryPathInGuestDir(mgr.paths.InstanceDir(forked.Id))) } @@ -171,10 +173,13 @@ func TestForkSnapshotFromStandbyArmsOneShotUFFDForFirecrackerRestore(t *testing. require.NoError(t, err) assert.True(t, meta.StoredMetadata.FirecrackerUseUFFDOnNextRestore) assert.Equal(t, "shared-snapshot-cache", meta.StoredMetadata.FirecrackerSnapshotCacheKey) - assert.Equal(t, firecrackerSnapshotMemoryPathInGuestDir(mgr.paths.SnapshotGuestDir(snap.Id)), meta.StoredMetadata.FirecrackerDeferredSnapshotMemoryPath) assert.Empty(t, meta.StoredMetadata.FirecrackerUFFDSessionID) assert.Empty(t, meta.StoredMetadata.FirecrackerUFFDPagerVersion) - assert.NoFileExists(t, firecrackerSnapshotMemoryPathInGuestDir(mgr.paths.InstanceDir(forked.Id))) + forkMemoryPath := firecrackerSnapshotMemoryPathInGuestDir(mgr.paths.InstanceDir(forked.Id)) + forkMemory, err := os.ReadFile(forkMemoryPath) + require.NoError(t, err) + assert.Equal(t, []byte("source memory"), forkMemory, "snapshot fork must own a local copy of the snapshot mem-file") + assertDifferentInode(t, firecrackerSnapshotMemoryPathInGuestDir(mgr.paths.SnapshotGuestDir(snap.Id)), forkMemoryPath) assert.FileExists(t, filepath.Join(mgr.paths.InstanceSnapshotLatest(forked.Id), "state")) } @@ -235,26 +240,13 @@ func TestRestoreInstanceClearsOneShotUFFDFlagAfterSuccessfulRestore(t *testing.T assert.False(t, updated.StoredMetadata.FirecrackerUseUFFDOnNextRestore) } -func TestResolveFirecrackerSnapshotMemoryBackingPathUsesRetainedBaseAlternate(t *testing.T) { - t.Parallel() - - root := t.TempDir() - latestMemoryPath := filepath.Join(root, "snapshots", "snapshot-latest", "memory") - baseMemoryPath := filepath.Join(root, "snapshots", "snapshot-base", "memory") - require.NoError(t, os.MkdirAll(filepath.Dir(baseMemoryPath), 0755)) - require.NoError(t, os.WriteFile(baseMemoryPath, []byte("base memory"), 0644)) - - resolved, err := resolveFirecrackerSnapshotMemoryBackingPath(latestMemoryPath) +func assertDifferentInode(t *testing.T, pathA, pathB string) { + t.Helper() + infoA, err := os.Stat(pathA) require.NoError(t, err) - assert.Equal(t, baseMemoryPath, resolved) -} - -func TestFirecrackerSnapshotSourceLockKeyUsesGuestDirectory(t *testing.T) { - t.Parallel() - - guestDir := filepath.Join(t.TempDir(), "guests", "source") - assert.Equal(t, guestDir, firecrackerSnapshotSourceLockKey(filepath.Join(guestDir, "snapshots", "snapshot-latest", "memory"))) - assert.Equal(t, guestDir, firecrackerSnapshotSourceLockKey(filepath.Join(guestDir, "snapshots", "snapshot-base", "memory"))) + infoB, err := os.Stat(pathB) + require.NoError(t, err) + assert.False(t, os.SameFile(infoA, infoB), "%s and %s must not share an inode", pathA, pathB) } func installOneShotFirecrackerStarter(t *testing.T, mgr *manager) { @@ -318,7 +310,7 @@ func (oneShotFirecrackerTestHypervisor) Resume(context.Context) error { return nil } -func (oneShotFirecrackerTestHypervisor) Snapshot(context.Context, string, hypervisor.SnapshotOptions) error { +func (oneShotFirecrackerTestHypervisor) Snapshot(context.Context, string) error { return nil } diff --git a/lib/instances/fork.go b/lib/instances/fork.go index 30b0d116..09aa4d94 100644 --- a/lib/instances/fork.go +++ b/lib/instances/fork.go @@ -7,7 +7,6 @@ import ( "fmt" "hash/crc32" "os" - "path/filepath" "strings" "time" @@ -102,14 +101,6 @@ func (m *manager) forkInstance(ctx context.Context, id string, req ForkInstanceR } return nil, "", false, fmt.Errorf("restore source instance after fork: %w", restoreErr) } - if forkErr == nil && targetState != StateStopped { - if err := m.repointForkDeferredSnapshotMemoryToSourceBase(forked.Id, id); err != nil { - forkErr = fmt.Errorf("update fork deferred snapshot memory backing: %w", err) - if cleanupErr := m.cleanupForkInstanceOnError(ctx, forked.Id); cleanupErr != nil { - forkErr = fmt.Errorf("%v; additionally failed to cleanup forked instance %s: %v", forkErr, forked.Id, cleanupErr) - } - } - } if restoredSource != nil && !restoredSource.NetworkEnabled && (restoredSource.State == StateRunning || restoredSource.State == StateInitializing) { if err := ensureGuestAgentReadyForForkPhase(ctx, &restoredSource.StoredMetadata, "after restoring running fork source"); err != nil { if forkErr != nil { @@ -211,29 +202,6 @@ func generateForkSourceVsockCID(sourceID, forkID string, current int64) int64 { return cid } -func (m *manager) repointForkDeferredSnapshotMemoryToSourceBase(forkID, sourceID string) error { - meta, err := m.loadMetadata(forkID) - if err != nil { - return err - } - stored := &meta.StoredMetadata - if stored.HypervisorType != hypervisor.TypeFirecracker || stored.FirecrackerDeferredSnapshotMemoryPath == "" { - return nil - } - - sourceLatestMemoryPath := firecrackerSnapshotMemoryPathInGuestDir(m.paths.InstanceDir(sourceID)) - if stored.FirecrackerDeferredSnapshotMemoryPath != sourceLatestMemoryPath { - return nil - } - - sourceBaseMemoryPath := filepath.Join(m.paths.InstanceSnapshotBase(sourceID), "memory") - if _, err := os.Stat(sourceBaseMemoryPath); err != nil { - return fmt.Errorf("stat source retained base memory %q: %w", sourceBaseMemoryPath, err) - } - stored.FirecrackerDeferredSnapshotMemoryPath = sourceBaseMemoryPath - return m.saveMetadata(meta) -} - func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id string, req ForkInstanceRequest, supportValidated bool) (*Instance, bool, error) { log := logger.FromContext(ctx) @@ -289,28 +257,13 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin srcDir := m.paths.InstanceDir(id) dstDir := m.paths.InstanceDir(forkID) - deferredSnapshotMemoryPath := "" - if m.shouldDeferFirecrackerSnapshotMemoryCopy(stored, source.State == StateStandby, targetState) { - deferredSnapshotMemoryPath = firecrackerDeferredSnapshotMemoryPath(stored, srcDir) - } - unlockSnapshotSource := func() {} - snapshotSourceLocked := false - if deferredSnapshotMemoryPath != "" { - unlockSnapshotSource = m.lockFirecrackerSnapshotSource(deferredSnapshotMemoryPath) - snapshotSourceLocked = true - } - defer func() { - if snapshotSourceLocked { - unlockSnapshotSource() - } - }() cu := cleanup.Make(func() { _ = os.RemoveAll(dstDir) }) defer cu.Clean() - if err := m.copyForkSourceGuestDirectory(ctx, source.State, id, stored, srcDir, dstDir, deferredSnapshotMemoryPath); err != nil { + if err := m.copyForkSourceGuestDirectory(ctx, source.State, id, stored, srcDir, dstDir); err != nil { return nil, false, err } @@ -336,10 +289,8 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin forkMeta.FirecrackerUFFDSessionID = "" forkMeta.FirecrackerUFFDPagerVersion = "" forkMeta.FirecrackerUseUFFDOnNextRestore = useFirecrackerUFFDOnNextRestore(forkMeta.HypervisorType, source.State == StateStandby, targetState) - forkMeta.FirecrackerDeferredSnapshotMemoryPath = deferredSnapshotMemoryPath if source.State != StateStandby { forkMeta.FirecrackerSnapshotCacheKey = "" - forkMeta.FirecrackerDeferredSnapshotMemoryPath = "" } // Forks are new instances; phase accounting must not inherit the source's // cumulative durations. The first transition into the fork's runtime @@ -402,10 +353,6 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin } restoreNeedsSourceLock = prepareResult.RequiresSnapshotSourceAlias } - if snapshotSourceLocked { - unlockSnapshotSource() - snapshotSourceLocked = false - } newMeta := &metadata{StoredMetadata: forkMeta} if err := m.saveForkMetadata(ctx, newMeta); err != nil { diff --git a/lib/instances/lifecycle_noop_test.go b/lib/instances/lifecycle_noop_test.go index b430b567..5ca7515f 100644 --- a/lib/instances/lifecycle_noop_test.go +++ b/lib/instances/lifecycle_noop_test.go @@ -40,7 +40,7 @@ func (h lifecycleNoopHypervisor) GetVMInfo(context.Context) (*hypervisor.VMInfo, } func (h lifecycleNoopHypervisor) Pause(context.Context) error { return nil } func (h lifecycleNoopHypervisor) Resume(context.Context) error { return nil } -func (h lifecycleNoopHypervisor) Snapshot(context.Context, string, hypervisor.SnapshotOptions) error { +func (h lifecycleNoopHypervisor) Snapshot(context.Context, string) error { return nil } func (h lifecycleNoopHypervisor) ResizeMemory(context.Context, int64) error { diff --git a/lib/instances/manager.go b/lib/instances/manager.go index b1bf7578..8e8e25f3 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -171,7 +171,6 @@ type manager struct { resourceValidator ResourceValidator // Optional validator for aggregate resource limits instanceLocks sync.Map // map[string]*sync.RWMutex - per-instance locks forkMetadataMu sync.Mutex - snapshotSourceLocks sync.Map // map[string]*sync.Mutex - Firecracker snapshot source alias locks, retained for process lifetime bootMarkerScans sync.Map // map[string]time.Time next allowed boot-marker rescan hypervisorStateCache sync.Map // map[string]hypervisorStateCacheEntry - last observed hypervisor state per instance hostTopology *HostTopology // Cached host CPU topology diff --git a/lib/instances/restore.go b/lib/instances/restore.go index f41c7725..dfe5e6f5 100644 --- a/lib/instances/restore.go +++ b/lib/instances/restore.go @@ -380,8 +380,8 @@ func (m *manager) restoreInstance( // before markers ever hydrated we resume in Initializing. resumePhase, _ := runningPhaseFromMarkers(stored) stored.Phases.Record(resumePhase, time.Now().UTC()) - // The one-shot restore has been consumed, but a UFFD-backed VM may still - // need its pager session until the next standby materializes snapshot memory. + // The one-shot restore has been consumed, but a UFFD-backed VM keeps its + // pager session serving faults until the next standby or stop closes it. stored.FirecrackerUseUFFDOnNextRestore = false meta = &metadata{StoredMetadata: *stored} if err := m.saveMetadata(meta); err != nil { diff --git a/lib/instances/snapshot.go b/lib/instances/snapshot.go index 8a97ace8..4c5236d0 100644 --- a/lib/instances/snapshot.go +++ b/lib/instances/snapshot.go @@ -417,25 +417,7 @@ func (m *manager) forkSnapshot(ctx context.Context, snapshotID string, req ForkS if target != nil && target.State == compressionJobStateRunning { m.recordSnapshotCompressionPreemption(ctx, snapshotCompressionPreemptionForkSnapshot, target.Target) } - snapshotGuestDir := m.paths.SnapshotGuestDir(snapshotID) - deferredSource := rec.StoredMetadata - deferredSource.HypervisorType = targetHypervisor - deferredSnapshotMemoryPath := "" - if m.shouldDeferFirecrackerSnapshotMemoryCopy(&deferredSource, rec.Snapshot.Kind == SnapshotKindStandby, targetState) { - deferredSnapshotMemoryPath = firecrackerDeferredSnapshotMemoryPath(&rec.StoredMetadata, snapshotGuestDir) - } - unlockSnapshotSource := func() {} - snapshotSourceLocked := false - if deferredSnapshotMemoryPath != "" { - unlockSnapshotSource = m.lockFirecrackerSnapshotSource(deferredSnapshotMemoryPath) - snapshotSourceLocked = true - } - defer func() { - if snapshotSourceLocked { - unlockSnapshotSource() - } - }() - if err := m.copySnapshotGuestDirectoryForFork(ctx, snapshotID, rec.StoredMetadata.HypervisorType, dstDir, deferredSnapshotMemoryPath); err != nil { + if err := m.copySnapshotGuestDirectoryForFork(ctx, snapshotID, rec.StoredMetadata.HypervisorType, dstDir); err != nil { return nil, err } @@ -469,10 +451,8 @@ func (m *manager) forkSnapshot(ctx context.Context, snapshotID string, req ForkS forkMeta.FirecrackerUFFDSessionID = "" forkMeta.FirecrackerUFFDPagerVersion = "" forkMeta.FirecrackerUseUFFDOnNextRestore = useFirecrackerUFFDOnNextRestore(targetHypervisor, rec.Snapshot.Kind == SnapshotKindStandby, targetState) - forkMeta.FirecrackerDeferredSnapshotMemoryPath = deferredSnapshotMemoryPath if rec.Snapshot.Kind != SnapshotKindStandby { forkMeta.FirecrackerSnapshotCacheKey = "" - forkMeta.FirecrackerDeferredSnapshotMemoryPath = "" } if rec.Snapshot.Kind == SnapshotKindStandby { forkMeta.VsockCID = rec.StoredMetadata.VsockCID @@ -513,10 +493,6 @@ func (m *manager) forkSnapshot(ctx context.Context, snapshotID string, req ForkS "target_data_dir", forkMeta.DataDir) } } - if snapshotSourceLocked { - unlockSnapshotSource() - snapshotSourceLocked = false - } if err := m.saveMetadata(&metadata{StoredMetadata: forkMeta}); err != nil { return nil, fmt.Errorf("save fork metadata: %w", err) diff --git a/lib/instances/snapshot_alias_lock.go b/lib/instances/snapshot_alias_lock.go index 00180e59..58df5f28 100644 --- a/lib/instances/snapshot_alias_lock.go +++ b/lib/instances/snapshot_alias_lock.go @@ -40,13 +40,12 @@ func copyGuestDirectoryWithAliasReadLock(srcDir, dstDir string) error { }) } -func (m *manager) copyForkSourceGuestDirectory(ctx context.Context, sourceState State, sourceID string, stored *StoredMetadata, srcDir, dstDir, deferredSnapshotMemoryPath string) error { +func (m *manager) copyForkSourceGuestDirectory(ctx context.Context, sourceState State, sourceID string, stored *StoredMetadata, srcDir, dstDir string) error { ctx, span := m.tracerOrDefault().Start(ctx, "instances.fork.copy_guest_directory", trace.WithAttributes( attribute.String("operation", "fork_copy_guest_directory"), attribute.String("instance_id", sourceID), attribute.String("source_state", string(sourceState)), - attribute.Bool("deferred_snapshot_memory", deferredSnapshotMemoryPath != ""), ), ) var retErr error @@ -66,18 +65,12 @@ func (m *manager) copyForkSourceGuestDirectory(ctx context.Context, sourceState readyDone(nil) } - copyOptions := forkvm.CopyOptions{} - if deferredSnapshotMemoryPath != "" { - copyOptions.SkipRelativePaths = map[string]struct{}{firecrackerSnapshotMemoryRelPath: {}} - } - _, cloneDone := m.startLifecycleStep(ctx, "instances.fork.copy_guest_directory.clone", attribute.String("operation", "fork_copy_guest_directory_clone"), attribute.String("instance_id", sourceID), - attribute.Bool("deferred_snapshot_memory", deferredSnapshotMemoryPath != ""), ) retErr = withSnapshotSourceAliasReadLock(func() error { - if err := forkvm.CopyGuestDirectoryWithOptions(srcDir, dstDir, copyOptions); err != nil { + if err := forkvm.CopyGuestDirectory(srcDir, dstDir); err != nil { if errors.Is(err, forkvm.ErrSparseCopyUnsupported) { return fmt.Errorf("fork requires sparse-capable filesystem (SEEK_DATA/SEEK_HOLE unsupported): %w", err) } @@ -89,13 +82,12 @@ func (m *manager) copyForkSourceGuestDirectory(ctx context.Context, sourceState return retErr } -func (m *manager) copySnapshotGuestDirectoryForFork(ctx context.Context, snapshotID string, hvType hypervisor.Type, dstDir, deferredSnapshotMemoryPath string) error { +func (m *manager) copySnapshotGuestDirectoryForFork(ctx context.Context, snapshotID string, hvType hypervisor.Type, dstDir string) error { ctx, span := m.tracerOrDefault().Start(ctx, "instances.snapshot.copy_guest_directory", trace.WithAttributes( attribute.String("operation", "snapshot_copy_guest_directory"), attribute.String("snapshot_id", snapshotID), attribute.String("hypervisor", string(hvType)), - attribute.Bool("deferred_snapshot_memory", deferredSnapshotMemoryPath != ""), ), ) var retErr error @@ -113,18 +105,12 @@ func (m *manager) copySnapshotGuestDirectoryForFork(ctx context.Context, snapsho } readyDone(nil) - copyOptions := forkvm.CopyOptions{} - if deferredSnapshotMemoryPath != "" { - copyOptions.SkipRelativePaths = map[string]struct{}{firecrackerSnapshotMemoryRelPath: {}} - } - _, cloneDone := m.startLifecycleStep(ctx, "instances.snapshot.copy_guest_directory.clone", attribute.String("operation", "snapshot_copy_guest_directory_clone"), attribute.String("snapshot_id", snapshotID), - attribute.Bool("deferred_snapshot_memory", deferredSnapshotMemoryPath != ""), ) retErr = withSnapshotSourceAliasReadLock(func() error { - if err := forkvm.CopyGuestDirectoryWithOptions(m.paths.SnapshotGuestDir(snapshotID), dstDir, copyOptions); err != nil { + if err := forkvm.CopyGuestDirectory(m.paths.SnapshotGuestDir(snapshotID), dstDir); err != nil { if errors.Is(err, forkvm.ErrSparseCopyUnsupported) { return fmt.Errorf("fork from snapshot requires sparse-capable filesystem (SEEK_DATA/SEEK_HOLE unsupported): %w", err) } diff --git a/lib/instances/standby.go b/lib/instances/standby.go index af683a3e..3589564c 100644 --- a/lib/instances/standby.go +++ b/lib/instances/standby.go @@ -146,11 +146,7 @@ func (m *manager) standbyInstance( attribute.String("operation", "create_snapshot"), attribute.Bool("reuse_snapshot_base", reuseSnapshotBase), ) - snapshotOptions := hypervisor.SnapshotOptions{} - if stored.HypervisorType == hypervisor.TypeFirecracker { - snapshotOptions.DeferredMemoryBackingPath = stored.FirecrackerDeferredSnapshotMemoryPath - } - if err := createSnapshot(snapshotCtx, hv, snapshotDir, reuseSnapshotBase, snapshotOptions); err != nil { + if err := createSnapshot(snapshotCtx, hv, snapshotDir, reuseSnapshotBase); err != nil { snapshotSpanEnd(err) // Snapshot failed - try to resume VM log.ErrorContext(ctx, "snapshot failed, attempting to resume VM", "instance_id", id, "error", err) @@ -279,7 +275,7 @@ func (m *manager) standbyInstance( } // createSnapshot creates a snapshot using the hypervisor interface -func createSnapshot(ctx context.Context, hv hypervisor.Hypervisor, snapshotDir string, reuseSnapshotBase bool, opts hypervisor.SnapshotOptions) error { +func createSnapshot(ctx context.Context, hv hypervisor.Hypervisor, snapshotDir string, reuseSnapshotBase bool) error { log := logger.FromContext(ctx) // Remove old snapshot if the hypervisor does not support reusing snapshots @@ -295,7 +291,7 @@ func createSnapshot(ctx context.Context, hv hypervisor.Hypervisor, snapshotDir s // Create snapshot via hypervisor API log.DebugContext(ctx, "invoking hypervisor snapshot API", "snapshot_dir", snapshotDir) - if err := hv.Snapshot(ctx, snapshotDir, opts); err != nil { + if err := hv.Snapshot(ctx, snapshotDir); err != nil { return fmt.Errorf("snapshot: %w", err) } diff --git a/lib/instances/types.go b/lib/instances/types.go index 6ff42406..1ff97575 100644 --- a/lib/instances/types.go +++ b/lib/instances/types.go @@ -131,11 +131,10 @@ type StoredMetadata struct { HypervisorPID *int // Hypervisor process ID (may be stale after host restart) // Firecracker UFFD snapshot restore metadata. - FirecrackerSnapshotCacheKey string - FirecrackerUseUFFDOnNextRestore bool - FirecrackerUFFDSessionID string - FirecrackerUFFDPagerVersion string - FirecrackerDeferredSnapshotMemoryPath string + FirecrackerSnapshotCacheKey string + FirecrackerUseUFFDOnNextRestore bool + FirecrackerUFFDSessionID string + FirecrackerUFFDPagerVersion string // Paths SocketPath string // Path to API socket From 2bae64b444ac327a588d33384f171c2b2e6215f2 Mon Sep 17 00:00:00 2001 From: sjmiller609 <7516283+sjmiller609@users.noreply.github.com> Date: Thu, 2 Jul 2026 13:31:35 +0000 Subject: [PATCH 2/2] Bump UFFD pager version for restore wiring change --- lib/uffdpager/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/uffdpager/VERSION b/lib/uffdpager/VERSION index b1e80bb2..845639ee 100644 --- a/lib/uffdpager/VERSION +++ b/lib/uffdpager/VERSION @@ -1 +1 @@ -0.1.3 +0.1.4