From fbd6b5d0430f483cb966da18c7b2dc3bb040762f Mon Sep 17 00:00:00 2001
From: Marko Vejnovic <marko@harmont.dev>
Date: Tue, 9 Jun 2026 09:24:29 -0700
Subject: [PATCH 01/15] test(hm-vm): add test proving HmVm injects workspace
 for snapshot-sourced steps

---
 crates/hm-vm/src/vm.rs | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/crates/hm-vm/src/vm.rs b/crates/hm-vm/src/vm.rs
index de8800ba..2af2982f 100644
--- a/crates/hm-vm/src/vm.rs
+++ b/crates/hm-vm/src/vm.rs
@@ -393,4 +393,40 @@ mod tests {
         // Registry should still be empty.
         assert!(hm.registry.is_empty());
     }
+
+    #[tokio::test]
+    async fn cache_miss_from_snapshot_still_injects_workspace() {
+        let backend = MockBackend::new(0, false);
+        let (registry, _dir) = open_temp_registry(10);
+        let hm = HmVm::new(Arc::new(backend.clone()), registry, VmConfig::default());
+
+        let mut action = make_action();
+        // Simulate child step: source is a snapshot, not an image.
+        action.source = ImageSource::Snapshot(SnapshotId("parent-snap".into()));
+
+        let result = hm
+            .execute(
+                action,
+                CachingPolicy::Cache {
+                    key: "child-step".into(),
+                },
+                &NullSink,
+            )
+            .await
+            .expect("execute should succeed");
+
+        assert_eq!(result.exit_code, 0);
+        assert!(!result.cached);
+
+        let log = calls(&backend);
+        // Must restore from snapshot (not create from image).
+        assert!(log.iter().any(|c| c.starts_with("restore:parent-snap")));
+        // Must still inject workspace even though source is a snapshot.
+        assert!(
+            log.iter().any(|c| c.starts_with("inject:")),
+            "workspace injection skipped for snapshot-sourced step: {log:?}"
+        );
+        assert!(log.iter().any(|c| c.starts_with("exec:")));
+        assert!(log.iter().any(|c| c.starts_with("snapshot:")));
+    }
 }

From 0856af00c591aa88b0767a17279e4f09c7738431 Mon Sep 17 00:00:00 2001
From: Marko Vejnovic <marko@harmont.dev>
Date: Tue, 9 Jun 2026 09:26:00 -0700
Subject: [PATCH 02/15] fix: always inject workspace into every executing step
 (CLI-28)

Child steps that boot from a cached parent snapshot now receive
fresh source files via tar injection, preventing stale workspace
data from leaking across builds.
---
 crates/hm-exec/src/local/runner/vm.rs | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/crates/hm-exec/src/local/runner/vm.rs b/crates/hm-exec/src/local/runner/vm.rs
index 37fdf149..896d189b 100644
--- a/crates/hm-exec/src/local/runner/vm.rs
+++ b/crates/hm-exec/src/local/runner/vm.rs
@@ -72,20 +72,15 @@ async fn run_step_vm(vm: &HmVm, ctx: &StepContext, input: ExecutorInput) -> Resu
         )
     };
 
-    // Only inject workspace for root steps (no parent snapshot).
-    // Child steps inherit workspace from the parent via COW snapshot.
-    let (inject, _temp_guard) = if input.parent_snapshot.is_none() {
-        let archive_bytes = ctx
-            .archives
-            .get_bytes(input.workspace_archive_id)
-            .ok_or_else(|| anyhow::anyhow!("source archive not found"))?;
-        let dir =
-            extract_archive_to_tempdir(&archive_bytes).context("extracting workspace archive")?;
-        let path = dir.path().to_path_buf();
-        (Some(path), Some(dir))
-    } else {
-        (None, None)
-    };
+    // Always inject workspace so every executing step sees fresh source
+    // files, even when booting from a cached parent snapshot (CLI-28).
+    let archive_bytes = ctx
+        .archives
+        .get_bytes(input.workspace_archive_id)
+        .ok_or_else(|| anyhow::anyhow!("source archive not found"))?;
+    let temp_guard =
+        extract_archive_to_tempdir(&archive_bytes).context("extracting workspace archive")?;
+    let inject = Some(temp_guard.path().to_path_buf());
 
     // Baseline env for shell operation inside VMs.
     let mut env: Vec<(String, String)> = vec![

From c9ee5a1961cb89bdc75931ae403b497178b69b2d Mon Sep 17 00:00:00 2001
From: Marko Vejnovic <marko@harmont.dev>
Date: Tue, 9 Jun 2026 09:27:15 -0700
Subject: [PATCH 03/15] test: add deep cache chain workspace freshness test
 (CLI-28)

---
 crates/hm/tests/deep_cache_chain_workspace.rs | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 crates/hm/tests/deep_cache_chain_workspace.rs

diff --git a/crates/hm/tests/deep_cache_chain_workspace.rs b/crates/hm/tests/deep_cache_chain_workspace.rs
new file mode 100644
index 00000000..2e48468d
--- /dev/null
+++ b/crates/hm/tests/deep_cache_chain_workspace.rs
@@ -0,0 +1,71 @@
+#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)]
+//! Verify workspace freshness across a deep cache chain.
+//!
+//! Pipeline: A (forever) → B (forever) → C (uncached, reads marker).
+//! Two runs with different marker contents: C must see the updated
+//! marker even when both A and B are cache hits.
+//!
+//! Requires Docker. `cargo test --test deep_cache_chain_workspace -- --ignored`
+
+use std::fs;
+use std::path::Path;
+use std::process::Command;
+
+use tempfile::tempdir;
+
+fn write_pipeline(dir: &Path, marker: &str) {
+    let hm = dir.join(".hm");
+    fs::create_dir_all(&hm).expect("mkdir .hm");
+    fs::write(dir.join("marker.txt"), marker).expect("marker.txt");
+    fs::write(
+        hm.join("pipeline.py"),
+        r#"
+import harmont as hm
+
+def build():
+    a = hm.scratch().run("echo step-a", label="a", cache=hm.forever())
+    b = a.run("echo step-b", label="b", cache=hm.forever())
+    c = b.run("cat /workspace/marker.txt", label="c")
+    return hm.pipeline(c, default_image="alpine:3.20")
+"#,
+    )
+    .expect("pipeline.py");
+}
+
+fn run_hm(repo: &Path) -> String {
+    let bin = env!("CARGO_BIN_EXE_hm");
+    let out = Command::new(bin)
+        .args(["run"])
+        .current_dir(repo)
+        .output()
+        .expect("spawn hm");
+    let stdout = String::from_utf8_lossy(&out.stdout).into_owned();
+    let stderr = String::from_utf8_lossy(&out.stderr).into_owned();
+    assert!(
+        out.status.success(),
+        "hm run failed.\nstdout:\n{stdout}\nstderr:\n{stderr}"
+    );
+    stderr
+}
+
+#[test]
+#[ignore = "requires Docker"]
+fn deep_chain_child_sees_fresh_workspace() {
+    let dir = tempdir().expect("tempdir");
+
+    write_pipeline(dir.path(), "deep-v1");
+    let out1 = run_hm(dir.path());
+    assert!(out1.contains("deep-v1"), "run 1 missing marker:\n{out1}");
+
+    fs::write(dir.path().join("marker.txt"), "deep-v2").expect("rewrite");
+
+    let out2 = run_hm(dir.path());
+    assert!(
+        out2.contains("deep-v2"),
+        "run 2 did not see fresh workspace through deep cache chain:\n{out2}"
+    );
+    assert!(
+        !out2.contains("deep-v1"),
+        "run 2 leaked stale workspace through deep cache chain:\n{out2}"
+    );
+}

From e836fa76ce7c39eebcf125ace850fb3f12c38ef0 Mon Sep 17 00:00:00 2001
From: Marko Vejnovic <marko@harmont.dev>
Date: Tue, 9 Jun 2026 09:27:47 -0700
Subject: [PATCH 04/15] docs: document deleted-file edge case in workspace
 injection (CLI-28)

---
 crates/hm-exec/src/local/runner/vm.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/crates/hm-exec/src/local/runner/vm.rs b/crates/hm-exec/src/local/runner/vm.rs
index 896d189b..affb62eb 100644
--- a/crates/hm-exec/src/local/runner/vm.rs
+++ b/crates/hm-exec/src/local/runner/vm.rs
@@ -74,6 +74,10 @@ async fn run_step_vm(vm: &HmVm, ctx: &StepContext, input: ExecutorInput) -> Resu
 
     // Always inject workspace so every executing step sees fresh source
     // files, even when booting from a cached parent snapshot (CLI-28).
+    // Tar extraction overlays fresh source onto the snapshot: changed files
+    // are overwritten and new files added, but files the user deleted from
+    // their tree may linger. Clearing the directory instead would destroy
+    // build artifacts (node_modules, dist) inherited from cached parents.
     let archive_bytes = ctx
         .archives
         .get_bytes(input.workspace_archive_id)

From e28d564920e1b9aa8598c45ba97edeb71459d549 Mon Sep 17 00:00:00 2001
From: Marko Vejnovic <marko@harmont.dev>
Date: Tue, 9 Jun 2026 10:46:06 -0700
Subject: [PATCH 05/15] feat(hm-vm): add workspace COW copy utility

---
 crates/hm-vm/src/lib.rs       |  1 +
 crates/hm-vm/src/workspace.rs | 68 +++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 crates/hm-vm/src/workspace.rs

diff --git a/crates/hm-vm/src/lib.rs b/crates/hm-vm/src/lib.rs
index e6c69647..6c8b8da4 100644
--- a/crates/hm-vm/src/lib.rs
+++ b/crates/hm-vm/src/lib.rs
@@ -5,6 +5,7 @@ pub mod backend;
 pub mod registry;
 pub mod types;
 pub mod vm;
+pub mod workspace;
 
 #[cfg(feature = "docker-backend")]
 pub mod docker;
diff --git a/crates/hm-vm/src/workspace.rs b/crates/hm-vm/src/workspace.rs
new file mode 100644
index 00000000..f006645d
--- /dev/null
+++ b/crates/hm-vm/src/workspace.rs
@@ -0,0 +1,68 @@
+//! Host-side workspace utilities for COW build directories.
+
+use std::path::Path;
+use std::process::Command;
+
+use anyhow::{Context, Result, ensure};
+
+/// Create a copy-on-write clone of `src` contents into `dst`.
+///
+/// macOS APFS: `cp -cR` with full-copy fallback for cross-volume.
+/// Linux: `cp --reflink=auto -a` (COW on btrfs/XFS, full copy on ext4).
+pub fn cow_copy(src: &Path, dst: &Path) -> Result<()> {
+    let src_dot = format!("{}/.", src.display());
+
+    let status = if cfg!(target_os = "macos") {
+        Command::new("cp")
+            .args(["-cR", &src_dot])
+            .arg(dst)
+            .status()
+            .or_else(|_| {
+                Command::new("cp")
+                    .args(["-R", &src_dot])
+                    .arg(dst)
+                    .status()
+            })
+            .context("spawning cp")?
+    } else {
+        Command::new("cp")
+            .args(["--reflink=auto", "-a", &src_dot])
+            .arg(dst)
+            .status()
+            .context("spawning cp")?
+    };
+
+    ensure!(status.success(), "cp exited with {status}");
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::fs;
+    use tempfile::tempdir;
+
+    #[test]
+    fn cow_copy_produces_independent_clone() {
+        let src = tempdir().unwrap();
+        fs::write(src.path().join("file.txt"), "original").unwrap();
+        fs::create_dir(src.path().join("sub")).unwrap();
+        fs::write(src.path().join("sub/nested.txt"), "nested").unwrap();
+
+        let dst = tempdir().unwrap();
+        cow_copy(src.path(), dst.path()).unwrap();
+
+        assert_eq!(fs::read_to_string(dst.path().join("file.txt")).unwrap(), "original");
+        assert_eq!(fs::read_to_string(dst.path().join("sub/nested.txt")).unwrap(), "nested");
+
+        fs::write(dst.path().join("file.txt"), "modified").unwrap();
+        assert_eq!(fs::read_to_string(src.path().join("file.txt")).unwrap(), "original");
+    }
+
+    #[test]
+    fn cow_copy_empty_dir() {
+        let src = tempdir().unwrap();
+        let dst = tempdir().unwrap();
+        cow_copy(src.path(), dst.path()).unwrap();
+    }
+}

From b1f2f182439cccc3b546d5bdd5a53bcca2e87de5 Mon Sep 17 00:00:00 2001
From: Marko Vejnovic <marko@harmont.dev>
Date: Tue, 9 Jun 2026 10:46:45 -0700
Subject: [PATCH 06/15] fix(hm-vm): improve macOS COW fallback to retry on
 nonzero exit

cp -cR can fail with nonzero exit (cross-APFS-volume) but or_else
only caught spawn failures. Now falls back to cp -R on either case.
---
 crates/hm-vm/src/workspace.rs | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/crates/hm-vm/src/workspace.rs b/crates/hm-vm/src/workspace.rs
index f006645d..794e6035 100644
--- a/crates/hm-vm/src/workspace.rs
+++ b/crates/hm-vm/src/workspace.rs
@@ -13,17 +13,18 @@ pub fn cow_copy(src: &Path, dst: &Path) -> Result<()> {
     let src_dot = format!("{}/.", src.display());
 
     let status = if cfg!(target_os = "macos") {
-        Command::new("cp")
+        let st = Command::new("cp")
             .args(["-cR", &src_dot])
             .arg(dst)
-            .status()
-            .or_else(|_| {
-                Command::new("cp")
-                    .args(["-R", &src_dot])
-                    .arg(dst)
-                    .status()
-            })
-            .context("spawning cp")?
+            .status();
+        match st {
+            Ok(s) if s.success() => s,
+            _ => Command::new("cp")
+                .args(["-R", &src_dot])
+                .arg(dst)
+                .status()
+                .context("spawning cp")?,
+        }
     } else {
         Command::new("cp")
             .args(["--reflink=auto", "-a", &src_dot])

From c2e051b0bafc1afa5f038b6ae0826d10f64bb019 Mon Sep 17 00:00:00 2001
From: Marko Vejnovic <marko@harmont.dev>
Date: Tue, 9 Jun 2026 10:48:55 -0700
Subject: [PATCH 07/15] feat(hm-vm): extend ImageRegistry to store workspace
 directory paths

Add optional workspace_dir column to the snapshots table with an
idempotent schema migration. Change eviction return type from
Vec<SnapshotId> to Vec<(SnapshotId, Option<String>)> so callers can
clean up workspace directories alongside backend snapshots.

New methods: put_with_workspace() and get_with_workspace().
Updated vm.rs eviction loop to remove workspace dirs on eviction.
---
 crates/hm-vm/src/registry.rs | 123 ++++++++++++++++++++++++++++++-----
 crates/hm-vm/src/vm.rs       |  11 +++-
 2 files changed, 113 insertions(+), 21 deletions(-)

diff --git a/crates/hm-vm/src/registry.rs b/crates/hm-vm/src/registry.rs
index 939cde00..205c8071 100644
--- a/crates/hm-vm/src/registry.rs
+++ b/crates/hm-vm/src/registry.rs
@@ -72,6 +72,19 @@ impl ImageRegistry {
              );",
         )?;
 
+        // Idempotent migration: add workspace_dir column if missing.
+        let has_ws_col: bool = conn
+            .query_row(
+                "SELECT COUNT(*) FROM pragma_table_info('snapshots') WHERE name='workspace_dir'",
+                [],
+                |row| row.get::<_, i64>(0),
+            )
+            .unwrap_or(0)
+            > 0;
+        if !has_ws_col {
+            conn.execute_batch("ALTER TABLE snapshots ADD COLUMN workspace_dir TEXT;")?;
+        }
+
         Ok(Self {
             conn: Mutex::new(conn),
             capacity,
@@ -83,18 +96,27 @@ impl ImageRegistry {
     /// Returns `None` if no entry exists for `key`.
     #[must_use]
     pub fn get(&self, key: &str) -> Option<SnapshotId> {
+        self.get_with_workspace(key).map(|(snap, _)| snap)
+    }
+
+    /// Look up a cached snapshot and its workspace directory, updating the
+    /// access time.
+    ///
+    /// Returns `None` if no entry exists for `key`.
+    #[must_use]
+    pub fn get_with_workspace(&self, key: &str) -> Option<(SnapshotId, Option<String>)> {
         let now = epoch_secs();
         let conn = self.conn.lock().ok()?;
 
-        let snapshot: Option<String> = conn
+        let result: Option<(String, Option<String>)> = conn
             .query_row(
-                "SELECT snapshot_id FROM snapshots WHERE key = ?1",
+                "SELECT snapshot_id, workspace_dir FROM snapshots WHERE key = ?1",
                 [key],
-                |row| row.get(0),
+                |row| Ok((row.get::<_, String>(0)?, row.get::<_, Option<String>>(1)?)),
             )
             .ok();
 
-        if snapshot.is_some() {
+        if result.is_some() {
             let _ = conn.execute(
                 "UPDATE snapshots SET accessed_at = ?1 WHERE key = ?2",
                 rusqlite::params![now, key],
@@ -102,15 +124,16 @@ impl ImageRegistry {
         }
 
         drop(conn);
-        snapshot.map(SnapshotId)
+        result.map(|(snap, ws)| (SnapshotId(snap), ws))
     }
 
     /// Insert or update a cache entry.
     ///
-    /// Returns the [`SnapshotId`]s of any entries evicted to keep the registry
-    /// within its configured capacity. The caller is responsible for cleaning
-    /// up the backend resources associated with evicted snapshots.
-    pub fn put(&self, key: &str, snapshot: &SnapshotId) -> Vec<SnapshotId> {
+    /// Returns the evicted entries (snapshot ID and optional workspace directory)
+    /// to keep the registry within its configured capacity. The caller is
+    /// responsible for cleaning up backend resources and workspace directories
+    /// associated with evicted entries.
+    pub fn put(&self, key: &str, snapshot: &SnapshotId) -> Vec<(SnapshotId, Option<String>)> {
         let now = epoch_secs();
 
         let Ok(conn) = self.conn.lock() else {
@@ -128,6 +151,32 @@ impl ImageRegistry {
         self.evict_overflow()
     }
 
+    /// Insert or update a cache entry with an associated workspace directory.
+    ///
+    /// Returns the evicted entries (snapshot ID and optional workspace directory)
+    /// to keep the registry within its configured capacity.
+    pub fn put_with_workspace(
+        &self,
+        key: &str,
+        snapshot: &SnapshotId,
+        workspace_dir: &str,
+    ) -> Vec<(SnapshotId, Option<String>)> {
+        let now = epoch_secs();
+
+        let Ok(conn) = self.conn.lock() else {
+            return Vec::new();
+        };
+
+        let _result = conn.execute(
+            "INSERT OR REPLACE INTO snapshots (key, snapshot_id, accessed_at, workspace_dir)
+             VALUES (?1, ?2, ?3, ?4)",
+            rusqlite::params![key, snapshot.0, now, workspace_dir],
+        );
+
+        drop(conn);
+        self.evict_overflow()
+    }
+
     /// Remove a specific entry.
     ///
     /// Returns the removed snapshot's ID so the caller can clean up backend
@@ -173,8 +222,9 @@ impl ImageRegistry {
     }
 
     /// Evict the oldest entries (by `accessed_at`) when the registry exceeds
-    /// its capacity. Returns the snapshot IDs of evicted entries.
-    fn evict_overflow(&self) -> Vec<SnapshotId> {
+    /// its capacity. Returns the snapshot IDs and optional workspace directories
+    /// of evicted entries.
+    fn evict_overflow(&self) -> Vec<(SnapshotId, Option<String>)> {
         let count = self.len();
         if count <= self.capacity {
             return Vec::new();
@@ -186,14 +236,19 @@ impl ImageRegistry {
             return Vec::new();
         };
 
-        let Ok(mut stmt) =
-            conn.prepare("SELECT snapshot_id FROM snapshots ORDER BY accessed_at ASC LIMIT ?1")
-        else {
+        let Ok(mut stmt) = conn.prepare(
+            "SELECT snapshot_id, workspace_dir FROM snapshots ORDER BY accessed_at ASC LIMIT ?1",
+        ) else {
             return Vec::new();
         };
 
-        let evicted: Vec<SnapshotId> = stmt
-            .query_map([overflow], |row| row.get::<_, String>(0).map(SnapshotId))
+        let evicted: Vec<(SnapshotId, Option<String>)> = stmt
+            .query_map([overflow], |row| {
+                Ok((
+                    SnapshotId(row.get::<_, String>(0)?),
+                    row.get::<_, Option<String>>(1)?,
+                ))
+            })
             .ok()
             .map(|rows| rows.filter_map(Result::ok).collect())
             .unwrap_or_default();
@@ -264,7 +319,7 @@ mod tests {
         let evicted = reg.put("c", &SnapshotId("snap-c".into()));
 
         assert_eq!(evicted.len(), 1);
-        assert_eq!(evicted[0], SnapshotId("snap-b".into()));
+        assert_eq!(evicted[0].0, SnapshotId("snap-b".into()));
 
         // "a" should still be present.
         assert!(reg.get("a").is_some());
@@ -285,7 +340,7 @@ mod tests {
         let evicted = reg.put("z", &SnapshotId("snap-z".into()));
 
         assert_eq!(evicted.len(), 1);
-        assert_eq!(evicted[0], SnapshotId("snap-x".into()));
+        assert_eq!(evicted[0].0, SnapshotId("snap-x".into()));
         assert_eq!(reg.len(), 2);
     }
 
@@ -322,4 +377,36 @@ mod tests {
         let removed2 = reg.invalidate("to-remove");
         assert!(removed2.is_none());
     }
+
+    #[test]
+    fn put_with_workspace_then_get_returns_both() {
+        let (reg, _dir) = open_temp(10);
+        let snap = SnapshotId("snap-ws".into());
+        reg.put_with_workspace("ws-key", &snap, "/cache/workspaces/ws-key");
+
+        let (got_snap, got_ws) = reg.get_with_workspace("ws-key").unwrap();
+        assert_eq!(got_snap, SnapshotId("snap-ws".into()));
+        assert_eq!(got_ws.as_deref(), Some("/cache/workspaces/ws-key"));
+    }
+
+    #[test]
+    fn put_without_workspace_returns_none_workspace() {
+        let (reg, _dir) = open_temp(10);
+        reg.put("plain-key", &SnapshotId("snap-plain".into()));
+
+        let (_, got_ws) = reg.get_with_workspace("plain-key").unwrap();
+        assert!(got_ws.is_none());
+    }
+
+    #[test]
+    fn eviction_includes_workspace_path() {
+        let (reg, _dir) = open_temp(1);
+        reg.put_with_workspace("a", &SnapshotId("snap-a".into()), "/ws/a");
+        std::thread::sleep(std::time::Duration::from_secs(1));
+
+        let evicted = reg.put_with_workspace("b", &SnapshotId("snap-b".into()), "/ws/b");
+        assert_eq!(evicted.len(), 1);
+        assert_eq!(evicted[0].0, SnapshotId("snap-a".into()));
+        assert_eq!(evicted[0].1.as_deref(), Some("/ws/a"));
+    }
 }
diff --git a/crates/hm-vm/src/vm.rs b/crates/hm-vm/src/vm.rs
index 2af2982f..56e8e098 100644
--- a/crates/hm-vm/src/vm.rs
+++ b/crates/hm-vm/src/vm.rs
@@ -114,9 +114,14 @@ impl HmVm {
 
             if let CachingPolicy::Cache { key } = policy {
                 let evicted = self.registry.put(key, &snap);
-                for old in &evicted {
-                    if let Err(e) = self.backend.remove_snapshot(old).await {
-                        warn!(snapshot = %old.0, error = %e, "failed to remove evicted snapshot");
+                for (old_snap, old_ws) in &evicted {
+                    if let Err(e) = self.backend.remove_snapshot(old_snap).await {
+                        warn!(snapshot = %old_snap.0, error = %e, "failed to remove evicted snapshot");
+                    }
+                    if let Some(ws_dir) = old_ws {
+                        if let Err(e) = std::fs::remove_dir_all(ws_dir) {
+                            warn!(workspace = %ws_dir, error = %e, "failed to remove evicted workspace");
+                        }
                     }
                 }
             }

From 76156d0f26f0ce260128b102de538f71de12201c Mon Sep 17 00:00:00 2001
From: Marko Vejnovic <marko@harmont.dev>
Date: Tue, 9 Jun 2026 10:55:26 -0700
Subject: [PATCH 08/15] refactor(hm-vm): replace inject() with bind-mount
 workspace, persist to cache

Replace the tar-upload inject() mechanism with Docker bind mounts for
workspace directories. The workspace is now mounted at container creation
time via HostConfig.binds, eliminating the tar_directory() helper and
UploadToContainerOptions usage entirely.

Key changes:
- Add WorkspaceMount type (host_path + guest_path) to types.rs
- Add workspace_dir field to ExecutionResult for downstream consumption
- Add workspace_cache_dir to VmConfig for COW cache persistence
- Update VmBackend::create/restore to accept Optional<&WorkspaceMount>
- Remove Vm::inject() from the trait and all implementations
- Delete tar_directory() and remove tar crate dependency
- Persist workspace via cow_copy after snapshot on cache miss
- Store workspace path in registry via put_with_workspace on cache store
- Return cached workspace path from registry on cache hit
- Clean up evicted workspace directories alongside snapshots
- Update VmRunner to construct WorkspaceMount instead of inject path
---
 Cargo.lock                               |  1 -
 crates/hm-exec/src/local/backend.rs      |  1 +
 crates/hm-exec/src/local/runner/vm.rs    | 17 +++--
 crates/hm-exec/tests/backend_contract.rs |  2 +
 crates/hm-vm/Cargo.toml                  |  3 +-
 crates/hm-vm/src/backend.rs              | 10 +--
 crates/hm-vm/src/docker.rs               | 86 +++++-------------------
 crates/hm-vm/src/lib.rs                  |  1 +
 crates/hm-vm/src/types.rs                | 13 +++-
 crates/hm-vm/src/vm.rs                   | 80 ++++++++++++----------
 10 files changed, 86 insertions(+), 128 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 434e5d0e..b59a146b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1361,7 +1361,6 @@ dependencies = [
  "derive_more",
  "futures",
  "rusqlite",
- "tar",
  "tempfile",
  "tokio",
  "tracing",
diff --git a/crates/hm-exec/src/local/backend.rs b/crates/hm-exec/src/local/backend.rs
index 499183f4..2fb7683e 100644
--- a/crates/hm-exec/src/local/backend.rs
+++ b/crates/hm-exec/src/local/backend.rs
@@ -56,6 +56,7 @@ impl LocalBackend {
         let config = VmConfig {
             memory_mib: Some(8192),
             disk_size_gb: Some(10),
+            workspace_cache_dir: Some(cache_dir.join("workspaces")),
             ..Default::default()
         };
 
diff --git a/crates/hm-exec/src/local/runner/vm.rs b/crates/hm-exec/src/local/runner/vm.rs
index affb62eb..67551609 100644
--- a/crates/hm-exec/src/local/runner/vm.rs
+++ b/crates/hm-exec/src/local/runner/vm.rs
@@ -2,7 +2,7 @@
 //!
 //! Each step runs inside a lightweight VM managed by [`HmVm`]. The
 //! source archive is extracted to a host-side temp directory and
-//! injected into the VM before the step command runs. System-level
+//! bind-mounted into the VM before the step command runs. System-level
 //! state propagates via VM snapshots.
 
 use std::future::Future;
@@ -14,7 +14,7 @@ use hm_plugin_protocol::{
     BuildEvent, CacheDecision, ExecutorInput, SnapshotRef, StdStream, StepResult,
 };
 use hm_vm::types::OutputSink;
-use hm_vm::{Action, CachingPolicy, HmVm, ImageSource, SnapshotId};
+use hm_vm::{Action, CachingPolicy, HmVm, ImageSource, SnapshotId, WorkspaceMount};
 use uuid::Uuid;
 
 use super::{StepContext, StepRunner};
@@ -72,19 +72,18 @@ async fn run_step_vm(vm: &HmVm, ctx: &StepContext, input: ExecutorInput) -> Resu
         )
     };
 
-    // Always inject workspace so every executing step sees fresh source
+    // Always bind-mount workspace so every executing step sees fresh source
     // files, even when booting from a cached parent snapshot (CLI-28).
-    // Tar extraction overlays fresh source onto the snapshot: changed files
-    // are overwritten and new files added, but files the user deleted from
-    // their tree may linger. Clearing the directory instead would destroy
-    // build artifacts (node_modules, dist) inherited from cached parents.
     let archive_bytes = ctx
         .archives
         .get_bytes(input.workspace_archive_id)
         .ok_or_else(|| anyhow::anyhow!("source archive not found"))?;
     let temp_guard =
         extract_archive_to_tempdir(&archive_bytes).context("extracting workspace archive")?;
-    let inject = Some(temp_guard.path().to_path_buf());
+    let workspace = Some(WorkspaceMount {
+        host_path: temp_guard.path().to_path_buf(),
+        guest_path: input.workdir.clone(),
+    });
 
     // Baseline env for shell operation inside VMs.
     let mut env: Vec<(String, String)> = vec![
@@ -102,7 +101,7 @@ async fn run_step_vm(vm: &HmVm, ctx: &StepContext, input: ExecutorInput) -> Resu
         env,
         working_dir: input.workdir.clone(),
         timeout: None,
-        inject,
+        workspace,
     };
 
     let sink = EventBusSink {
diff --git a/crates/hm-exec/tests/backend_contract.rs b/crates/hm-exec/tests/backend_contract.rs
index 5a043319..f33bf432 100644
--- a/crates/hm-exec/tests/backend_contract.rs
+++ b/crates/hm-exec/tests/backend_contract.rs
@@ -74,6 +74,7 @@ impl hm_vm::VmBackend for NoopVmBackend {
         &self,
         _image: &str,
         _config: &hm_vm::VmConfig,
+        _workspace: Option<&hm_vm::WorkspaceMount>,
     ) -> anyhow::Result<Box<dyn hm_vm::backend::Vm>> {
         anyhow::bail!("noop backend")
     }
@@ -81,6 +82,7 @@ impl hm_vm::VmBackend for NoopVmBackend {
         &self,
         _snapshot: &hm_vm::SnapshotId,
         _config: &hm_vm::VmConfig,
+        _workspace: Option<&hm_vm::WorkspaceMount>,
     ) -> anyhow::Result<Box<dyn hm_vm::backend::Vm>> {
         anyhow::bail!("noop backend")
     }
diff --git a/crates/hm-vm/Cargo.toml b/crates/hm-vm/Cargo.toml
index 1e01a45a..3bb5c787 100644
--- a/crates/hm-vm/Cargo.toml
+++ b/crates/hm-vm/Cargo.toml
@@ -16,11 +16,10 @@ rusqlite = { version = "0.37", features = ["bundled"] }
 
 bollard = { version = "0.18", optional = true }
 futures = { version = "0.3", optional = true }
-tar = { version = "0.4", optional = true }
 
 [features]
 default = ["docker-backend"]
-docker-backend = ["dep:bollard", "dep:futures", "dep:tar"]
+docker-backend = ["dep:bollard", "dep:futures"]
 
 [dev-dependencies]
 tempfile = "3"
diff --git a/crates/hm-vm/src/backend.rs b/crates/hm-vm/src/backend.rs
index 259259be..6d6e8428 100644
--- a/crates/hm-vm/src/backend.rs
+++ b/crates/hm-vm/src/backend.rs
@@ -1,21 +1,20 @@
 //! Backend trait for pluggable VM implementations.
 
 use std::fmt;
-use std::path::Path;
 
 use anyhow::Result;
 use async_trait::async_trait;
 
-use crate::types::{OutputSink, SnapshotId, VmConfig};
+use crate::types::{OutputSink, SnapshotId, VmConfig, WorkspaceMount};
 
 /// Factory that creates and manages virtual machines.
 #[async_trait]
 pub trait VmBackend: Send + Sync + fmt::Debug {
     /// Boot a new VM from the given OCI image reference.
-    async fn create(&self, image: &str, config: &VmConfig) -> Result<Box<dyn Vm>>;
+    async fn create(&self, image: &str, config: &VmConfig, workspace: Option<&WorkspaceMount>) -> Result<Box<dyn Vm>>;
 
     /// Restore a VM from a previously taken snapshot.
-    async fn restore(&self, snapshot: &SnapshotId, config: &VmConfig) -> Result<Box<dyn Vm>>;
+    async fn restore(&self, snapshot: &SnapshotId, config: &VmConfig, workspace: Option<&WorkspaceMount>) -> Result<Box<dyn Vm>>;
 
     /// Check whether a snapshot exists in the backend store.
     async fn snapshot_exists(&self, snapshot: &SnapshotId) -> Result<bool>;
@@ -27,9 +26,6 @@ pub trait VmBackend: Send + Sync + fmt::Debug {
 /// Handle to a running virtual machine.
 #[async_trait]
 pub trait Vm: Send {
-    /// Copy a host path into the guest filesystem.
-    async fn inject(&self, host_path: &Path, guest_path: &str) -> Result<()>;
-
     /// Run a command inside the VM and stream output to `sink`.
     async fn exec(
         &self,
diff --git a/crates/hm-vm/src/docker.rs b/crates/hm-vm/src/docker.rs
index ea14bf9d..007168b4 100644
--- a/crates/hm-vm/src/docker.rs
+++ b/crates/hm-vm/src/docker.rs
@@ -5,14 +5,13 @@
 //! image commits.
 
 use std::collections::HashMap;
-use std::path::Path;
 
 use anyhow::{Context, Result};
 use async_trait::async_trait;
 use bollard::Docker;
 use bollard::container::{
     Config, CreateContainerOptions, RemoveContainerOptions, StartContainerOptions,
-    StopContainerOptions, UploadToContainerOptions,
+    StopContainerOptions,
 };
 use bollard::exec::{CreateExecOptions, StartExecResults};
 use bollard::image::{
@@ -22,7 +21,7 @@ use futures::StreamExt;
 use tracing::instrument;
 
 use crate::backend::{Vm, VmBackend};
-use crate::types::{OutputSink, SnapshotId, VmConfig};
+use crate::types::{OutputSink, SnapshotId, VmConfig, WorkspaceMount};
 
 /// Docker-based VM backend.
 ///
@@ -79,10 +78,17 @@ impl DockerBackend {
     }
 
     #[instrument(skip(self))]
-    async fn start_container(&self, image: &str) -> Result<String> {
+    async fn start_container(&self, image: &str, workspace: Option<&WorkspaceMount>) -> Result<String> {
+        let host_config = workspace.map(|ws| {
+            bollard::service::HostConfig {
+                binds: Some(vec![format!("{}:{}:rw", ws.host_path.display(), ws.guest_path)]),
+                ..Default::default()
+            }
+        });
         let cfg = Config {
             image: Some(image.to_string()),
             cmd: Some(vec!["sh".into(), "-c".into(), "sleep infinity".into()]),
+            host_config,
             ..Default::default()
         };
         let create = self
@@ -100,19 +106,19 @@ impl DockerBackend {
 
 #[async_trait]
 impl VmBackend for DockerBackend {
-    #[instrument(skip(self, _config))]
-    async fn create(&self, image: &str, _config: &VmConfig) -> Result<Box<dyn Vm>> {
+    #[instrument(skip(self, _config, workspace))]
+    async fn create(&self, image: &str, _config: &VmConfig, workspace: Option<&WorkspaceMount>) -> Result<Box<dyn Vm>> {
         self.ensure_image(image).await?;
-        let container_id = self.start_container(image).await?;
+        let container_id = self.start_container(image, workspace).await?;
         Ok(Box::new(DockerVm {
             client: self.client.clone(),
             container_id: Some(container_id),
         }))
     }
 
-    #[instrument(skip(self, _config))]
-    async fn restore(&self, snapshot: &SnapshotId, _config: &VmConfig) -> Result<Box<dyn Vm>> {
-        let container_id = self.start_container(&snapshot.0).await?;
+    #[instrument(skip(self, _config, workspace))]
+    async fn restore(&self, snapshot: &SnapshotId, _config: &VmConfig, workspace: Option<&WorkspaceMount>) -> Result<Box<dyn Vm>> {
+        let container_id = self.start_container(&snapshot.0, workspace).await?;
         Ok(Box::new(DockerVm {
             client: self.client.clone(),
             container_id: Some(container_id),
@@ -168,68 +174,8 @@ impl Drop for DockerVm {
     }
 }
 
-/// Build a tar archive from a host directory.
-///
-/// The archive contains all files under `host_path` with paths relative
-/// to `host_path` itself (i.e. the directory contents, not the directory).
-fn tar_directory(host_path: &Path) -> Result<Vec<u8>> {
-    let mut archive = tar::Builder::new(Vec::new());
-    archive
-        .append_dir_all(".", host_path)
-        .with_context(|| format!("archiving '{}'", host_path.display()))?;
-    archive.finish().context("finalizing tar archive")?;
-    archive.into_inner().context("extracting tar bytes")
-}
-
 #[async_trait]
 impl Vm for DockerVm {
-    #[instrument(skip(self), fields(host = %host_path.display()))]
-    async fn inject(&self, host_path: &Path, guest_path: &str) -> Result<()> {
-        // Ensure the destination directory exists inside the container.
-        let cid = self
-            .container_id
-            .as_deref()
-            .context("container already destroyed")?;
-        let mkdir = self
-            .client
-            .create_exec(
-                cid,
-                CreateExecOptions {
-                    cmd: Some(vec!["mkdir", "-p", guest_path]),
-                    attach_stdout: Some(true),
-                    attach_stderr: Some(true),
-                    ..Default::default()
-                },
-            )
-            .await
-            .context("create mkdir exec")?;
-        if let StartExecResults::Attached { mut output, .. } = self
-            .client
-            .start_exec(&mkdir.id, None)
-            .await
-            .context("start mkdir exec")?
-        {
-            while output.next().await.is_some() {}
-        }
-
-        let tar_bytes = tar_directory(host_path)?;
-        let options = UploadToContainerOptions {
-            path: guest_path,
-            ..Default::default()
-        };
-        self.client
-            .upload_to_container(cid, Some(options), tar_bytes.into())
-            .await
-            .with_context(|| {
-                format!(
-                    "uploading '{}' to container '{}:{guest_path}'",
-                    host_path.display(),
-                    cid.get(..12).unwrap_or(cid),
-                )
-            })?;
-        Ok(())
-    }
-
     #[instrument(skip(self, env, sink))]
     async fn exec(
         &self,
diff --git a/crates/hm-vm/src/lib.rs b/crates/hm-vm/src/lib.rs
index 6c8b8da4..ab5d028b 100644
--- a/crates/hm-vm/src/lib.rs
+++ b/crates/hm-vm/src/lib.rs
@@ -14,5 +14,6 @@ pub use backend::{Vm, VmBackend};
 pub use registry::ImageRegistry;
 pub use types::{
     Action, CachingPolicy, ExecutionResult, ImageSource, NullSink, OutputSink, SnapshotId, VmConfig,
+    WorkspaceMount,
 };
 pub use vm::HmVm;
diff --git a/crates/hm-vm/src/types.rs b/crates/hm-vm/src/types.rs
index 89248e5f..ecb1d5b1 100644
--- a/crates/hm-vm/src/types.rs
+++ b/crates/hm-vm/src/types.rs
@@ -10,6 +10,13 @@ pub enum ImageSource {
     Snapshot(SnapshotId),
 }
 
+/// Bind-mount specification for a host workspace directory.
+#[derive(Clone, Debug)]
+pub struct WorkspaceMount {
+    pub host_path: PathBuf,
+    pub guest_path: String,
+}
+
 /// What to execute inside a VM.
 #[derive(Clone, Debug)]
 pub struct Action {
@@ -18,9 +25,9 @@ pub struct Action {
     pub env: Vec<(String, String)>,
     pub working_dir: String,
     pub timeout: Option<Duration>,
-    /// Host directory to copy into `working_dir` before execution.
+    /// Host workspace directory to bind-mount into the VM.
     /// Skipped on cache hits (snapshot already contains prior state).
-    pub inject: Option<PathBuf>,
+    pub workspace: Option<WorkspaceMount>,
 }
 
 /// How to cache the result.
@@ -43,6 +50,7 @@ pub struct ExecutionResult {
     pub exit_code: i32,
     pub snapshot: Option<SnapshotId>,
     pub cached: bool,
+    pub workspace_dir: Option<PathBuf>,
 }
 
 /// VM resource configuration.
@@ -51,6 +59,7 @@ pub struct VmConfig {
     pub cpus: Option<u32>,
     pub memory_mib: Option<u64>,
     pub disk_size_gb: Option<u64>,
+    pub workspace_cache_dir: Option<PathBuf>,
 }
 
 /// Receives stdout/stderr lines during execution.
diff --git a/crates/hm-vm/src/vm.rs b/crates/hm-vm/src/vm.rs
index 56e8e098..07f7aefe 100644
--- a/crates/hm-vm/src/vm.rs
+++ b/crates/hm-vm/src/vm.rs
@@ -1,5 +1,6 @@
 //! High-level VM orchestrator.
 
+use std::path::PathBuf;
 use std::sync::Arc;
 
 use anyhow::Result;
@@ -13,7 +14,7 @@ use crate::types::{Action, CachingPolicy, ExecutionResult, ImageSource, OutputSi
 ///
 /// `HmVm` composes a [`VmBackend`] with an [`ImageRegistry`] to provide
 /// cache-aware execution: if a cached snapshot already exists for a given
-/// caching key the expensive create-inject-exec cycle is skipped entirely.
+/// caching key the expensive create-exec cycle is skipped entirely.
 #[derive(Debug)]
 pub struct HmVm {
     backend: Arc<dyn VmBackend>,
@@ -42,7 +43,7 @@ impl HmVm {
     ///
     /// # Errors
     ///
-    /// Returns an error if the backend fails to create, restore, inject, or
+    /// Returns an error if the backend fails to create, restore, or
     /// execute. Best-effort cleanup is performed even on failure paths.
     #[instrument(skip(self, action, sink), fields(cmd = %action.cmd))]
     pub async fn execute(
@@ -56,10 +57,14 @@ impl HmVm {
             && let Some(snap) = self.registry.get(key)
         {
             if self.backend.snapshot_exists(&snap).await? {
+                let ws_dir = self.registry.get_with_workspace(key)
+                    .and_then(|(_, ws)| ws)
+                    .map(PathBuf::from);
                 return Ok(ExecutionResult {
                     exit_code: 0,
                     snapshot: Some(snap),
                     cached: true,
+                    workspace_dir: ws_dir,
                 });
             }
             let _ = self.registry.invalidate(key);
@@ -67,8 +72,8 @@ impl HmVm {
 
         // 2. Create or restore VM
         let mut vm = match &action.source {
-            ImageSource::Image(image) => self.backend.create(image, &self.config).await?,
-            ImageSource::Snapshot(snap) => self.backend.restore(snap, &self.config).await?,
+            ImageSource::Image(image) => self.backend.create(image, &self.config, action.workspace.as_ref()).await?,
+            ImageSource::Snapshot(snap) => self.backend.restore(snap, &self.config, action.workspace.as_ref()).await?,
         };
 
         let result = self.run_in_vm(&mut *vm, &action, &policy, sink).await;
@@ -79,8 +84,8 @@ impl HmVm {
         result
     }
 
-    /// Inner lifecycle: inject, exec, snapshot. Separated so the caller
-    /// can guarantee `vm.destroy()` runs regardless of outcome.
+    /// Inner lifecycle: exec, snapshot, persist workspace. Separated so the
+    /// caller can guarantee `vm.destroy()` runs regardless of outcome.
     async fn run_in_vm(
         &self,
         vm: &mut dyn crate::backend::Vm,
@@ -88,12 +93,9 @@ impl HmVm {
         policy: &CachingPolicy,
         sink: &dyn OutputSink,
     ) -> Result<ExecutionResult> {
-        // 3. Inject workspace
-        if let Some(ref host_path) = action.inject {
-            vm.inject(host_path, &action.working_dir).await?;
-        }
+        let mut workspace_dir: Option<PathBuf> = None;
 
-        // 4. Execute command (with optional timeout)
+        // 3. Execute command (with optional timeout)
         let exec_fut = vm.exec(&action.cmd, &action.env, &action.working_dir, sink);
         let exit_code = if let Some(timeout) = action.timeout {
             match tokio::time::timeout(timeout, exec_fut).await {
@@ -104,7 +106,7 @@ impl HmVm {
             exec_fut.await?
         };
 
-        // 5. Snapshot and cache on success
+        // 4. Snapshot and cache on success
         let snapshot = if exit_code == 0 {
             let label = match policy {
                 CachingPolicy::Cache { key } => key.as_str(),
@@ -112,16 +114,31 @@ impl HmVm {
             };
             let snap = vm.snapshot(label).await?;
 
+            // Persist workspace to cache directory.
+            if let Some(ref ws) = action.workspace {
+                if let Some(ref cache_dir) = self.config.workspace_cache_dir {
+                    let ws_cache = cache_dir.join(label);
+                    if ws_cache.exists() {
+                        std::fs::remove_dir_all(&ws_cache).ok();
+                    }
+                    std::fs::create_dir_all(&ws_cache)?;
+                    crate::workspace::cow_copy(&ws.host_path, &ws_cache)?;
+                    workspace_dir = Some(ws_cache);
+                }
+            }
+
             if let CachingPolicy::Cache { key } = policy {
-                let evicted = self.registry.put(key, &snap);
+                let evicted = if let Some(ref ws) = workspace_dir {
+                    self.registry.put_with_workspace(key, &snap, &ws.display().to_string())
+                } else {
+                    self.registry.put(key, &snap)
+                };
                 for (old_snap, old_ws) in &evicted {
                     if let Err(e) = self.backend.remove_snapshot(old_snap).await {
                         warn!(snapshot = %old_snap.0, error = %e, "failed to remove evicted snapshot");
                     }
-                    if let Some(ws_dir) = old_ws {
-                        if let Err(e) = std::fs::remove_dir_all(ws_dir) {
-                            warn!(workspace = %ws_dir, error = %e, "failed to remove evicted workspace");
-                        }
+                    if let Some(ws_path) = old_ws {
+                        std::fs::remove_dir_all(ws_path).ok();
                     }
                 }
             }
@@ -135,6 +152,7 @@ impl HmVm {
             exit_code,
             snapshot,
             cached: false,
+            workspace_dir: if exit_code == 0 { workspace_dir } else { None },
         })
     }
 }
@@ -144,9 +162,8 @@ impl HmVm {
 mod tests {
     use super::*;
     use crate::backend::Vm;
-    use crate::types::{NullSink, SnapshotId};
+    use crate::types::{NullSink, SnapshotId, WorkspaceMount};
 
-    use std::path::Path;
     use std::sync::Mutex;
 
     use async_trait::async_trait;
@@ -176,7 +193,7 @@ mod tests {
 
     #[async_trait]
     impl VmBackend for MockBackend {
-        async fn create(&self, image: &str, _config: &VmConfig) -> Result<Box<dyn Vm>> {
+        async fn create(&self, image: &str, _config: &VmConfig, _workspace: Option<&WorkspaceMount>) -> Result<Box<dyn Vm>> {
             self.calls
                 .lock()
                 .map_or_else(|_| {}, |mut c| c.push(format!("create:{image}")));
@@ -186,7 +203,7 @@ mod tests {
             }))
         }
 
-        async fn restore(&self, snapshot: &SnapshotId, _config: &VmConfig) -> Result<Box<dyn Vm>> {
+        async fn restore(&self, snapshot: &SnapshotId, _config: &VmConfig, _workspace: Option<&WorkspaceMount>) -> Result<Box<dyn Vm>> {
             self.calls
                 .lock()
                 .map_or_else(|_| {}, |mut c| c.push(format!("restore:{}", snapshot.0)));
@@ -220,14 +237,6 @@ mod tests {
 
     #[async_trait]
     impl Vm for MockVm {
-        async fn inject(&self, host_path: &Path, guest_path: &str) -> Result<()> {
-            self.calls.lock().map_or_else(
-                |_| {},
-                |mut c| c.push(format!("inject:{}:{guest_path}", host_path.display())),
-            );
-            Ok(())
-        }
-
         async fn exec(
             &self,
             cmd: &str,
@@ -274,7 +283,10 @@ mod tests {
             env: vec![],
             working_dir: "/work".into(),
             timeout: None,
-            inject: Some(std::path::PathBuf::from("/host/src")),
+            workspace: Some(WorkspaceMount {
+                host_path: std::path::PathBuf::from("/host/src"),
+                guest_path: "/work".into(),
+            }),
         }
     }
 
@@ -309,7 +321,6 @@ mod tests {
 
         let log = calls(&backend);
         assert!(log.iter().any(|c| c.starts_with("create:")));
-        assert!(log.iter().any(|c| c.starts_with("inject:")));
         assert!(log.iter().any(|c| c.starts_with("exec:")));
         assert!(log.iter().any(|c| c.starts_with("snapshot:")));
         assert!(log.iter().any(|c| c == "destroy"));
@@ -400,7 +411,7 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn cache_miss_from_snapshot_still_injects_workspace() {
+    async fn cache_miss_from_snapshot_passes_workspace() {
         let backend = MockBackend::new(0, false);
         let (registry, _dir) = open_temp_registry(10);
         let hm = HmVm::new(Arc::new(backend.clone()), registry, VmConfig::default());
@@ -426,11 +437,6 @@ mod tests {
         let log = calls(&backend);
         // Must restore from snapshot (not create from image).
         assert!(log.iter().any(|c| c.starts_with("restore:parent-snap")));
-        // Must still inject workspace even though source is a snapshot.
-        assert!(
-            log.iter().any(|c| c.starts_with("inject:")),
-            "workspace injection skipped for snapshot-sourced step: {log:?}"
-        );
         assert!(log.iter().any(|c| c.starts_with("exec:")));
         assert!(log.iter().any(|c| c.starts_with("snapshot:")));
     }

From 9f16b9dc608b581044f7d3f1ab20eca09b8660b0 Mon Sep 17 00:00:00 2001
From: Marko Vejnovic <marko@harmont.dev>
Date: Tue, 9 Jun 2026 10:59:12 -0700
Subject: [PATCH 09/15] feat(hm-exec): wire COW workspace through runner and
 scheduler DAG

- Add parent_workspace_dir to StepContext, workspace_dir to StepResult
- VmRunner: COW copy parent workspace for child steps, extract fresh for root
- Scheduler: propagate workspace_dir through StepOutcome to children
---
 crates/hm-exec/src/local/runner/mod.rs    |  5 ++++
 crates/hm-exec/src/local/runner/vm.rs     | 30 ++++++++++++++++-------
 crates/hm-exec/src/local/scheduler.rs     | 22 ++++++++++++++---
 crates/hm-plugin-protocol/src/executor.rs |  6 +++++
 4 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/crates/hm-exec/src/local/runner/mod.rs b/crates/hm-exec/src/local/runner/mod.rs
index 680eda65..d69c84fd 100644
--- a/crates/hm-exec/src/local/runner/mod.rs
+++ b/crates/hm-exec/src/local/runner/mod.rs
@@ -29,6 +29,10 @@ pub struct StepContext {
     pub event_bus: Arc<EventBus>,
     pub archives: Arc<ArchiveStore>,
     pub cancel: CancellationToken,
+    /// When set, the step should COW-copy this directory as its
+    /// workspace base instead of extracting from the source archive.
+    /// Populated by the scheduler for child steps in a `BuildsIn` chain.
+    pub parent_workspace_dir: Option<String>,
 }
 
 /// Async trait implemented by step executors (e.g. the VM runner).
@@ -152,6 +156,7 @@ mod tests {
                     exit_code: 0,
                     committed_snapshot: None,
                     artifacts: vec![],
+                    workspace_dir: None,
                 })
             })
         }
diff --git a/crates/hm-exec/src/local/runner/vm.rs b/crates/hm-exec/src/local/runner/vm.rs
index 67551609..af1fa883 100644
--- a/crates/hm-exec/src/local/runner/vm.rs
+++ b/crates/hm-exec/src/local/runner/vm.rs
@@ -72,16 +72,27 @@ async fn run_step_vm(vm: &HmVm, ctx: &StepContext, input: ExecutorInput) -> Resu
         )
     };
 
-    // Always bind-mount workspace so every executing step sees fresh source
-    // files, even when booting from a cached parent snapshot (CLI-28).
-    let archive_bytes = ctx
-        .archives
-        .get_bytes(input.workspace_archive_id)
-        .ok_or_else(|| anyhow::anyhow!("source archive not found"))?;
-    let temp_guard =
-        extract_archive_to_tempdir(&archive_bytes).context("extracting workspace archive")?;
+    // Prepare the workspace: COW-copy from parent step's workspace (if this
+    // is a child in a BuildsIn chain) or extract from the source archive
+    // (root step). Either way, bind-mount the result into the VM.
+    let step_ws = tempfile::tempdir().context("creating step workspace")?;
+
+    if let Some(ref parent_ws) = ctx.parent_workspace_dir {
+        hm_vm::workspace::cow_copy(std::path::Path::new(parent_ws), step_ws.path())
+            .context("COW copy parent workspace")?;
+    } else {
+        let archive_bytes = ctx
+            .archives
+            .get_bytes(input.workspace_archive_id)
+            .ok_or_else(|| anyhow::anyhow!("source archive not found"))?;
+        let base_dir = extract_archive_to_tempdir(&archive_bytes)
+            .context("extracting workspace archive")?;
+        hm_vm::workspace::cow_copy(base_dir.path(), step_ws.path())
+            .context("COW copy source to workspace")?;
+    }
+
     let workspace = Some(WorkspaceMount {
-        host_path: temp_guard.path().to_path_buf(),
+        host_path: step_ws.path().to_path_buf(),
         guest_path: input.workdir.clone(),
     });
 
@@ -137,6 +148,7 @@ async fn run_step_vm(vm: &HmVm, ctx: &StepContext, input: ExecutorInput) -> Resu
         exit_code: result.exit_code,
         committed_snapshot: result.snapshot.map(|s| SnapshotRef(s.0)),
         artifacts: vec![],
+        workspace_dir: result.workspace_dir.map(|p| p.display().to_string()),
     })
 }
 
diff --git a/crates/hm-exec/src/local/scheduler.rs b/crates/hm-exec/src/local/scheduler.rs
index 46afe845..ea27187d 100644
--- a/crates/hm-exec/src/local/scheduler.rs
+++ b/crates/hm-exec/src/local/scheduler.rs
@@ -59,6 +59,10 @@ struct StepOutcome {
     /// `None` only for steps short-circuited because a predecessor failed
     /// or the build was cancelled before they could run.
     summary: Option<StepResultSummary>,
+    /// Host-side workspace path produced by this step's runner, if any.
+    /// The scheduler propagates this to downstream `BuildsIn` children
+    /// so they can COW-copy instead of re-extracting.
+    workspace_dir: Option<String>,
 }
 
 type StepFuture = futures::future::Shared<BoxFuture<'static, StepOutcome>>;
@@ -129,6 +133,7 @@ pub(crate) async fn run(
         event_bus: bus.clone(),
         archives: archives.clone(),
         cancel: cancel.clone(),
+        parent_workspace_dir: None,
     };
 
     let parallelism = parallelism.max(1);
@@ -202,6 +207,7 @@ pub(crate) async fn run(
                         exit_code: None,
                         duration_ms: 0,
                     }),
+                    workspace_dir: None,
                 };
             }
 
@@ -211,12 +217,17 @@ pub(crate) async fn run(
                 .await
                 .expect("semaphore closed unexpectedly");
 
-            // Find the BuildsIn parent's snapshot for container lineage.
-            let parent_snapshot = preds
+            // Find the BuildsIn parent's snapshot and workspace dir for
+            // container lineage and COW workspace propagation.
+            let (parent_snapshot, parent_workspace_dir) = preds
                 .iter()
                 .zip(&pred_outcomes)
                 .find(|((ek, _), _)| *ek == EdgeKind::BuildsIn)
-                .and_then(|(_, outcome)| outcome.snapshot.clone());
+                .map(|(_, outcome)| (outcome.snapshot.clone(), outcome.workspace_dir.clone()))
+                .unwrap_or((None, None));
+
+            let mut step_ctx = run_ctx.clone();
+            step_ctx.parent_workspace_dir = parent_workspace_dir;
 
             match execute_step(
                 n,
@@ -227,7 +238,7 @@ pub(crate) async fn run(
                 parent_key,
                 archive_id,
                 run_id,
-                run_ctx,
+                step_ctx,
                 reg,
                 bus,
                 cancel,
@@ -247,6 +258,7 @@ pub(crate) async fn run(
                             exit_code: Some(1),
                             duration_ms: 0,
                         }),
+                        workspace_dir: None,
                     }
                 }
             }
@@ -474,6 +486,7 @@ async fn execute_step(
                             exit_code: Some(124),
                             duration_ms: dur_ms,
                         }),
+                        workspace_dir: None,
                     });
                 }
             }
@@ -518,6 +531,7 @@ async fn execute_step(
                     exit_code: Some(sr.exit_code),
                     duration_ms: dur_ms,
                 }),
+                workspace_dir: sr.workspace_dir,
             })
         }
         Err(e) => {
diff --git a/crates/hm-plugin-protocol/src/executor.rs b/crates/hm-plugin-protocol/src/executor.rs
index 75323192..e2cca5e0 100644
--- a/crates/hm-plugin-protocol/src/executor.rs
+++ b/crates/hm-plugin-protocol/src/executor.rs
@@ -98,4 +98,10 @@ pub struct StepResult {
     /// (typically only on `CacheDecision::MissBuildAs`).
     pub committed_snapshot: Option<SnapshotRef>,
     pub artifacts: Vec<ArtifactRef>,
+    /// Host-side path to the workspace directory that survived this step.
+    /// Used by the scheduler to propagate workspace state to downstream
+    /// steps via COW copy instead of re-extracting the source archive.
+    #[serde(skip)]
+    #[schemars(skip)]
+    pub workspace_dir: Option<String>,
 }

From 939c83ac79aba6b300cf0816259c26da3ba1e0f0 Mon Sep 17 00:00:00 2001
From: Marko Vejnovic <marko@harmont.dev>
Date: Tue, 9 Jun 2026 11:03:30 -0700
Subject: [PATCH 10/15] feat(dsl): add predicate cache policy for user-defined
 invalidation

---
 .../harmont-py/harmont/__init__.py            |  22 ++++
 .../harmont-py/harmont/_pipeline.py           |   4 +
 .../hm-dsl-engine/harmont-py/harmont/cache.py |  11 ++
 .../harmont-py/harmont/keygen.py              |   2 +
 .../harmont-py/tests/test_cache_predicate.py  | 112 ++++++++++++++++++
 crates/hm-dsl-engine/harmont-ts/src/cache.ts  |  11 +-
 crates/hm-dsl-engine/harmont-ts/src/index.ts  |   2 +
 crates/hm-dsl-engine/harmont-ts/src/keygen.ts |   4 +
 .../hm-dsl-engine/harmont-ts/src/pipeline.ts  |   2 +
 .../harmont-ts/tests/cache.test.ts            |  15 ++-
 .../harmont-ts/tests/keygen.test.ts           |  31 ++++-
 11 files changed, 213 insertions(+), 3 deletions(-)
 create mode 100644 crates/hm-dsl-engine/harmont-py/tests/test_cache_predicate.py

diff --git a/crates/hm-dsl-engine/harmont-py/harmont/__init__.py b/crates/hm-dsl-engine/harmont-py/harmont/__init__.py
index f1f70c73..061ae89b 100644
--- a/crates/hm-dsl-engine/harmont-py/harmont/__init__.py
+++ b/crates/hm-dsl-engine/harmont-py/harmont/__init__.py
@@ -52,6 +52,7 @@
     CacheNone,
     CacheOnChange,
     CachePolicy,
+    CachePredicate,
     CacheTTL,
 )
 from .triggers import pull_request, push
@@ -178,6 +179,25 @@ def compose(*policies: CachePolicy) -> CacheCompose:
     return CacheCompose(policies=tuple(policies))
 
 
+def predicate(fn: "Callable[[], str]") -> CachePredicate:
+    """Cache policy that invalidates when ``fn()`` returns a new value.
+
+    ``fn`` is called once at plan time. Its string return value is
+    folded into the cache key.
+
+    Example::
+
+        import json
+
+        def package_manager():
+            with open("package.json") as f:
+                return json.load(f).get("packageManager", "")
+
+        step.run("install", cache=hm.predicate(package_manager))
+    """
+    return CachePredicate(fn=fn)
+
+
 def timeout(duration: str | int | timedelta, step: Step) -> Step:
     """Apply a wall-clock timeout to a single step.
 
@@ -287,6 +307,7 @@ def group(steps: list[Step] | tuple[Step, ...]) -> tuple[Step, ...]:
     "CacheNone",
     "CacheOnChange",
     "CachePolicy",
+    "CachePredicate",
     "CacheTTL",
     "JsProject",
     "Pipeline",
@@ -306,6 +327,7 @@ def group(steps: list[Step] | tuple[Step, ...]) -> tuple[Step, ...]:
     "pipeline",
     "pipeline_to_json",
     "pr",
+    "predicate",
     "pull_request",
     "push",
     "py",
diff --git a/crates/hm-dsl-engine/harmont-py/harmont/_pipeline.py b/crates/hm-dsl-engine/harmont-py/harmont/_pipeline.py
index cf601d86..233cc04e 100644
--- a/crates/hm-dsl-engine/harmont-py/harmont/_pipeline.py
+++ b/crates/hm-dsl-engine/harmont-py/harmont/_pipeline.py
@@ -21,6 +21,7 @@
     CacheNone,
     CacheOnChange,
     CachePolicy,
+    CachePredicate,
     CacheTTL,
 )
 
@@ -245,6 +246,9 @@ def _cache_to_dict(policy: CachePolicy) -> dict[str, Any]:
             "policy": "compose",
             "sub_policies": [_cache_to_dict(p) for p in policy.policies],
         }
+    if isinstance(policy, CachePredicate):
+        value = str(policy.fn())
+        return {"policy": "predicate", "value": value}
     msg = f"unknown CachePolicy: {type(policy).__name__}"
     raise TypeError(msg)
 
diff --git a/crates/hm-dsl-engine/harmont-py/harmont/cache.py b/crates/hm-dsl-engine/harmont-py/harmont/cache.py
index 1b9da422..d597ea79 100644
--- a/crates/hm-dsl-engine/harmont-py/harmont/cache.py
+++ b/crates/hm-dsl-engine/harmont-py/harmont/cache.py
@@ -78,3 +78,14 @@ class CacheCompose(CachePolicy):
     """
 
     policies: tuple[CachePolicy, ...]
+
+
+@dataclass(frozen=True)
+class CachePredicate(CachePolicy):
+    """Cache keyed on the return value of a user-supplied function.
+
+    The function is called at plan time. If it returns a different value
+    than the previous run, the cache key changes and the step re-runs.
+    """
+
+    fn: object  # Callable[[], str] — called at plan time
diff --git a/crates/hm-dsl-engine/harmont-py/harmont/keygen.py b/crates/hm-dsl-engine/harmont-py/harmont/keygen.py
index f7285c33..14096ea7 100644
--- a/crates/hm-dsl-engine/harmont-py/harmont/keygen.py
+++ b/crates/hm-dsl-engine/harmont-py/harmont/keygen.py
@@ -130,6 +130,8 @@ def _resolve_policy(
             for sub in subs
         ]
         return "compose-" + _sha256_hex("".join(parts))
+    if kind == "predicate":
+        return "predicate-" + _sha256_hex(policy["value"])
     msg = f"resolve-policy-key: unknown policy {kind!r}"
     raise ValueError(msg)
 
diff --git a/crates/hm-dsl-engine/harmont-py/tests/test_cache_predicate.py b/crates/hm-dsl-engine/harmont-py/tests/test_cache_predicate.py
new file mode 100644
index 00000000..7cd08c38
--- /dev/null
+++ b/crates/hm-dsl-engine/harmont-py/tests/test_cache_predicate.py
@@ -0,0 +1,112 @@
+"""Tests for the ``predicate`` cache policy."""
+
+from __future__ import annotations
+
+import hashlib
+from pathlib import Path
+
+import harmont as hm
+from harmont._pipeline import _cache_to_dict
+from harmont.cache import CachePredicate
+from harmont.keygen import resolve_pipeline_keys
+
+NUL = "\x00"
+
+
+def _sha256_hex(s: str) -> str:
+    return hashlib.sha256(s.encode("utf-8")).hexdigest()
+
+
+def _make_graph(nodes, edges=None):
+    return {
+        "nodes": nodes,
+        "node_holes": [],
+        "edge_property": "directed",
+        "edges": edges or [],
+    }
+
+
+# -- factory -----------------------------------------------------------------
+
+def test_predicate_factory_returns_cache_predicate():
+    policy = hm.predicate(lambda: "v1")
+    assert isinstance(policy, CachePredicate)
+
+
+# -- lowering ----------------------------------------------------------------
+
+def test_predicate_lowering_calls_fn_and_produces_dict():
+    policy = hm.predicate(lambda: "v1")
+    d = _cache_to_dict(policy)
+    assert d == {"policy": "predicate", "value": "v1"}
+
+
+def test_predicate_lowering_stringifies_return_value():
+    policy = hm.predicate(lambda: 42)
+    d = _cache_to_dict(policy)
+    assert d == {"policy": "predicate", "value": "42"}
+
+
+def test_predicate_fn_is_called_at_lowering_time():
+    calls: list[int] = []
+
+    def counter():
+        calls.append(1)
+        return "x"
+
+    policy = hm.predicate(counter)
+    assert len(calls) == 0
+    _cache_to_dict(policy)
+    assert len(calls) == 1
+
+
+# -- keygen ------------------------------------------------------------------
+
+def test_predicate_keygen_produces_deterministic_key():
+    graph = _make_graph(
+        [
+            {
+                "step": {
+                    "key": "a",
+                    "cmd": "echo",
+                    "cache": {"policy": "predicate", "value": "v1"},
+                },
+                "env": {},
+            },
+        ]
+    )
+    out = resolve_pipeline_keys(
+        graph,
+        pipeline_org="default",
+        pipeline_slug="default",
+        now=0,
+        base_path=Path("/tmp"),  # noqa: S108
+        env={},
+    )
+    policy_res = "predicate-" + _sha256_hex("v1")
+    expected = _sha256_hex(
+        "default" + NUL + "default" + NUL + "a" + NUL + "scratch" + NUL + policy_res
+    )
+    assert out["nodes"][0]["step"]["cache"]["key"] == expected
+
+
+def test_different_predicate_values_produce_different_keys():
+    def make_graph(value):
+        return _make_graph(
+            [
+                {
+                    "step": {
+                        "key": "a",
+                        "cmd": "echo",
+                        "cache": {"policy": "predicate", "value": value},
+                    },
+                    "env": {},
+                },
+            ]
+        )
+
+    g1 = make_graph("v1")
+    g2 = make_graph("v2")
+    resolve_pipeline_keys(g1, pipeline_org="o", pipeline_slug="s", now=0, base_path=Path("/tmp"), env={})  # noqa: S108
+    resolve_pipeline_keys(g2, pipeline_org="o", pipeline_slug="s", now=0, base_path=Path("/tmp"), env={})  # noqa: S108
+    assert g1["nodes"][0]["step"]["cache"]["key"] != g2["nodes"][0]["step"]["cache"]["key"]
diff --git a/crates/hm-dsl-engine/harmont-ts/src/cache.ts b/crates/hm-dsl-engine/harmont-ts/src/cache.ts
index fc423c6d..47f1ab52 100644
--- a/crates/hm-dsl-engine/harmont-ts/src/cache.ts
+++ b/crates/hm-dsl-engine/harmont-ts/src/cache.ts
@@ -19,7 +19,12 @@ export interface CacheCompose {
   readonly policies: readonly CachePolicy[];
 }
 
-export type CachePolicy = CacheForever | CacheTTL | CacheOnChange | CacheCompose;
+export interface CachePredicate {
+  readonly kind: "predicate";
+  readonly value: string;
+}
+
+export type CachePolicy = CacheForever | CacheTTL | CacheOnChange | CacheCompose | CachePredicate;
 
 export function forever(opts?: { envKeys?: string[] }): CacheForever {
   return { kind: "forever", envKeys: opts?.envKeys ?? [] };
@@ -39,3 +44,7 @@ export function onChange(...paths: string[]): CacheOnChange {
 export function compose(...policies: CachePolicy[]): CacheCompose {
   return { kind: "compose", policies };
 }
+
+export function predicate(value: string): CachePredicate {
+  return { kind: "predicate", value };
+}
diff --git a/crates/hm-dsl-engine/harmont-ts/src/index.ts b/crates/hm-dsl-engine/harmont-ts/src/index.ts
index c950c850..e6306bfa 100644
--- a/crates/hm-dsl-engine/harmont-ts/src/index.ts
+++ b/crates/hm-dsl-engine/harmont-ts/src/index.ts
@@ -5,10 +5,12 @@ export {
   type CacheTTL,
   type CacheOnChange,
   type CacheCompose,
+  type CachePredicate,
   forever,
   ttl,
   onChange,
   compose,
+  predicate,
 } from "./cache.js";
 export {
   type Trigger,
diff --git a/crates/hm-dsl-engine/harmont-ts/src/keygen.ts b/crates/hm-dsl-engine/harmont-ts/src/keygen.ts
index 2867b3c5..19bfd34d 100644
--- a/crates/hm-dsl-engine/harmont-ts/src/keygen.ts
+++ b/crates/hm-dsl-engine/harmont-ts/src/keygen.ts
@@ -130,6 +130,10 @@ function resolvePolicy(
     return "compose-" + sha256hex(parts.join(""));
   }
 
+  if (policy === "predicate") {
+    return "predicate-" + sha256hex(cache.value as string);
+  }
+
   throw new Error(`resolve-policy-key: unknown policy "${policy}"`);
 }
 
diff --git a/crates/hm-dsl-engine/harmont-ts/src/pipeline.ts b/crates/hm-dsl-engine/harmont-ts/src/pipeline.ts
index a531fb92..cf6d6500 100644
--- a/crates/hm-dsl-engine/harmont-ts/src/pipeline.ts
+++ b/crates/hm-dsl-engine/harmont-ts/src/pipeline.ts
@@ -204,5 +204,7 @@ function cachePolicyToDict(policy: CachePolicy): Record<string, unknown> {
         policy: "compose",
         sub_policies: policy.policies.map(cachePolicyToDict),
       };
+    case "predicate":
+      return { policy: "predicate", value: policy.value };
   }
 }
diff --git a/crates/hm-dsl-engine/harmont-ts/tests/cache.test.ts b/crates/hm-dsl-engine/harmont-ts/tests/cache.test.ts
index cee168de..bb0310e4 100644
--- a/crates/hm-dsl-engine/harmont-ts/tests/cache.test.ts
+++ b/crates/hm-dsl-engine/harmont-ts/tests/cache.test.ts
@@ -1,5 +1,5 @@
 import { describe, expect, it } from "vitest";
-import { forever, ttl, onChange, compose, type CachePolicy } from "../src/cache.js";
+import { forever, ttl, onChange, compose, predicate, type CachePolicy } from "../src/cache.js";
 
 describe("forever", () => {
   it("creates a forever policy with no env keys", () => {
@@ -42,6 +42,19 @@ describe("compose", () => {
   });
 });
 
+describe("predicate", () => {
+  it("creates a predicate policy with the given value", () => {
+    const p = predicate("v1");
+    expect(p).toEqual({ kind: "predicate", value: "v1" });
+  });
+
+  it("different values produce distinct objects", () => {
+    const p1 = predicate("v1");
+    const p2 = predicate("v2");
+    expect(p1.value).not.toBe(p2.value);
+  });
+});
+
 describe("type discrimination", () => {
   it("kind field enables type narrowing", () => {
     const p: CachePolicy = forever();
diff --git a/crates/hm-dsl-engine/harmont-ts/tests/keygen.test.ts b/crates/hm-dsl-engine/harmont-ts/tests/keygen.test.ts
index 9cfa1c1c..407f826f 100644
--- a/crates/hm-dsl-engine/harmont-ts/tests/keygen.test.ts
+++ b/crates/hm-dsl-engine/harmont-ts/tests/keygen.test.ts
@@ -6,7 +6,7 @@ import { describe, expect, it, beforeEach } from "vitest";
 import { resolvePipelineCacheKeys, type CacheKeyOptions } from "../src/keygen.js";
 import { pipeline, type PipelineIR } from "../src/pipeline.js";
 import { sh } from "../src/step.js";
-import { forever, ttl, onChange } from "../src/cache.js";
+import { forever, ttl, onChange, predicate } from "../src/cache.js";
 
 function sha256(s: string): string {
   return createHash("sha256").update(s, "utf8").digest("hex");
@@ -153,6 +153,35 @@ describe("resolvePipelineCacheKeys", () => {
     expect(cache.key).toBe(expected);
   });
 
+  it("predicate policy produces correct key", () => {
+    const ir = pipeline([sh("echo test", { label: "test", cache: predicate("v1") })]);
+    const opts = makeOpts({ pipelineOrg: "myorg", pipelineSlug: "myslug" });
+
+    resolvePipelineCacheKeys(ir.graph, opts);
+
+    const stepKey = ir.graph.nodes[0].step.key as string;
+    const policyRes = "predicate-" + sha256("v1");
+    const expected = sha256(
+      "myorg" + NUL + "myslug" + NUL + stepKey + NUL + "scratch" + NUL + policyRes,
+    );
+
+    const cache = ir.graph.nodes[0].step.cache as Record<string, unknown>;
+    expect(cache.key).toBe(expected);
+  });
+
+  it("different predicate values produce different keys", () => {
+    const ir1 = pipeline([sh("echo a", { label: "a", cache: predicate("v1") })]);
+    const ir2 = pipeline([sh("echo a", { label: "a", cache: predicate("v2") })]);
+    const opts = makeOpts();
+
+    resolvePipelineCacheKeys(ir1.graph, opts);
+    resolvePipelineCacheKeys(ir2.graph, opts);
+
+    const k1 = (ir1.graph.nodes[0].step.cache as Record<string, unknown>).key;
+    const k2 = (ir2.graph.nodes[0].step.cache as Record<string, unknown>).key;
+    expect(k1).not.toBe(k2);
+  });
+
   it("child step uses parent resolved key", () => {
     const base = sh("apt-get install", { label: "apt", cache: forever() });
     const child = base.sh("make", { label: "build", cache: forever() });

From 1b0bdc9b0464b4dc989d0cb169732759866a32e7 Mon Sep 17 00:00:00 2001
From: Marko Vejnovic <marko@harmont.dev>
Date: Tue, 9 Jun 2026 11:04:22 -0700
Subject: [PATCH 11/15] fix(dsl): resolve type checker issues in predicate
 cache policy

---
 crates/hm-dsl-engine/harmont-py/harmont/__init__.py  | 1 +
 crates/hm-dsl-engine/harmont-py/harmont/_pipeline.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/crates/hm-dsl-engine/harmont-py/harmont/__init__.py b/crates/hm-dsl-engine/harmont-py/harmont/__init__.py
index 061ae89b..817aa0af 100644
--- a/crates/hm-dsl-engine/harmont-py/harmont/__init__.py
+++ b/crates/hm-dsl-engine/harmont-py/harmont/__init__.py
@@ -60,6 +60,7 @@
 from .types import Pipeline
 
 if TYPE_CHECKING:
+    from collections.abc import Callable
     from datetime import timedelta
 
 
diff --git a/crates/hm-dsl-engine/harmont-py/harmont/_pipeline.py b/crates/hm-dsl-engine/harmont-py/harmont/_pipeline.py
index 233cc04e..4ee8b79a 100644
--- a/crates/hm-dsl-engine/harmont-py/harmont/_pipeline.py
+++ b/crates/hm-dsl-engine/harmont-py/harmont/_pipeline.py
@@ -247,7 +247,7 @@ def _cache_to_dict(policy: CachePolicy) -> dict[str, Any]:
             "sub_policies": [_cache_to_dict(p) for p in policy.policies],
         }
     if isinstance(policy, CachePredicate):
-        value = str(policy.fn())
+        value = str(policy.fn())  # type: ignore[operator]
         return {"policy": "predicate", "value": value}
     msg = f"unknown CachePolicy: {type(policy).__name__}"
     raise TypeError(msg)

From 007a23060466ce53b3161f3548f7a3bee504f847 Mon Sep 17 00:00:00 2001
From: Marko Vejnovic <marko@harmont.dev>
Date: Tue, 9 Jun 2026 11:06:28 -0700
Subject: [PATCH 12/15] style: cargo fmt + clippy fixes across workspace

- Format all Rust files
- Collapse nested if-let to tuple pattern (clippy)
- Replace map().unwrap_or() with map_or() (clippy)
- Add #[doc] Errors section to cow_copy (clippy pedantic)
---
 crates/hm-exec/src/local/runner/vm.rs |  4 +-
 crates/hm-exec/src/local/scheduler.rs |  5 ++-
 crates/hm-vm/src/backend.rs           | 14 +++++-
 crates/hm-vm/src/docker.rs            | 32 ++++++++++----
 crates/hm-vm/src/lib.rs               |  4 +-
 crates/hm-vm/src/vm.rs                | 61 ++++++++++++++++++---------
 crates/hm-vm/src/workspace.rs         | 24 ++++++++---
 crates/hm/src/main.rs                 |  4 +-
 8 files changed, 104 insertions(+), 44 deletions(-)

diff --git a/crates/hm-exec/src/local/runner/vm.rs b/crates/hm-exec/src/local/runner/vm.rs
index af1fa883..e08b018f 100644
--- a/crates/hm-exec/src/local/runner/vm.rs
+++ b/crates/hm-exec/src/local/runner/vm.rs
@@ -85,8 +85,8 @@ async fn run_step_vm(vm: &HmVm, ctx: &StepContext, input: ExecutorInput) -> Resu
             .archives
             .get_bytes(input.workspace_archive_id)
             .ok_or_else(|| anyhow::anyhow!("source archive not found"))?;
-        let base_dir = extract_archive_to_tempdir(&archive_bytes)
-            .context("extracting workspace archive")?;
+        let base_dir =
+            extract_archive_to_tempdir(&archive_bytes).context("extracting workspace archive")?;
         hm_vm::workspace::cow_copy(base_dir.path(), step_ws.path())
             .context("COW copy source to workspace")?;
     }
diff --git a/crates/hm-exec/src/local/scheduler.rs b/crates/hm-exec/src/local/scheduler.rs
index ea27187d..75f00eb8 100644
--- a/crates/hm-exec/src/local/scheduler.rs
+++ b/crates/hm-exec/src/local/scheduler.rs
@@ -223,8 +223,9 @@ pub(crate) async fn run(
                 .iter()
                 .zip(&pred_outcomes)
                 .find(|((ek, _), _)| *ek == EdgeKind::BuildsIn)
-                .map(|(_, outcome)| (outcome.snapshot.clone(), outcome.workspace_dir.clone()))
-                .unwrap_or((None, None));
+                .map_or((None, None), |(_, outcome)| {
+                    (outcome.snapshot.clone(), outcome.workspace_dir.clone())
+                });
 
             let mut step_ctx = run_ctx.clone();
             step_ctx.parent_workspace_dir = parent_workspace_dir;
diff --git a/crates/hm-vm/src/backend.rs b/crates/hm-vm/src/backend.rs
index 6d6e8428..1401e617 100644
--- a/crates/hm-vm/src/backend.rs
+++ b/crates/hm-vm/src/backend.rs
@@ -11,10 +11,20 @@ use crate::types::{OutputSink, SnapshotId, VmConfig, WorkspaceMount};
 #[async_trait]
 pub trait VmBackend: Send + Sync + fmt::Debug {
     /// Boot a new VM from the given OCI image reference.
-    async fn create(&self, image: &str, config: &VmConfig, workspace: Option<&WorkspaceMount>) -> Result<Box<dyn Vm>>;
+    async fn create(
+        &self,
+        image: &str,
+        config: &VmConfig,
+        workspace: Option<&WorkspaceMount>,
+    ) -> Result<Box<dyn Vm>>;
 
     /// Restore a VM from a previously taken snapshot.
-    async fn restore(&self, snapshot: &SnapshotId, config: &VmConfig, workspace: Option<&WorkspaceMount>) -> Result<Box<dyn Vm>>;
+    async fn restore(
+        &self,
+        snapshot: &SnapshotId,
+        config: &VmConfig,
+        workspace: Option<&WorkspaceMount>,
+    ) -> Result<Box<dyn Vm>>;
 
     /// Check whether a snapshot exists in the backend store.
     async fn snapshot_exists(&self, snapshot: &SnapshotId) -> Result<bool>;
diff --git a/crates/hm-vm/src/docker.rs b/crates/hm-vm/src/docker.rs
index 007168b4..062feb02 100644
--- a/crates/hm-vm/src/docker.rs
+++ b/crates/hm-vm/src/docker.rs
@@ -78,12 +78,18 @@ impl DockerBackend {
     }
 
     #[instrument(skip(self))]
-    async fn start_container(&self, image: &str, workspace: Option<&WorkspaceMount>) -> Result<String> {
-        let host_config = workspace.map(|ws| {
-            bollard::service::HostConfig {
-                binds: Some(vec![format!("{}:{}:rw", ws.host_path.display(), ws.guest_path)]),
-                ..Default::default()
-            }
+    async fn start_container(
+        &self,
+        image: &str,
+        workspace: Option<&WorkspaceMount>,
+    ) -> Result<String> {
+        let host_config = workspace.map(|ws| bollard::service::HostConfig {
+            binds: Some(vec![format!(
+                "{}:{}:rw",
+                ws.host_path.display(),
+                ws.guest_path
+            )]),
+            ..Default::default()
         });
         let cfg = Config {
             image: Some(image.to_string()),
@@ -107,7 +113,12 @@ impl DockerBackend {
 #[async_trait]
 impl VmBackend for DockerBackend {
     #[instrument(skip(self, _config, workspace))]
-    async fn create(&self, image: &str, _config: &VmConfig, workspace: Option<&WorkspaceMount>) -> Result<Box<dyn Vm>> {
+    async fn create(
+        &self,
+        image: &str,
+        _config: &VmConfig,
+        workspace: Option<&WorkspaceMount>,
+    ) -> Result<Box<dyn Vm>> {
         self.ensure_image(image).await?;
         let container_id = self.start_container(image, workspace).await?;
         Ok(Box::new(DockerVm {
@@ -117,7 +128,12 @@ impl VmBackend for DockerBackend {
     }
 
     #[instrument(skip(self, _config, workspace))]
-    async fn restore(&self, snapshot: &SnapshotId, _config: &VmConfig, workspace: Option<&WorkspaceMount>) -> Result<Box<dyn Vm>> {
+    async fn restore(
+        &self,
+        snapshot: &SnapshotId,
+        _config: &VmConfig,
+        workspace: Option<&WorkspaceMount>,
+    ) -> Result<Box<dyn Vm>> {
         let container_id = self.start_container(&snapshot.0, workspace).await?;
         Ok(Box::new(DockerVm {
             client: self.client.clone(),
diff --git a/crates/hm-vm/src/lib.rs b/crates/hm-vm/src/lib.rs
index ab5d028b..72a4e713 100644
--- a/crates/hm-vm/src/lib.rs
+++ b/crates/hm-vm/src/lib.rs
@@ -13,7 +13,7 @@ pub mod docker;
 pub use backend::{Vm, VmBackend};
 pub use registry::ImageRegistry;
 pub use types::{
-    Action, CachingPolicy, ExecutionResult, ImageSource, NullSink, OutputSink, SnapshotId, VmConfig,
-    WorkspaceMount,
+    Action, CachingPolicy, ExecutionResult, ImageSource, NullSink, OutputSink, SnapshotId,
+    VmConfig, WorkspaceMount,
 };
 pub use vm::HmVm;
diff --git a/crates/hm-vm/src/vm.rs b/crates/hm-vm/src/vm.rs
index 07f7aefe..2aa8bcf0 100644
--- a/crates/hm-vm/src/vm.rs
+++ b/crates/hm-vm/src/vm.rs
@@ -57,7 +57,9 @@ impl HmVm {
             && let Some(snap) = self.registry.get(key)
         {
             if self.backend.snapshot_exists(&snap).await? {
-                let ws_dir = self.registry.get_with_workspace(key)
+                let ws_dir = self
+                    .registry
+                    .get_with_workspace(key)
                     .and_then(|(_, ws)| ws)
                     .map(PathBuf::from);
                 return Ok(ExecutionResult {
@@ -72,8 +74,16 @@ impl HmVm {
 
         // 2. Create or restore VM
         let mut vm = match &action.source {
-            ImageSource::Image(image) => self.backend.create(image, &self.config, action.workspace.as_ref()).await?,
-            ImageSource::Snapshot(snap) => self.backend.restore(snap, &self.config, action.workspace.as_ref()).await?,
+            ImageSource::Image(image) => {
+                self.backend
+                    .create(image, &self.config, action.workspace.as_ref())
+                    .await?
+            }
+            ImageSource::Snapshot(snap) => {
+                self.backend
+                    .restore(snap, &self.config, action.workspace.as_ref())
+                    .await?
+            }
         };
 
         let result = self.run_in_vm(&mut *vm, &action, &policy, sink).await;
@@ -115,24 +125,27 @@ impl HmVm {
             let snap = vm.snapshot(label).await?;
 
             // Persist workspace to cache directory.
-            if let Some(ref ws) = action.workspace {
-                if let Some(ref cache_dir) = self.config.workspace_cache_dir {
-                    let ws_cache = cache_dir.join(label);
-                    if ws_cache.exists() {
-                        std::fs::remove_dir_all(&ws_cache).ok();
-                    }
-                    std::fs::create_dir_all(&ws_cache)?;
-                    crate::workspace::cow_copy(&ws.host_path, &ws_cache)?;
-                    workspace_dir = Some(ws_cache);
+            if let (Some(ws), Some(cache_dir)) = (
+                action.workspace.as_ref(),
+                self.config.workspace_cache_dir.as_ref(),
+            ) {
+                let ws_cache = cache_dir.join(label);
+                if ws_cache.exists() {
+                    std::fs::remove_dir_all(&ws_cache).ok();
                 }
+                std::fs::create_dir_all(&ws_cache)?;
+                crate::workspace::cow_copy(&ws.host_path, &ws_cache)?;
+                workspace_dir = Some(ws_cache);
             }
 
             if let CachingPolicy::Cache { key } = policy {
-                let evicted = if let Some(ref ws) = workspace_dir {
-                    self.registry.put_with_workspace(key, &snap, &ws.display().to_string())
-                } else {
-                    self.registry.put(key, &snap)
-                };
+                let evicted = workspace_dir.as_ref().map_or_else(
+                    || self.registry.put(key, &snap),
+                    |ws| {
+                        self.registry
+                            .put_with_workspace(key, &snap, &ws.display().to_string())
+                    },
+                );
                 for (old_snap, old_ws) in &evicted {
                     if let Err(e) = self.backend.remove_snapshot(old_snap).await {
                         warn!(snapshot = %old_snap.0, error = %e, "failed to remove evicted snapshot");
@@ -193,7 +206,12 @@ mod tests {
 
     #[async_trait]
     impl VmBackend for MockBackend {
-        async fn create(&self, image: &str, _config: &VmConfig, _workspace: Option<&WorkspaceMount>) -> Result<Box<dyn Vm>> {
+        async fn create(
+            &self,
+            image: &str,
+            _config: &VmConfig,
+            _workspace: Option<&WorkspaceMount>,
+        ) -> Result<Box<dyn Vm>> {
             self.calls
                 .lock()
                 .map_or_else(|_| {}, |mut c| c.push(format!("create:{image}")));
@@ -203,7 +221,12 @@ mod tests {
             }))
         }
 
-        async fn restore(&self, snapshot: &SnapshotId, _config: &VmConfig, _workspace: Option<&WorkspaceMount>) -> Result<Box<dyn Vm>> {
+        async fn restore(
+            &self,
+            snapshot: &SnapshotId,
+            _config: &VmConfig,
+            _workspace: Option<&WorkspaceMount>,
+        ) -> Result<Box<dyn Vm>> {
             self.calls
                 .lock()
                 .map_or_else(|_| {}, |mut c| c.push(format!("restore:{}", snapshot.0)));
diff --git a/crates/hm-vm/src/workspace.rs b/crates/hm-vm/src/workspace.rs
index 794e6035..44923d5f 100644
--- a/crates/hm-vm/src/workspace.rs
+++ b/crates/hm-vm/src/workspace.rs
@@ -9,14 +9,15 @@ use anyhow::{Context, Result, ensure};
 ///
 /// macOS APFS: `cp -cR` with full-copy fallback for cross-volume.
 /// Linux: `cp --reflink=auto -a` (COW on btrfs/XFS, full copy on ext4).
+///
+/// # Errors
+///
+/// Returns an error if `cp` cannot be spawned or exits with a non-zero status.
 pub fn cow_copy(src: &Path, dst: &Path) -> Result<()> {
     let src_dot = format!("{}/.", src.display());
 
     let status = if cfg!(target_os = "macos") {
-        let st = Command::new("cp")
-            .args(["-cR", &src_dot])
-            .arg(dst)
-            .status();
+        let st = Command::new("cp").args(["-cR", &src_dot]).arg(dst).status();
         match st {
             Ok(s) if s.success() => s,
             _ => Command::new("cp")
@@ -53,11 +54,20 @@ mod tests {
         let dst = tempdir().unwrap();
         cow_copy(src.path(), dst.path()).unwrap();
 
-        assert_eq!(fs::read_to_string(dst.path().join("file.txt")).unwrap(), "original");
-        assert_eq!(fs::read_to_string(dst.path().join("sub/nested.txt")).unwrap(), "nested");
+        assert_eq!(
+            fs::read_to_string(dst.path().join("file.txt")).unwrap(),
+            "original"
+        );
+        assert_eq!(
+            fs::read_to_string(dst.path().join("sub/nested.txt")).unwrap(),
+            "nested"
+        );
 
         fs::write(dst.path().join("file.txt"), "modified").unwrap();
-        assert_eq!(fs::read_to_string(src.path().join("file.txt")).unwrap(), "original");
+        assert_eq!(
+            fs::read_to_string(src.path().join("file.txt")).unwrap(),
+            "original"
+        );
     }
 
     #[test]
diff --git a/crates/hm/src/main.rs b/crates/hm/src/main.rs
index f4d936b6..e5b268a6 100644
--- a/crates/hm/src/main.rs
+++ b/crates/hm/src/main.rs
@@ -42,8 +42,8 @@ async fn main() {
 
         let (chrome_layer, guard) = build_chrome_layer(trace_path);
 
-        let indicatif_filter = Targets::new()
-            .with_target(hm_render::progress::TUI_TARGET, tracing::Level::TRACE);
+        let indicatif_filter =
+            Targets::new().with_target(hm_render::progress::TUI_TARGET, tracing::Level::TRACE);
 
         tracing_subscriber::registry()
             .with(

From 511b5fa45a7e19c5814ee806e54c14eed60343c1 Mon Sep 17 00:00:00 2001
From: Marko Vejnovic <marko@harmont.dev>
Date: Tue, 9 Jun 2026 11:25:04 -0700
Subject: [PATCH 13/15] fix: eliminate ephemeral workspace race and review
 fixes

- Skip workspace persistence for CachingPolicy::None (fixes parallel
  uncached steps racing on shared "ephemeral" dir)
- Runner keeps step tempdir alive via TempDir::keep() for uncached
  steps so children can still COW-copy from parent workspace
- Single get_with_workspace() call on cache hit (was double lookup)
- invalidate() now returns workspace_dir for cleanup (was leaking
  orphaned workspace dirs on disk)
- Format test_cache_predicate.py with ruff
---
 .../harmont-py/tests/test_cache_predicate.py  | 21 ++++++++-
 crates/hm-exec/src/local/runner/vm.rs         | 10 +++-
 crates/hm-vm/src/registry.rs                  | 19 ++++----
 crates/hm-vm/src/vm.rs                        | 46 ++++++++++---------
 4 files changed, 62 insertions(+), 34 deletions(-)

diff --git a/crates/hm-dsl-engine/harmont-py/tests/test_cache_predicate.py b/crates/hm-dsl-engine/harmont-py/tests/test_cache_predicate.py
index 7cd08c38..1a829552 100644
--- a/crates/hm-dsl-engine/harmont-py/tests/test_cache_predicate.py
+++ b/crates/hm-dsl-engine/harmont-py/tests/test_cache_predicate.py
@@ -28,6 +28,7 @@ def _make_graph(nodes, edges=None):
 
 # -- factory -----------------------------------------------------------------
 
+
 def test_predicate_factory_returns_cache_predicate():
     policy = hm.predicate(lambda: "v1")
     assert isinstance(policy, CachePredicate)
@@ -35,6 +36,7 @@ def test_predicate_factory_returns_cache_predicate():
 
 # -- lowering ----------------------------------------------------------------
 
+
 def test_predicate_lowering_calls_fn_and_produces_dict():
     policy = hm.predicate(lambda: "v1")
     d = _cache_to_dict(policy)
@@ -62,6 +64,7 @@ def counter():
 
 # -- keygen ------------------------------------------------------------------
 
+
 def test_predicate_keygen_produces_deterministic_key():
     graph = _make_graph(
         [
@@ -107,6 +110,20 @@ def make_graph(value):
 
     g1 = make_graph("v1")
     g2 = make_graph("v2")
-    resolve_pipeline_keys(g1, pipeline_org="o", pipeline_slug="s", now=0, base_path=Path("/tmp"), env={})  # noqa: S108
-    resolve_pipeline_keys(g2, pipeline_org="o", pipeline_slug="s", now=0, base_path=Path("/tmp"), env={})  # noqa: S108
+    resolve_pipeline_keys(
+        g1,
+        pipeline_org="o",
+        pipeline_slug="s",
+        now=0,
+        base_path=Path("/tmp"),
+        env={},  # noqa: S108
+    )
+    resolve_pipeline_keys(
+        g2,
+        pipeline_org="o",
+        pipeline_slug="s",
+        now=0,
+        base_path=Path("/tmp"),
+        env={},  # noqa: S108
+    )
     assert g1["nodes"][0]["step"]["cache"]["key"] != g2["nodes"][0]["step"]["cache"]["key"]
diff --git a/crates/hm-exec/src/local/runner/vm.rs b/crates/hm-exec/src/local/runner/vm.rs
index e08b018f..3f9bb77f 100644
--- a/crates/hm-exec/src/local/runner/vm.rs
+++ b/crates/hm-exec/src/local/runner/vm.rs
@@ -144,11 +144,19 @@ async fn run_step_vm(vm: &HmVm, ctx: &StepContext, input: ExecutorInput) -> Resu
         });
     }
 
+    // For cached steps HmVm persists the workspace to the cache dir and
+    // returns it. For uncached steps workspace_dir is None — keep the
+    // step tempdir alive so children can COW-copy from it.
+    let workspace_dir = result
+        .workspace_dir
+        .map(|p| p.display().to_string())
+        .or_else(|| (result.exit_code == 0).then(|| step_ws.keep().display().to_string()));
+
     Ok(StepResult {
         exit_code: result.exit_code,
         committed_snapshot: result.snapshot.map(|s| SnapshotRef(s.0)),
         artifacts: vec![],
-        workspace_dir: result.workspace_dir.map(|p| p.display().to_string()),
+        workspace_dir,
     })
 }
 
diff --git a/crates/hm-vm/src/registry.rs b/crates/hm-vm/src/registry.rs
index 205c8071..6286abf8 100644
--- a/crates/hm-vm/src/registry.rs
+++ b/crates/hm-vm/src/registry.rs
@@ -179,26 +179,27 @@ impl ImageRegistry {
 
     /// Remove a specific entry.
     ///
-    /// Returns the removed snapshot's ID so the caller can clean up backend
-    /// resources, or `None` if the key was not present.
+    /// Returns the removed snapshot's ID and workspace directory so the
+    /// caller can clean up backend resources, or `None` if the key was
+    /// not present.
     #[must_use]
-    pub fn invalidate(&self, key: &str) -> Option<SnapshotId> {
+    pub fn invalidate(&self, key: &str) -> Option<(SnapshotId, Option<String>)> {
         let conn = self.conn.lock().ok()?;
 
-        let snapshot: Option<String> = conn
+        let row: Option<(String, Option<String>)> = conn
             .query_row(
-                "SELECT snapshot_id FROM snapshots WHERE key = ?1",
+                "SELECT snapshot_id, workspace_dir FROM snapshots WHERE key = ?1",
                 [key],
-                |row| row.get(0),
+                |row| Ok((row.get(0)?, row.get(1)?)),
             )
             .ok();
 
-        if snapshot.is_some() {
+        if row.is_some() {
             let _ = conn.execute("DELETE FROM snapshots WHERE key = ?1", [key]);
         }
 
         drop(conn);
-        snapshot.map(SnapshotId)
+        row.map(|(snap, ws)| (SnapshotId(snap), ws))
     }
 
     /// Returns the number of cached entries.
@@ -369,7 +370,7 @@ mod tests {
         reg.put("to-remove", &snap);
 
         let removed = reg.invalidate("to-remove");
-        assert_eq!(removed, Some(SnapshotId("snap-rm".into())));
+        assert_eq!(removed, Some((SnapshotId("snap-rm".into()), None)));
         assert!(reg.get("to-remove").is_none());
         assert_eq!(reg.len(), 0);
 
diff --git a/crates/hm-vm/src/vm.rs b/crates/hm-vm/src/vm.rs
index 2aa8bcf0..9e7690a9 100644
--- a/crates/hm-vm/src/vm.rs
+++ b/crates/hm-vm/src/vm.rs
@@ -54,22 +54,24 @@ impl HmVm {
     ) -> Result<ExecutionResult> {
         // 1. Cache check
         if let CachingPolicy::Cache { ref key } = policy
-            && let Some(snap) = self.registry.get(key)
+            && let Some((snap, ws)) = self.registry.get_with_workspace(key)
         {
             if self.backend.snapshot_exists(&snap).await? {
-                let ws_dir = self
-                    .registry
-                    .get_with_workspace(key)
-                    .and_then(|(_, ws)| ws)
-                    .map(PathBuf::from);
                 return Ok(ExecutionResult {
                     exit_code: 0,
                     snapshot: Some(snap),
                     cached: true,
-                    workspace_dir: ws_dir,
+                    workspace_dir: ws.map(PathBuf::from),
                 });
             }
-            let _ = self.registry.invalidate(key);
+            if let Some((old_snap, old_ws)) = self.registry.invalidate(key) {
+                if let Err(e) = self.backend.remove_snapshot(&old_snap).await {
+                    warn!(snapshot = %old_snap.0, error = %e, "failed to remove stale snapshot");
+                }
+                if let Some(ws_path) = old_ws {
+                    std::fs::remove_dir_all(ws_path).ok();
+                }
+            }
         }
 
         // 2. Create or restore VM
@@ -118,27 +120,27 @@ impl HmVm {
 
         // 4. Snapshot and cache on success
         let snapshot = if exit_code == 0 {
-            let label = match policy {
+            let label = match &policy {
                 CachingPolicy::Cache { key } => key.as_str(),
                 CachingPolicy::None => "ephemeral",
             };
             let snap = vm.snapshot(label).await?;
 
-            // Persist workspace to cache directory.
-            if let (Some(ws), Some(cache_dir)) = (
-                action.workspace.as_ref(),
-                self.config.workspace_cache_dir.as_ref(),
-            ) {
-                let ws_cache = cache_dir.join(label);
-                if ws_cache.exists() {
-                    std::fs::remove_dir_all(&ws_cache).ok();
+            if let CachingPolicy::Cache { key } = &policy {
+                // Persist workspace to cache directory.
+                if let (Some(ws), Some(cache_dir)) = (
+                    action.workspace.as_ref(),
+                    self.config.workspace_cache_dir.as_ref(),
+                ) {
+                    let ws_cache = cache_dir.join(key.as_str());
+                    if ws_cache.exists() {
+                        std::fs::remove_dir_all(&ws_cache).ok();
+                    }
+                    std::fs::create_dir_all(&ws_cache)?;
+                    crate::workspace::cow_copy(&ws.host_path, &ws_cache)?;
+                    workspace_dir = Some(ws_cache);
                 }
-                std::fs::create_dir_all(&ws_cache)?;
-                crate::workspace::cow_copy(&ws.host_path, &ws_cache)?;
-                workspace_dir = Some(ws_cache);
-            }
 
-            if let CachingPolicy::Cache { key } = policy {
                 let evicted = workspace_dir.as_ref().map_or_else(
                     || self.registry.put(key, &snap),
                     |ws| {

From 4acc7895b063d8bf3b4b0d9ea3693bd4a12ec203 Mon Sep 17 00:00:00 2001
From: Marko Vejnovic <marko@harmont.dev>
Date: Tue, 9 Jun 2026 12:02:07 -0700
Subject: [PATCH 14/15] fix: harden COW workspace lifecycle and address review
 findings

- Registry: unify put/put_with_workspace into single put(key, snap, ws);
  fold eviction into single lock scope to fix TOCTOU race (C3+C4)
- Ephemeral snapshots: UUID-based labels to avoid parallel collision (C1);
  scheduler cleanup removes Docker images and temp dirs after build (C2)
- Cache hits: peek_cache() in VmRunner skips expensive COW copy (I3);
  validate workspace_dir still exists on disk before trusting cache (I5)
- Blocking I/O: wrap cow_copy, remove_dir_all, create_dir_all in
  spawn_blocking to avoid starving the tokio runtime (I2)
- workspace.rs: clear partial state between failed APFS clone and
  fallback copy (I6); use -p flag to preserve permissions (I7);
  include src/dst paths in error messages
- Python DSL: validate predicate callback is callable; wrap exceptions
  and reject None returns (I8)
---
 Cargo.lock                                    |  1 +
 .../harmont-py/harmont/__init__.py            |  3 +
 .../harmont-py/harmont/_pipeline.py           | 11 ++-
 crates/hm-exec/src/local/backend.rs           |  2 +
 crates/hm-exec/src/local/runner/mod.rs        |  1 +
 crates/hm-exec/src/local/runner/vm.rs         | 29 ++++++
 crates/hm-exec/src/local/scheduler.rs         | 32 +++++++
 crates/hm-plugin-protocol/src/executor.rs     |  5 +
 crates/hm-vm/Cargo.toml                       |  1 +
 crates/hm-vm/src/registry.rs                  | 86 +++++++----------
 crates/hm-vm/src/types.rs                     |  3 +
 crates/hm-vm/src/vm.rs                        | 96 ++++++++++++++-----
 crates/hm-vm/src/workspace.rs                 | 47 +++++++--
 13 files changed, 231 insertions(+), 86 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b59a146b..75de2c3b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1364,6 +1364,7 @@ dependencies = [
  "tempfile",
  "tokio",
  "tracing",
+ "uuid",
 ]
 
 [[package]]
diff --git a/crates/hm-dsl-engine/harmont-py/harmont/__init__.py b/crates/hm-dsl-engine/harmont-py/harmont/__init__.py
index 817aa0af..0f7e0ac7 100644
--- a/crates/hm-dsl-engine/harmont-py/harmont/__init__.py
+++ b/crates/hm-dsl-engine/harmont-py/harmont/__init__.py
@@ -196,6 +196,9 @@ def package_manager():
 
         step.run("install", cache=hm.predicate(package_manager))
     """
+    if not callable(fn):
+        msg = f"hm.predicate() requires a callable, got {type(fn).__name__}"
+        raise TypeError(msg)
     return CachePredicate(fn=fn)
 
 
diff --git a/crates/hm-dsl-engine/harmont-py/harmont/_pipeline.py b/crates/hm-dsl-engine/harmont-py/harmont/_pipeline.py
index 4ee8b79a..62cc15f1 100644
--- a/crates/hm-dsl-engine/harmont-py/harmont/_pipeline.py
+++ b/crates/hm-dsl-engine/harmont-py/harmont/_pipeline.py
@@ -247,8 +247,15 @@ def _cache_to_dict(policy: CachePolicy) -> dict[str, Any]:
             "sub_policies": [_cache_to_dict(p) for p in policy.policies],
         }
     if isinstance(policy, CachePredicate):
-        value = str(policy.fn())  # type: ignore[operator]
-        return {"policy": "predicate", "value": value}
+        try:
+            value = policy.fn()  # type: ignore[operator]
+        except Exception as exc:
+            msg = f"hm.predicate() callback raised {type(exc).__name__}: {exc}"
+            raise ValueError(msg) from exc
+        if value is None:
+            msg = "hm.predicate() callback returned None; it must return a string"
+            raise ValueError(msg)
+        return {"policy": "predicate", "value": str(value)}
     msg = f"unknown CachePolicy: {type(policy).__name__}"
     raise TypeError(msg)
 
diff --git a/crates/hm-exec/src/local/backend.rs b/crates/hm-exec/src/local/backend.rs
index 2fb7683e..67a10bfe 100644
--- a/crates/hm-exec/src/local/backend.rs
+++ b/crates/hm-exec/src/local/backend.rs
@@ -84,6 +84,7 @@ impl ExecutionBackend for LocalBackend {
         let cancel = CancellationToken::new();
         let parallelism = self.parallelism;
         let token = cancel.clone();
+        let backend = Arc::clone(&self.vm_backend);
         let join = tokio::spawn(async move {
             crate::local::run(
                 req.plan.graph,
@@ -93,6 +94,7 @@ impl ExecutionBackend for LocalBackend {
                 registry,
                 tx,
                 token,
+                Some(backend),
             )
             .await
         });
diff --git a/crates/hm-exec/src/local/runner/mod.rs b/crates/hm-exec/src/local/runner/mod.rs
index d69c84fd..052f6909 100644
--- a/crates/hm-exec/src/local/runner/mod.rs
+++ b/crates/hm-exec/src/local/runner/mod.rs
@@ -157,6 +157,7 @@ mod tests {
                     committed_snapshot: None,
                     artifacts: vec![],
                     workspace_dir: None,
+                    ephemeral_snapshot: false,
                 })
             })
         }
diff --git a/crates/hm-exec/src/local/runner/vm.rs b/crates/hm-exec/src/local/runner/vm.rs
index 3f9bb77f..e0336904 100644
--- a/crates/hm-exec/src/local/runner/vm.rs
+++ b/crates/hm-exec/src/local/runner/vm.rs
@@ -51,6 +51,7 @@ impl StepRunner for VmRunner {
     }
 }
 
+#[allow(clippy::too_many_lines)]
 #[tracing::instrument(skip(vm, ctx), fields(step_key = %input.step.key))]
 async fn run_step_vm(vm: &HmVm, ctx: &StepContext, input: ExecutorInput) -> Result<StepResult> {
     let policy = match &input.cache_lookup {
@@ -60,6 +61,33 @@ async fn run_step_vm(vm: &HmVm, ctx: &StepContext, input: ExecutorInput) -> Resu
         CacheDecision::MissNoCommit => CachingPolicy::None,
     };
 
+    // Fast path: check cache before doing any workspace prep. COW copies
+    // are expensive and entirely wasted on cache hits.
+    if let CachingPolicy::Cache { ref key } = policy
+        && let Some(result) = vm.peek_cache(key).await?
+    {
+        ctx.event_bus.emit(BuildEvent::StepCacheHit {
+            step_id: input.step_id,
+            key: input
+                .step
+                .cache
+                .as_ref()
+                .and_then(|c| c.key.clone())
+                .unwrap_or_default(),
+            tag: result
+                .snapshot
+                .as_ref()
+                .map_or_else(String::new, |s| s.0.clone()),
+        });
+        return Ok(StepResult {
+            exit_code: 0,
+            committed_snapshot: result.snapshot.map(|s| SnapshotRef(s.0)),
+            artifacts: vec![],
+            workspace_dir: result.workspace_dir.map(|p| p.display().to_string()),
+            ephemeral_snapshot: false,
+        });
+    }
+
     let source = if let Some(ref snap) = input.parent_snapshot {
         ImageSource::Snapshot(SnapshotId(snap.0.clone()))
     } else {
@@ -157,6 +185,7 @@ async fn run_step_vm(vm: &HmVm, ctx: &StepContext, input: ExecutorInput) -> Resu
         committed_snapshot: result.snapshot.map(|s| SnapshotRef(s.0)),
         artifacts: vec![],
         workspace_dir,
+        ephemeral_snapshot: result.ephemeral_snapshot,
     })
 }
 
diff --git a/crates/hm-exec/src/local/scheduler.rs b/crates/hm-exec/src/local/scheduler.rs
index 75f00eb8..8b9217f7 100644
--- a/crates/hm-exec/src/local/scheduler.rs
+++ b/crates/hm-exec/src/local/scheduler.rs
@@ -63,6 +63,9 @@ struct StepOutcome {
     /// The scheduler propagates this to downstream `BuildsIn` children
     /// so they can COW-copy instead of re-extracting.
     workspace_dir: Option<String>,
+    /// True when the snapshot is ephemeral and must be cleaned up after
+    /// all downstream steps finish.
+    ephemeral_snapshot: bool,
 }
 
 type StepFuture = futures::future::Shared<BoxFuture<'static, StepOutcome>>;
@@ -81,6 +84,7 @@ type StepFuture = futures::future::Shared<BoxFuture<'static, StepOutcome>>;
 /// Returns an error if the source archive cannot be built or any
 /// scheduler-level failure occurs. Non-zero step exit codes are
 /// surfaced via the returned [`BuildOutcome`], not as an `Err`.
+#[allow(clippy::too_many_arguments)]
 pub(crate) async fn run(
     graph: PipelineGraph,
     repo_root: PathBuf,
@@ -89,6 +93,7 @@ pub(crate) async fn run(
     runner_registry: Arc<RunnerRegistry>,
     tx: tokio::sync::mpsc::Sender<BuildEvent>,
     cancel: CancellationToken,
+    vm_backend: Option<Arc<dyn hm_vm::VmBackend>>,
 ) -> crate::Result<BuildOutcome> {
     // Set up per-run state.
     let bus = EventBus::new();
@@ -207,6 +212,7 @@ pub(crate) async fn run(
                         exit_code: None,
                         duration_ms: 0,
                     }),
+                    ephemeral_snapshot: false,
                     workspace_dir: None,
                 };
             }
@@ -260,6 +266,7 @@ pub(crate) async fn run(
                             duration_ms: 0,
                         }),
                         workspace_dir: None,
+                        ephemeral_snapshot: false,
                     }
                 }
             }
@@ -320,6 +327,29 @@ pub(crate) async fn run(
 
     let steps: Vec<StepResultSummary> = outcomes.iter().filter_map(|o| o.summary.clone()).collect();
 
+    // Clean up ephemeral Docker snapshots and kept temp workspace dirs.
+    // These are from uncached steps that used TempDir::keep() so children
+    // could COW-copy from them. Now that all steps have finished, the
+    // temp dirs and one-shot snapshots can be removed.
+    for outcome in &outcomes {
+        if outcome.ephemeral_snapshot
+            && let (Some(backend), Some(snap)) = (vm_backend.as_ref(), outcome.snapshot.as_ref())
+        {
+            let snap_id = hm_vm::SnapshotId(snap.0.clone());
+            if let Err(e) = backend.remove_snapshot(&snap_id).await {
+                tracing::warn!(snapshot = %snap.0, error = %e, "failed to remove ephemeral snapshot");
+            }
+        }
+        if let Some(ref ws) = outcome.workspace_dir
+            && outcome.ephemeral_snapshot
+        {
+            let ws = ws.clone();
+            tokio::task::spawn_blocking(move || std::fs::remove_dir_all(ws).ok())
+                .await
+                .ok();
+        }
+    }
+
     let dur = started_total.elapsed().as_millis() as u64;
 
     bus.emit(BuildEvent::BuildEnd {
@@ -488,6 +518,7 @@ async fn execute_step(
                             duration_ms: dur_ms,
                         }),
                         workspace_dir: None,
+                        ephemeral_snapshot: false,
                     });
                 }
             }
@@ -533,6 +564,7 @@ async fn execute_step(
                     duration_ms: dur_ms,
                 }),
                 workspace_dir: sr.workspace_dir,
+                ephemeral_snapshot: sr.ephemeral_snapshot,
             })
         }
         Err(e) => {
diff --git a/crates/hm-plugin-protocol/src/executor.rs b/crates/hm-plugin-protocol/src/executor.rs
index e2cca5e0..8a5d3292 100644
--- a/crates/hm-plugin-protocol/src/executor.rs
+++ b/crates/hm-plugin-protocol/src/executor.rs
@@ -104,4 +104,9 @@ pub struct StepResult {
     #[serde(skip)]
     #[schemars(skip)]
     pub workspace_dir: Option<String>,
+    /// True when the snapshot is ephemeral (not registered in the cache)
+    /// and must be cleaned up by the caller after downstream steps finish.
+    #[serde(skip)]
+    #[schemars(skip)]
+    pub ephemeral_snapshot: bool,
 }
diff --git a/crates/hm-vm/Cargo.toml b/crates/hm-vm/Cargo.toml
index 3bb5c787..9a7ddc23 100644
--- a/crates/hm-vm/Cargo.toml
+++ b/crates/hm-vm/Cargo.toml
@@ -13,6 +13,7 @@ tracing = { workspace = true }
 derive_more = { workspace = true }
 
 rusqlite = { version = "0.37", features = ["bundled"] }
+uuid = { workspace = true }
 
 bollard = { version = "0.18", optional = true }
 futures = { version = "0.3", optional = true }
diff --git a/crates/hm-vm/src/registry.rs b/crates/hm-vm/src/registry.rs
index 6286abf8..960fc922 100644
--- a/crates/hm-vm/src/registry.rs
+++ b/crates/hm-vm/src/registry.rs
@@ -127,39 +127,20 @@ impl ImageRegistry {
         result.map(|(snap, ws)| (SnapshotId(snap), ws))
     }
 
-    /// Insert or update a cache entry.
+    /// Insert or update a cache entry, optionally with a workspace directory.
     ///
     /// Returns the evicted entries (snapshot ID and optional workspace directory)
     /// to keep the registry within its configured capacity. The caller is
     /// responsible for cleaning up backend resources and workspace directories
     /// associated with evicted entries.
-    pub fn put(&self, key: &str, snapshot: &SnapshotId) -> Vec<(SnapshotId, Option<String>)> {
-        let now = epoch_secs();
-
-        let Ok(conn) = self.conn.lock() else {
-            return Vec::new();
-        };
-
-        // INSERT OR REPLACE handles both new and updated entries.
-        let _result = conn.execute(
-            "INSERT OR REPLACE INTO snapshots (key, snapshot_id, accessed_at)
-             VALUES (?1, ?2, ?3)",
-            rusqlite::params![key, snapshot.0, now],
-        );
-
-        drop(conn);
-        self.evict_overflow()
-    }
-
-    /// Insert or update a cache entry with an associated workspace directory.
     ///
-    /// Returns the evicted entries (snapshot ID and optional workspace directory)
-    /// to keep the registry within its configured capacity.
-    pub fn put_with_workspace(
+    /// Upsert + eviction run under a single lock acquisition to prevent
+    /// TOCTOU races between concurrent callers.
+    pub fn put(
         &self,
         key: &str,
         snapshot: &SnapshotId,
-        workspace_dir: &str,
+        workspace_dir: Option<&str>,
     ) -> Vec<(SnapshotId, Option<String>)> {
         let now = epoch_secs();
 
@@ -173,8 +154,7 @@ impl ImageRegistry {
             rusqlite::params![key, snapshot.0, now, workspace_dir],
         );
 
-        drop(conn);
-        self.evict_overflow()
+        Self::evict_overflow_locked(&conn, self.capacity)
     }
 
     /// Remove a specific entry.
@@ -223,19 +203,25 @@ impl ImageRegistry {
     }
 
     /// Evict the oldest entries (by `accessed_at`) when the registry exceeds
-    /// its capacity. Returns the snapshot IDs and optional workspace directories
-    /// of evicted entries.
-    fn evict_overflow(&self) -> Vec<(SnapshotId, Option<String>)> {
-        let count = self.len();
-        if count <= self.capacity {
+    /// its capacity. Caller must already hold the connection lock.
+    ///
+    /// SELECT + DELETE run in one lock scope to prevent TOCTOU races
+    /// between concurrent callers.
+    fn evict_overflow_locked(
+        conn: &Connection,
+        capacity: u64,
+    ) -> Vec<(SnapshotId, Option<String>)> {
+        let count: i64 = conn
+            .query_row("SELECT COUNT(*) FROM snapshots", [], |row| row.get(0))
+            .unwrap_or(0);
+        #[allow(clippy::cast_sign_loss)]
+        let count = count as u64;
+
+        if count <= capacity {
             return Vec::new();
         }
 
-        let overflow = count - self.capacity;
-
-        let Ok(conn) = self.conn.lock() else {
-            return Vec::new();
-        };
+        let overflow = count - capacity;
 
         let Ok(mut stmt) = conn.prepare(
             "SELECT snapshot_id, workspace_dir FROM snapshots ORDER BY accessed_at ASC LIMIT ?1",
@@ -254,10 +240,8 @@ impl ImageRegistry {
             .map(|rows| rows.filter_map(Result::ok).collect())
             .unwrap_or_default();
 
-        // Drop stmt before using conn again for the delete.
         drop(stmt);
 
-        // Delete those entries.
         let _deleted = conn.execute(
             "DELETE FROM snapshots WHERE key IN (
                  SELECT key FROM snapshots ORDER BY accessed_at ASC LIMIT ?1
@@ -291,7 +275,7 @@ mod tests {
     fn put_then_get_returns_snapshot() {
         let (reg, _dir) = open_temp(10);
         let snap = SnapshotId("snap-abc".into());
-        let evicted = reg.put("my-key", &snap);
+        let evicted = reg.put("my-key", &snap, None);
         assert!(evicted.is_empty());
 
         let got = reg.get("my-key");
@@ -303,12 +287,12 @@ mod tests {
         let (reg, _dir) = open_temp(2);
 
         // Insert a, then b. "a" is older by insertion order.
-        reg.put("a", &SnapshotId("snap-a".into()));
+        reg.put("a", &SnapshotId("snap-a".into()), None);
 
         // Tiny sleep so timestamps differ.
         std::thread::sleep(std::time::Duration::from_secs(1));
 
-        reg.put("b", &SnapshotId("snap-b".into()));
+        reg.put("b", &SnapshotId("snap-b".into()), None);
 
         // Touch "a" so it becomes the most recently accessed.
         std::thread::sleep(std::time::Duration::from_secs(1));
@@ -317,7 +301,7 @@ mod tests {
         // Now insert "c" -- capacity is 2, so one must be evicted.
         // "b" should be evicted since "a" was touched more recently.
         std::thread::sleep(std::time::Duration::from_secs(1));
-        let evicted = reg.put("c", &SnapshotId("snap-c".into()));
+        let evicted = reg.put("c", &SnapshotId("snap-c".into()), None);
 
         assert_eq!(evicted.len(), 1);
         assert_eq!(evicted[0].0, SnapshotId("snap-b".into()));
@@ -332,13 +316,13 @@ mod tests {
     fn eviction_returns_overflow_entries() {
         let (reg, _dir) = open_temp(2);
 
-        reg.put("x", &SnapshotId("snap-x".into()));
+        reg.put("x", &SnapshotId("snap-x".into()), None);
         std::thread::sleep(std::time::Duration::from_secs(1));
-        reg.put("y", &SnapshotId("snap-y".into()));
+        reg.put("y", &SnapshotId("snap-y".into()), None);
         std::thread::sleep(std::time::Duration::from_secs(1));
 
         // This third insert should evict the oldest ("x").
-        let evicted = reg.put("z", &SnapshotId("snap-z".into()));
+        let evicted = reg.put("z", &SnapshotId("snap-z".into()), None);
 
         assert_eq!(evicted.len(), 1);
         assert_eq!(evicted[0].0, SnapshotId("snap-x".into()));
@@ -352,7 +336,7 @@ mod tests {
 
         {
             let reg = ImageRegistry::open(&db_path, 10).expect("open");
-            reg.put("persistent", &SnapshotId("snap-persist".into()));
+            reg.put("persistent", &SnapshotId("snap-persist".into()), None);
             assert_eq!(reg.len(), 1);
             // reg is dropped here, closing the connection.
         }
@@ -367,7 +351,7 @@ mod tests {
     fn invalidate_returns_removed_snapshot() {
         let (reg, _dir) = open_temp(10);
         let snap = SnapshotId("snap-rm".into());
-        reg.put("to-remove", &snap);
+        reg.put("to-remove", &snap, None);
 
         let removed = reg.invalidate("to-remove");
         assert_eq!(removed, Some((SnapshotId("snap-rm".into()), None)));
@@ -383,7 +367,7 @@ mod tests {
     fn put_with_workspace_then_get_returns_both() {
         let (reg, _dir) = open_temp(10);
         let snap = SnapshotId("snap-ws".into());
-        reg.put_with_workspace("ws-key", &snap, "/cache/workspaces/ws-key");
+        reg.put("ws-key", &snap, Some("/cache/workspaces/ws-key"));
 
         let (got_snap, got_ws) = reg.get_with_workspace("ws-key").unwrap();
         assert_eq!(got_snap, SnapshotId("snap-ws".into()));
@@ -393,7 +377,7 @@ mod tests {
     #[test]
     fn put_without_workspace_returns_none_workspace() {
         let (reg, _dir) = open_temp(10);
-        reg.put("plain-key", &SnapshotId("snap-plain".into()));
+        reg.put("plain-key", &SnapshotId("snap-plain".into()), None);
 
         let (_, got_ws) = reg.get_with_workspace("plain-key").unwrap();
         assert!(got_ws.is_none());
@@ -402,10 +386,10 @@ mod tests {
     #[test]
     fn eviction_includes_workspace_path() {
         let (reg, _dir) = open_temp(1);
-        reg.put_with_workspace("a", &SnapshotId("snap-a".into()), "/ws/a");
+        reg.put("a", &SnapshotId("snap-a".into()), Some("/ws/a"));
         std::thread::sleep(std::time::Duration::from_secs(1));
 
-        let evicted = reg.put_with_workspace("b", &SnapshotId("snap-b".into()), "/ws/b");
+        let evicted = reg.put("b", &SnapshotId("snap-b".into()), Some("/ws/b"));
         assert_eq!(evicted.len(), 1);
         assert_eq!(evicted[0].0, SnapshotId("snap-a".into()));
         assert_eq!(evicted[0].1.as_deref(), Some("/ws/a"));
diff --git a/crates/hm-vm/src/types.rs b/crates/hm-vm/src/types.rs
index ecb1d5b1..f94c2b16 100644
--- a/crates/hm-vm/src/types.rs
+++ b/crates/hm-vm/src/types.rs
@@ -51,6 +51,9 @@ pub struct ExecutionResult {
     pub snapshot: Option<SnapshotId>,
     pub cached: bool,
     pub workspace_dir: Option<PathBuf>,
+    /// True when the snapshot is ephemeral (not registered in the cache)
+    /// and must be cleaned up by the caller after downstream steps finish.
+    pub ephemeral_snapshot: bool,
 }
 
 /// VM resource configuration.
diff --git a/crates/hm-vm/src/vm.rs b/crates/hm-vm/src/vm.rs
index 9e7690a9..0a1dce1e 100644
--- a/crates/hm-vm/src/vm.rs
+++ b/crates/hm-vm/src/vm.rs
@@ -5,6 +5,7 @@ use std::sync::Arc;
 
 use anyhow::Result;
 use tracing::{instrument, warn};
+use uuid::Uuid;
 
 use crate::backend::VmBackend;
 use crate::registry::ImageRegistry;
@@ -32,6 +33,41 @@ impl HmVm {
         }
     }
 
+    /// Check whether a cached result exists for `key` without executing anything.
+    ///
+    /// Returns `Some(result)` on a valid hit (snapshot exists in backend and
+    /// workspace dir, if recorded, still exists on disk). Invalidates stale
+    /// entries as a side-effect. Returns `None` on miss.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the backend's `snapshot_exists` check fails.
+    pub async fn peek_cache(&self, key: &str) -> Result<Option<ExecutionResult>> {
+        if let Some((snap, ws)) = self.registry.get_with_workspace(key) {
+            let ws_valid = ws.as_ref().is_none_or(|p| std::path::Path::new(p).is_dir());
+            if ws_valid && self.backend.snapshot_exists(&snap).await? {
+                return Ok(Some(ExecutionResult {
+                    exit_code: 0,
+                    snapshot: Some(snap),
+                    cached: true,
+                    workspace_dir: ws.map(PathBuf::from),
+                    ephemeral_snapshot: false,
+                }));
+            }
+            if let Some((old_snap, old_ws)) = self.registry.invalidate(key) {
+                if let Err(e) = self.backend.remove_snapshot(&old_snap).await {
+                    warn!(snapshot = %old_snap.0, error = %e, "failed to remove stale snapshot");
+                }
+                if let Some(ws_path) = old_ws {
+                    tokio::task::spawn_blocking(move || std::fs::remove_dir_all(ws_path).ok())
+                        .await
+                        .ok();
+                }
+            }
+        }
+        Ok(None)
+    }
+
     /// Execute an [`Action`] inside a VM, obeying the given [`CachingPolicy`].
     ///
     /// # Cache behaviour
@@ -56,12 +92,14 @@ impl HmVm {
         if let CachingPolicy::Cache { ref key } = policy
             && let Some((snap, ws)) = self.registry.get_with_workspace(key)
         {
-            if self.backend.snapshot_exists(&snap).await? {
+            let ws_valid = ws.as_ref().is_none_or(|p| std::path::Path::new(p).is_dir());
+            if ws_valid && self.backend.snapshot_exists(&snap).await? {
                 return Ok(ExecutionResult {
                     exit_code: 0,
                     snapshot: Some(snap),
                     cached: true,
                     workspace_dir: ws.map(PathBuf::from),
+                    ephemeral_snapshot: false,
                 });
             }
             if let Some((old_snap, old_ws)) = self.registry.invalidate(key) {
@@ -69,7 +107,9 @@ impl HmVm {
                     warn!(snapshot = %old_snap.0, error = %e, "failed to remove stale snapshot");
                 }
                 if let Some(ws_path) = old_ws {
-                    std::fs::remove_dir_all(ws_path).ok();
+                    tokio::task::spawn_blocking(move || std::fs::remove_dir_all(ws_path).ok())
+                        .await
+                        .ok();
                 }
             }
         }
@@ -119,12 +159,12 @@ impl HmVm {
         };
 
         // 4. Snapshot and cache on success
-        let snapshot = if exit_code == 0 {
-            let label = match &policy {
-                CachingPolicy::Cache { key } => key.as_str(),
-                CachingPolicy::None => "ephemeral",
+        let (snapshot, ephemeral) = if exit_code == 0 {
+            let (label, is_ephemeral) = match &policy {
+                CachingPolicy::Cache { key } => (key.clone(), false),
+                CachingPolicy::None => (format!("ephemeral-{}", Uuid::new_v4()), true),
             };
-            let snap = vm.snapshot(label).await?;
+            let snap = vm.snapshot(&label).await?;
 
             if let CachingPolicy::Cache { key } = &policy {
                 // Persist workspace to cache directory.
@@ -133,34 +173,39 @@ impl HmVm {
                     self.config.workspace_cache_dir.as_ref(),
                 ) {
                     let ws_cache = cache_dir.join(key.as_str());
-                    if ws_cache.exists() {
-                        std::fs::remove_dir_all(&ws_cache).ok();
-                    }
-                    std::fs::create_dir_all(&ws_cache)?;
-                    crate::workspace::cow_copy(&ws.host_path, &ws_cache)?;
+                    let host_path = ws.host_path.clone();
+                    let ws_cache_clone = ws_cache.clone();
+                    tokio::task::spawn_blocking(move || -> anyhow::Result<()> {
+                        if ws_cache_clone.exists() {
+                            std::fs::remove_dir_all(&ws_cache_clone).ok();
+                        }
+                        std::fs::create_dir_all(&ws_cache_clone)?;
+                        crate::workspace::cow_copy(&host_path, &ws_cache_clone)?;
+                        Ok(())
+                    })
+                    .await??;
                     workspace_dir = Some(ws_cache);
                 }
 
-                let evicted = workspace_dir.as_ref().map_or_else(
-                    || self.registry.put(key, &snap),
-                    |ws| {
-                        self.registry
-                            .put_with_workspace(key, &snap, &ws.display().to_string())
-                    },
-                );
-                for (old_snap, old_ws) in &evicted {
-                    if let Err(e) = self.backend.remove_snapshot(old_snap).await {
+                let ws_str = workspace_dir.as_ref().map(|ws| ws.display().to_string());
+                let evicted = self.registry.put(key, &snap, ws_str.as_deref());
+                for (old_snap, old_ws) in evicted {
+                    if let Err(e) = self.backend.remove_snapshot(&old_snap).await {
                         warn!(snapshot = %old_snap.0, error = %e, "failed to remove evicted snapshot");
                     }
                     if let Some(ws_path) = old_ws {
-                        std::fs::remove_dir_all(ws_path).ok();
+                        tokio::task::spawn_blocking(move || {
+                            std::fs::remove_dir_all(ws_path).ok();
+                        })
+                        .await
+                        .ok();
                     }
                 }
             }
 
-            Some(snap)
+            (Some(snap), is_ephemeral)
         } else {
-            None
+            (None, false)
         };
 
         Ok(ExecutionResult {
@@ -168,6 +213,7 @@ impl HmVm {
             snapshot,
             cached: false,
             workspace_dir: if exit_code == 0 { workspace_dir } else { None },
+            ephemeral_snapshot: ephemeral,
         })
     }
 }
@@ -357,7 +403,7 @@ mod tests {
         let (registry, _dir) = open_temp_registry(10);
 
         // Pre-populate the registry.
-        registry.put("step-1", &SnapshotId("cached-snap".into()));
+        registry.put("step-1", &SnapshotId("cached-snap".into()), None);
 
         let hm = HmVm::new(Arc::new(backend.clone()), registry, VmConfig::default());
 
diff --git a/crates/hm-vm/src/workspace.rs b/crates/hm-vm/src/workspace.rs
index 44923d5f..6ad11d73 100644
--- a/crates/hm-vm/src/workspace.rs
+++ b/crates/hm-vm/src/workspace.rs
@@ -17,24 +17,55 @@ pub fn cow_copy(src: &Path, dst: &Path) -> Result<()> {
     let src_dot = format!("{}/.", src.display());
 
     let status = if cfg!(target_os = "macos") {
-        let st = Command::new("cp").args(["-cR", &src_dot]).arg(dst).status();
+        let st = Command::new("cp")
+            .args(["-cpR", &src_dot])
+            .arg(dst)
+            .status();
         match st {
             Ok(s) if s.success() => s,
-            _ => Command::new("cp")
-                .args(["-R", &src_dot])
-                .arg(dst)
-                .status()
-                .context("spawning cp")?,
+            _ => {
+                // APFS clonefile failed (cross-volume or non-APFS). Wipe any
+                // partial state left by the failed attempt before falling back
+                // to a regular copy.
+                if dst.exists() {
+                    for entry in std::fs::read_dir(dst)
+                        .with_context(|| format!("cleaning partial COW dst {}", dst.display()))?
+                    {
+                        let p = entry?.path();
+                        if p.is_dir() {
+                            std::fs::remove_dir_all(&p).ok();
+                        } else {
+                            std::fs::remove_file(&p).ok();
+                        }
+                    }
+                }
+                Command::new("cp")
+                    .args(["-pR", &src_dot])
+                    .arg(dst)
+                    .status()
+                    .with_context(|| {
+                        format!(
+                            "spawning cp fallback: {} -> {}",
+                            src.display(),
+                            dst.display()
+                        )
+                    })?
+            }
         }
     } else {
         Command::new("cp")
             .args(["--reflink=auto", "-a", &src_dot])
             .arg(dst)
             .status()
-            .context("spawning cp")?
+            .with_context(|| format!("spawning cp: {} -> {}", src.display(), dst.display()))?
     };
 
-    ensure!(status.success(), "cp exited with {status}");
+    ensure!(
+        status.success(),
+        "cp {} -> {} exited with {status}",
+        src.display(),
+        dst.display()
+    );
     Ok(())
 }
 

From a7663f5eae44c2109637e76bb6a9d01d5ebf7c14 Mon Sep 17 00:00:00 2001
From: Marko Vejnovic <marko@harmont.dev>
Date: Tue, 9 Jun 2026 20:35:47 -0700
Subject: [PATCH 15/15] =?UTF-8?q?refactor:=20run-local=20workspace=20linea?=
 =?UTF-8?q?ge=20=E2=80=94=20registry=20caches=20system=20snapshots=20only?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rearchitect workspace flow to kill the stale-source bug class (CLI-28):
cached snapshots now carry ONLY system state (docker commit); workspace
state is strictly run-scoped and rebases to the current run's fresh
source at every cache-hit boundary. A cache-hit step contributes its
snapshot for container lineage but no workspace dir; its children COW
from the run's fresh source extract instead of a stale persisted copy.

Core changes:
- registry: workspace_dir always written NULL (legacy rows reaped);
  put() runs upsert+eviction in one transaction with DELETE..RETURNING;
  invalidate_if() CAS closes the stale-entry/fresh-insert race;
  contains_snapshot() backs all guarded image removals
- vm: HmVm::execute takes a CancellationToken — cooperative cancel with
  awaited teardown (destroy/chown reclaim strictly happens-before the
  runner drops the workspace tempdir); evictions deferred to end-of-run
  (cleanup_deferred_evictions) so in-flight readers never lose their
  tag; put-failure demotes the snapshot to ephemeral instead of leaking
  it; gc_orphaned_snapshots sweeps aged rowless tags at startup
- scheduler: per-step workspace refcounting frees a parent's kept dir
  the moment its last BuildsIn consumer finishes (temp footprint now
  tracks the live DAG frontier); per-step timeout fires a child token
  and AWAITS teardown instead of dropping the future; ephemeral
  snapshot cleanup is guarded by contains_snapshot so a concurrently
  re-registered tag survives
- runner: once-per-run shared source extract (Arc<OnceCell<TempDir>>);
  cow_copy call sites moved into spawn_blocking; cache-hit path does
  zero workspace prep
- workspace: single-attempt cp with captured stderr (cp -c self-falls
  back to copyfile(2); the manual retry masked real errors)
- docker: per-tag GC removal (untag-safe for multi-tag images); exec
  quiesce on cancelled commands before chown reclaim
- DSL: forever() docs state the hit-boundary semantics (workspace
  writes are not replayed across runs); predicate keygen
  cross-language tests

Semantic change: on a cache hit, files a step wrote into /workspace are
no longer visible to downstream steps across runs — downstream sees the
current run's source tree. Write build outputs to system paths or use
on_change() when workspace outputs must survive hits.

Acceptance: deep_cache_chain_workspace passes from warm and cold cache,
asserting [a]/[b] cache hits AND fresh marker content (the CLI-28
contract). local_fork_cache + local_parallelism pass. Test fixtures
modernized from the obsolete def build()/.run() DSL form.
---
 Cargo.lock                                    |   1 +
 .../harmont-py/harmont/__init__.py            |   8 +-
 .../hm-dsl-engine/harmont-py/harmont/cache.py |   6 +
 .../harmont-py/tests/test_cache_predicate.py  |   8 +-
 crates/hm-dsl-engine/harmont-ts/src/cache.ts  |   9 +
 crates/hm-exec/src/local/backend.rs           |  72 +-
 crates/hm-exec/src/local/runner/mod.rs        |  10 +-
 crates/hm-exec/src/local/runner/vm.rs         |  80 +-
 crates/hm-exec/src/local/scheduler.rs         | 608 +++++++++++---
 crates/hm-vm/Cargo.toml                       |   1 +
 crates/hm-vm/src/backend.rs                   |  26 +
 crates/hm-vm/src/docker.rs                    | 393 ++++++++-
 crates/hm-vm/src/registry.rs                  | 280 +++++--
 crates/hm-vm/src/types.rs                     |   7 +-
 crates/hm-vm/src/vm.rs                        | 745 +++++++++++++++---
 crates/hm-vm/src/workspace.rs                 |  72 +-
 crates/hm/tests/deep_cache_chain_workspace.rs |  23 +-
 crates/hm/tests/local_fork_cache.rs           |  11 +-
 crates/hm/tests/local_parallelism.rs          |  10 +-
 19 files changed, 1930 insertions(+), 440 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 75de2c3b..c5d5668d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1363,6 +1363,7 @@ dependencies = [
  "rusqlite",
  "tempfile",
  "tokio",
+ "tokio-util",
  "tracing",
  "uuid",
 ]
diff --git a/crates/hm-dsl-engine/harmont-py/harmont/__init__.py b/crates/hm-dsl-engine/harmont-py/harmont/__init__.py
index 0f7e0ac7..62a5ceee 100644
--- a/crates/hm-dsl-engine/harmont-py/harmont/__init__.py
+++ b/crates/hm-dsl-engine/harmont-py/harmont/__init__.py
@@ -142,6 +142,12 @@ def forever(env_keys: tuple[str, ...] = ()) -> CacheForever:
     itself encodes the version (e.g. downloading a pinned binary). Do not
     use for steps that fetch mutable remote resources.
 
+    On a cache hit only SYSTEM state (the snapshot) is restored — files the
+    step wrote into the workspace are not replayed across runs; downstream
+    steps see the current run's source tree instead. Write build outputs to
+    system paths (e.g. ``/usr/local``) or use ``on_change`` if the step's
+    workspace outputs must survive cache hits.
+
     Args:
         env_keys: Environment variable names whose values are folded into
             the cache key. Use this when the command's behavior varies by
@@ -180,7 +186,7 @@ def compose(*policies: CachePolicy) -> CacheCompose:
     return CacheCompose(policies=tuple(policies))
 
 
-def predicate(fn: "Callable[[], str]") -> CachePredicate:
+def predicate(fn: Callable[[], str]) -> CachePredicate:
     """Cache policy that invalidates when ``fn()`` returns a new value.
 
     ``fn`` is called once at plan time. Its string return value is
diff --git a/crates/hm-dsl-engine/harmont-py/harmont/cache.py b/crates/hm-dsl-engine/harmont-py/harmont/cache.py
index d597ea79..878acfc0 100644
--- a/crates/hm-dsl-engine/harmont-py/harmont/cache.py
+++ b/crates/hm-dsl-engine/harmont-py/harmont/cache.py
@@ -33,6 +33,12 @@ class CacheForever(CachePolicy):
     Use for pure computations whose only inputs are visible to the planner.
     DO NOT use for installs that fetch the public internet — package repos
     drift; manual cache busts will be needed.
+
+    On a cache hit only SYSTEM state (the snapshot) is restored — files the
+    step wrote into the workspace are not replayed across runs; downstream
+    steps see the current run's source tree instead. Write build outputs to
+    system paths (e.g. ``/usr/local``) or use ``on_change`` if the step's
+    workspace outputs must survive cache hits.
     """
 
     env_keys: tuple[str, ...] = ()
diff --git a/crates/hm-dsl-engine/harmont-py/tests/test_cache_predicate.py b/crates/hm-dsl-engine/harmont-py/tests/test_cache_predicate.py
index 1a829552..9bfc4e49 100644
--- a/crates/hm-dsl-engine/harmont-py/tests/test_cache_predicate.py
+++ b/crates/hm-dsl-engine/harmont-py/tests/test_cache_predicate.py
@@ -115,15 +115,15 @@ def make_graph(value):
         pipeline_org="o",
         pipeline_slug="s",
         now=0,
-        base_path=Path("/tmp"),
-        env={},  # noqa: S108
+        base_path=Path("/tmp"),  # noqa: S108
+        env={},
     )
     resolve_pipeline_keys(
         g2,
         pipeline_org="o",
         pipeline_slug="s",
         now=0,
-        base_path=Path("/tmp"),
-        env={},  # noqa: S108
+        base_path=Path("/tmp"),  # noqa: S108
+        env={},
     )
     assert g1["nodes"][0]["step"]["cache"]["key"] != g2["nodes"][0]["step"]["cache"]["key"]
diff --git a/crates/hm-dsl-engine/harmont-ts/src/cache.ts b/crates/hm-dsl-engine/harmont-ts/src/cache.ts
index 47f1ab52..40da0ce1 100644
--- a/crates/hm-dsl-engine/harmont-ts/src/cache.ts
+++ b/crates/hm-dsl-engine/harmont-ts/src/cache.ts
@@ -26,6 +26,15 @@ export interface CachePredicate {
 
 export type CachePolicy = CacheForever | CacheTTL | CacheOnChange | CacheCompose | CachePredicate;
 
+/**
+ * Create a permanent cache policy, keyed on (command, parent, envKeys).
+ *
+ * On a cache hit only SYSTEM state (the snapshot) is restored — files the
+ * step wrote into the workspace are not replayed across runs; downstream
+ * steps see the current run's source tree instead. Write build outputs to
+ * system paths (e.g. `/usr/local`) or use `onChange` if the step's
+ * workspace outputs must survive cache hits.
+ */
 export function forever(opts?: { envKeys?: string[] }): CacheForever {
   return { kind: "forever", envKeys: opts?.envKeys ?? [] };
 }
diff --git a/crates/hm-exec/src/local/backend.rs b/crates/hm-exec/src/local/backend.rs
index 67a10bfe..ecb547e4 100644
--- a/crates/hm-exec/src/local/backend.rs
+++ b/crates/hm-exec/src/local/backend.rs
@@ -45,8 +45,10 @@ impl LocalBackend {
 
     /// Build the runner registry, constructing the [`HmVm`] orchestrator
     /// (VM backend + snapshot registry) and registering the [`VmRunner`] as
-    /// the default runner.
-    fn build_registry(&self) -> Result<RunnerRegistry> {
+    /// the default runner. The orchestrator handle is also returned so the
+    /// run loop can drain its deferred eviction queue once the DAG has
+    /// finished.
+    fn build_registry(&self) -> Result<(RunnerRegistry, Arc<HmVm>)> {
         let cache_dir = hm_util::dirs::harmont_cache_dir().ok_or_else(|| {
             BackendError::Local("cannot resolve the Harmont cache directory".into())
         })?;
@@ -56,15 +58,14 @@ impl LocalBackend {
         let config = VmConfig {
             memory_mib: Some(8192),
             disk_size_gb: Some(10),
-            workspace_cache_dir: Some(cache_dir.join("workspaces")),
             ..Default::default()
         };
 
-        let hmvm = HmVm::new(Arc::clone(&self.vm_backend), registry, config);
+        let hmvm = Arc::new(HmVm::new(Arc::clone(&self.vm_backend), registry, config));
 
         let mut runners = RunnerRegistry::new();
-        runners.register(Arc::new(VmRunner::new(Arc::new(hmvm))), true);
-        Ok(runners)
+        runners.register(Arc::new(VmRunner::new(Arc::clone(&hmvm))), true);
+        Ok((runners, hmvm))
     }
 }
 
@@ -79,14 +80,57 @@ impl ExecutionBackend for LocalBackend {
     }
 
     async fn start(&self, req: RunRequest) -> Result<BackendHandle> {
-        let registry = Arc::new(self.build_registry()?);
+        let (registry, hmvm) = self.build_registry()?;
+
+        // Best-effort, detached GC of aged snapshot images in the
+        // *run-scoped* namespaces only:
+        // - `harmont-ephemeral`: uncached snapshots whose run died before
+        //   its end-of-run cleanup.
+        // - `ephemeral`: the EXACT legacy repo pre-fix versions used for
+        //   uncached snapshots (`ephemeral:latest`, `ephemeral:<64hex>`).
+        //   Deliberately not a glob — a wildcard like `ephemeral-*` would
+        //   force-remove unrelated user images.
+        // These tags are created and removed within a single run, so any
+        // image older than the 24h floor is residue of a dead run; the
+        // floor also keeps a starting run from deleting a concurrent run's
+        // freshly committed, still-in-use ephemeral images.
+        //
+        // `harmont-cache/*` is deliberately NOT swept here. The registry DB
+        // is per-user (`hm_util::dirs::harmont_cache_dir`) while the Docker
+        // daemon can be shared across users and CI jobs, so absence from
+        // THIS registry does not prove a cache image is orphaned — sweeping
+        // would delete other registries' live caches and silently force
+        // their cached steps to re-run. It would also race deferred LRU
+        // eviction: an evicted row vanishes from SQLite the moment `put`
+        // commits, while in-flight steps (ours or a concurrent process's)
+        // may still restore from the evicted tag until their DAG drains —
+        // a window no registry-based liveness check can observe. Crash
+        // residue under `harmont-cache/*` is bounded by the registry
+        // capacity and reclaimable via `hm cache clean` plus `docker rmi`.
+        // Failures are logged and never block or fail the run.
+        let gc_vm = Arc::clone(&hmvm);
+        tokio::spawn(async move {
+            #[allow(
+                clippy::duration_suboptimal_units,
+                reason = "from_hours is nightly-only"
+            )]
+            const GC_AGE: std::time::Duration = std::time::Duration::from_secs(24 * 3600);
+            for reference in ["harmont-ephemeral", "ephemeral"] {
+                match gc_vm.gc_orphaned_snapshots(reference, GC_AGE).await {
+                    Ok(0) => {}
+                    Ok(n) => tracing::debug!(reference, removed = n, "snapshot GC removed images"),
+                    Err(e) => tracing::warn!(reference, error = %e, "snapshot GC failed"),
+                }
+            }
+        });
+        let registry = Arc::new(registry);
         let (tx, rx) = mpsc::channel(1024);
         let cancel = CancellationToken::new();
         let parallelism = self.parallelism;
         let token = cancel.clone();
-        let backend = Arc::clone(&self.vm_backend);
+        let run_vm = Arc::clone(&hmvm);
         let join = tokio::spawn(async move {
-            crate::local::run(
+            let result = crate::local::run(
                 req.plan.graph,
                 req.repo_root,
                 req.pipeline_slug,
@@ -94,9 +138,15 @@ impl ExecutionBackend for LocalBackend {
                 registry,
                 tx,
                 token,
-                Some(backend),
+                Some(run_vm),
             )
-            .await
+            .await;
+            // Snapshot images evicted from the registry during the run are
+            // only removed now, strictly after every step has finished: an
+            // in-flight step may restore from an evicted tag long after the
+            // eviction (cache-hit outcomes propagate only the tag).
+            hmvm.cleanup_deferred_evictions().await;
+            result
         });
         Ok(BackendHandle::spawn(rx, cancel, join))
     }
diff --git a/crates/hm-exec/src/local/runner/mod.rs b/crates/hm-exec/src/local/runner/mod.rs
index 052f6909..dfbd9477 100644
--- a/crates/hm-exec/src/local/runner/mod.rs
+++ b/crates/hm-exec/src/local/runner/mod.rs
@@ -30,9 +30,15 @@ pub struct StepContext {
     pub archives: Arc<ArchiveStore>,
     pub cancel: CancellationToken,
     /// When set, the step should COW-copy this directory as its
-    /// workspace base instead of extracting from the source archive.
-    /// Populated by the scheduler for child steps in a `BuildsIn` chain.
+    /// workspace base instead of copying from the shared source base.
+    /// Populated by the scheduler for child steps in a `BuildsIn` chain;
+    /// always a live, run-owned temp directory of a step that executed
+    /// this run (cache hits propagate `None`).
     pub parent_workspace_dir: Option<String>,
+    /// Lazily-extracted, once-per-run copy of the current source archive.
+    /// Steps without a `parent_workspace_dir` COW their workspace from
+    /// this directory, guaranteeing they see the current run's sources.
+    pub source_base: Arc<tokio::sync::OnceCell<tempfile::TempDir>>,
 }
 
 /// Async trait implemented by step executors (e.g. the VM runner).
diff --git a/crates/hm-exec/src/local/runner/vm.rs b/crates/hm-exec/src/local/runner/vm.rs
index e0336904..ac0fbbce 100644
--- a/crates/hm-exec/src/local/runner/vm.rs
+++ b/crates/hm-exec/src/local/runner/vm.rs
@@ -79,11 +79,14 @@ async fn run_step_vm(vm: &HmVm, ctx: &StepContext, input: ExecutorInput) -> Resu
                 .as_ref()
                 .map_or_else(String::new, |s| s.0.clone()),
         });
+        // Cache hits carry no workspace: workspace state is strictly
+        // run-scoped, so children rebase onto the current source instead
+        // of inheriting a stale tree from the original run.
         return Ok(StepResult {
             exit_code: 0,
             committed_snapshot: result.snapshot.map(|s| SnapshotRef(s.0)),
             artifacts: vec![],
-            workspace_dir: result.workspace_dir.map(|p| p.display().to_string()),
+            workspace_dir: None,
             ephemeral_snapshot: false,
         });
     }
@@ -100,23 +103,38 @@ async fn run_step_vm(vm: &HmVm, ctx: &StepContext, input: ExecutorInput) -> Resu
         )
     };
 
-    // Prepare the workspace: COW-copy from parent step's workspace (if this
-    // is a child in a BuildsIn chain) or extract from the source archive
-    // (root step). Either way, bind-mount the result into the VM.
+    // Prepare the workspace: COW-copy from the parent step's live workspace
+    // (child of a step that executed this run) or from the shared
+    // once-per-run source base (root step, or child of a cache-hit parent).
+    // Either way, bind-mount the result into the VM.
     let step_ws = tempfile::tempdir().context("creating step workspace")?;
 
-    if let Some(ref parent_ws) = ctx.parent_workspace_dir {
-        hm_vm::workspace::cow_copy(std::path::Path::new(parent_ws), step_ws.path())
-            .context("COW copy parent workspace")?;
+    let cow_src: std::path::PathBuf = if let Some(ref parent_ws) = ctx.parent_workspace_dir {
+        std::path::PathBuf::from(parent_ws)
     } else {
-        let archive_bytes = ctx
-            .archives
-            .get_bytes(input.workspace_archive_id)
-            .ok_or_else(|| anyhow::anyhow!("source archive not found"))?;
-        let base_dir =
-            extract_archive_to_tempdir(&archive_bytes).context("extracting workspace archive")?;
-        hm_vm::workspace::cow_copy(base_dir.path(), step_ws.path())
-            .context("COW copy source to workspace")?;
+        let base = ctx
+            .source_base
+            .get_or_try_init(|| async {
+                let archive_bytes = ctx
+                    .archives
+                    .get_bytes(input.workspace_archive_id)
+                    .ok_or_else(|| anyhow::anyhow!("source archive not found"))?;
+                tokio::task::spawn_blocking(move || extract_archive_to_tempdir(&archive_bytes))
+                    .await
+                    .context("archive extraction task panicked")?
+                    .context("extracting workspace archive")
+            })
+            .await?;
+        base.path().to_path_buf()
+    };
+
+    {
+        let dst = step_ws.path().to_path_buf();
+        let src = cow_src.clone();
+        tokio::task::spawn_blocking(move || hm_vm::workspace::cow_copy(&src, &dst))
+            .await
+            .context("workspace COW task panicked")?
+            .with_context(|| format!("COW copy {} into step workspace", cow_src.display()))?;
     }
 
     let workspace = Some(WorkspaceMount {
@@ -148,13 +166,17 @@ async fn run_step_vm(vm: &HmVm, ctx: &StepContext, input: ExecutorInput) -> Resu
         bus: Arc::clone(&ctx.event_bus),
     };
 
-    let result = tokio::select! {
-        r = vm.execute(action, policy, &sink) => r,
-        () = ctx.cancel.cancelled() => {
-            anyhow::bail!("step cancelled (build timeout or sibling failure)")
-        }
-    }
-    .context("vm execute failed")?;
+    // Cancellation is cooperative INSIDE `HmVm::execute` (the token is
+    // threaded down): on Ctrl-C / sibling failure / step timeout it bails
+    // with exit 130 only after destroying the container, which reclaims
+    // bind-mount ownership of root-written files. Never `select!`-drop
+    // this future: doing so would tear down `step_ws` concurrently with a
+    // still-running container and leak a (root-owned, on native Linux)
+    // workspace directory.
+    let result = vm
+        .execute(action, policy, &sink, &ctx.cancel)
+        .await
+        .context("vm execute failed")?;
 
     if result.cached {
         ctx.event_bus.emit(BuildEvent::StepCacheHit {
@@ -172,13 +194,13 @@ async fn run_step_vm(vm: &HmVm, ctx: &StepContext, input: ExecutorInput) -> Resu
         });
     }
 
-    // For cached steps HmVm persists the workspace to the cache dir and
-    // returns it. For uncached steps workspace_dir is None — keep the
-    // step tempdir alive so children can COW-copy from it.
-    let workspace_dir = result
-        .workspace_dir
-        .map(|p| p.display().to_string())
-        .or_else(|| (result.exit_code == 0).then(|| step_ws.keep().display().to_string()));
+    // Steps that executed successfully (cached-miss and uncached alike)
+    // keep their live tempdir alive so same-run children can COW-copy
+    // from it; the scheduler removes every kept dir after the DAG drains.
+    // Cache hits (rare race: a concurrent fill between peek and execute)
+    // and failures propagate no workspace -- the TempDir self-cleans.
+    let workspace_dir =
+        (result.exit_code == 0 && !result.cached).then(|| step_ws.keep().display().to_string());
 
     Ok(StepResult {
         exit_code: result.exit_code,
diff --git a/crates/hm-exec/src/local/scheduler.rs b/crates/hm-exec/src/local/scheduler.rs
index 8b9217f7..40a0c5b2 100644
--- a/crates/hm-exec/src/local/scheduler.rs
+++ b/crates/hm-exec/src/local/scheduler.rs
@@ -25,6 +25,7 @@
 use std::collections::HashMap;
 use std::path::PathBuf;
 use std::sync::Arc;
+use std::sync::atomic::{AtomicUsize, Ordering};
 use std::time::{Duration, Instant};
 
 use daggy::petgraph::algo::toposort;
@@ -61,7 +62,11 @@ struct StepOutcome {
     summary: Option<StepResultSummary>,
     /// Host-side workspace path produced by this step's runner, if any.
     /// The scheduler propagates this to downstream `BuildsIn` children
-    /// so they can COW-copy instead of re-extracting.
+    /// so they can COW-copy instead of re-extracting. By construction it
+    /// is always a run-owned kept tempdir of a step that executed this
+    /// run with exit 0; cache hits, skips and failures carry `None`.
+    /// The dir is deleted as soon as the step's last `BuildsIn` child
+    /// finishes (refcounted), with an end-of-run sweep as backstop.
     workspace_dir: Option<String>,
     /// True when the snapshot is ephemeral and must be cleaned up after
     /// all downstream steps finish.
@@ -93,7 +98,7 @@ pub(crate) async fn run(
     runner_registry: Arc<RunnerRegistry>,
     tx: tokio::sync::mpsc::Sender<BuildEvent>,
     cancel: CancellationToken,
-    vm_backend: Option<Arc<dyn hm_vm::VmBackend>>,
+    vm: Option<Arc<hm_vm::HmVm>>,
 ) -> crate::Result<BuildOutcome> {
     // Set up per-run state.
     let bus = EventBus::new();
@@ -139,6 +144,7 @@ pub(crate) async fn run(
         archives: archives.clone(),
         cancel: cancel.clone(),
         parent_workspace_dir: None,
+        source_base: Arc::new(tokio::sync::OnceCell::new()),
     };
 
     let parallelism = parallelism.max(1);
@@ -153,6 +159,27 @@ pub(crate) async fn run(
         crate::BackendError::Local(format!("pipeline graph has a cycle at {:?}", c.node_id()))
     })?;
 
+    // Per-node refcount of `BuildsIn` children: the only consumers of a
+    // step's kept workspace dir. Each child decrements its parent's count
+    // when it finishes (whether it ran, was skipped, or failed); the child
+    // that drops the count to zero deletes the parent's dir. Steps with no
+    // `BuildsIn` children free their own dir immediately. This caps the
+    // run's temp-space footprint at the live DAG frontier instead of
+    // accumulating one workspace copy per executed step until the end of
+    // the run (full byte copies on non-reflink filesystems, RAM on
+    // tmpfs-mounted /tmp).
+    let ws_consumers: HashMap<NodeIndex, Arc<AtomicUsize>> = order
+        .iter()
+        .map(|&n| {
+            let count = dag
+                .children(n)
+                .iter(dag)
+                .filter(|(e, _)| dag.edge_weight(*e).copied() == Some(EdgeKind::BuildsIn))
+                .count();
+            (n, Arc::new(AtomicUsize::new(count)))
+        })
+        .collect();
+
     let started_at = chrono::Utc::now();
     bus.emit(BuildEvent::BuildStart {
         run_id,
@@ -169,11 +196,19 @@ pub(crate) async fn run(
     let mut done: HashMap<NodeIndex, StepFuture> = HashMap::new();
 
     for &n in &order {
-        let preds: Vec<(EdgeKind, StepFuture)> = dag
+        // (edge kind, parent future, parent's workspace-consumer refcount)
+        let preds: Vec<(EdgeKind, StepFuture, Arc<AtomicUsize>)> = dag
             .parents(n)
             .iter(dag)
-            .map(|(e, p)| (*dag.edge_weight(e).expect("edge in DAG"), done[&p].clone()))
+            .map(|(e, p)| {
+                (
+                    *dag.edge_weight(e).expect("edge in DAG"),
+                    done[&p].clone(),
+                    Arc::clone(&ws_consumers[&p]),
+                )
+            })
             .collect();
+        let own_ws_consumers = Arc::clone(&ws_consumers[&n]);
 
         let transition = dag[n].clone();
         let node_key = transition.step.key.clone();
@@ -193,83 +228,116 @@ pub(crate) async fn run(
         let fut: StepFuture = async move {
             // Await all predecessors.
             let pred_outcomes: Vec<StepOutcome> =
-                join_all(preds.iter().map(|(_, f)| f.clone())).await;
-
-            // Early exit if any predecessor failed or the build was cancelled.
-            if cancel.is_cancelled() || pred_outcomes.iter().any(|o| o.exit_code != 0) {
-                let status = if cancel.is_cancelled() {
-                    StepStatus::Canceled
-                } else {
-                    StepStatus::Skipped
-                };
-                return StepOutcome {
-                    exit_code: 0,
-                    snapshot: None,
-                    summary: Some(StepResultSummary {
-                        step_id: Uuid::new_v4(),
-                        key: node_key,
-                        status,
-                        exit_code: None,
-                        duration_ms: 0,
-                    }),
-                    ephemeral_snapshot: false,
-                    workspace_dir: None,
-                };
-            }
-
-            // Acquire parallelism permit.
-            let _permit = sem
-                .acquire_owned()
-                .await
-                .expect("semaphore closed unexpectedly");
-
-            // Find the BuildsIn parent's snapshot and workspace dir for
-            // container lineage and COW workspace propagation.
-            let (parent_snapshot, parent_workspace_dir) = preds
-                .iter()
-                .zip(&pred_outcomes)
-                .find(|((ek, _), _)| *ek == EdgeKind::BuildsIn)
-                .map_or((None, None), |(_, outcome)| {
-                    (outcome.snapshot.clone(), outcome.workspace_dir.clone())
-                });
+                join_all(preds.iter().map(|(_, f, _)| f.clone())).await;
 
-            let mut step_ctx = run_ctx.clone();
-            step_ctx.parent_workspace_dir = parent_workspace_dir;
-
-            match execute_step(
-                n,
-                transition,
-                parent_snapshot,
-                chain_id,
-                chain_pos,
-                parent_key,
-                archive_id,
-                run_id,
-                step_ctx,
-                reg,
-                bus,
-                cancel,
-            )
-            .await
-            {
-                Ok(outcome) => outcome,
-                Err(e) => {
-                    tracing::error!(%e, "step execution failed");
-                    StepOutcome {
-                        exit_code: 1,
+            // Run the step (or short-circuit). All exit paths of this inner
+            // block flow into the workspace refcount release below.
+            let outcome = async {
+                // Early exit if any predecessor failed or the build was cancelled.
+                if cancel.is_cancelled() || pred_outcomes.iter().any(|o| o.exit_code != 0) {
+                    let status = if cancel.is_cancelled() {
+                        StepStatus::Canceled
+                    } else {
+                        StepStatus::Skipped
+                    };
+                    return StepOutcome {
+                        exit_code: 0,
                         snapshot: None,
                         summary: Some(StepResultSummary {
                             step_id: Uuid::new_v4(),
-                            key: node_key,
-                            status: StepStatus::Failed,
-                            exit_code: Some(1),
+                            key: node_key.clone(),
+                            status,
+                            exit_code: None,
                             duration_ms: 0,
                         }),
-                        workspace_dir: None,
                         ephemeral_snapshot: false,
+                        workspace_dir: None,
+                    };
+                }
+
+                // Acquire parallelism permit.
+                let _permit = sem
+                    .acquire_owned()
+                    .await
+                    .expect("semaphore closed unexpectedly");
+
+                // Find the BuildsIn parent's snapshot and workspace dir for
+                // container lineage and COW workspace propagation.
+                let (parent_snapshot, parent_workspace_dir) = preds
+                    .iter()
+                    .zip(&pred_outcomes)
+                    .find(|((ek, _, _), _)| *ek == EdgeKind::BuildsIn)
+                    .map_or((None, None), |(_, outcome)| {
+                        (outcome.snapshot.clone(), outcome.workspace_dir.clone())
+                    });
+
+                let mut step_ctx = run_ctx.clone();
+                step_ctx.parent_workspace_dir = parent_workspace_dir;
+
+                match execute_step(
+                    n,
+                    transition,
+                    parent_snapshot,
+                    chain_id,
+                    chain_pos,
+                    parent_key,
+                    archive_id,
+                    run_id,
+                    step_ctx,
+                    reg,
+                    bus,
+                    cancel,
+                )
+                .await
+                {
+                    Ok(outcome) => outcome,
+                    Err(e) => {
+                        tracing::error!(%e, "step execution failed");
+                        StepOutcome {
+                            exit_code: 1,
+                            snapshot: None,
+                            summary: Some(StepResultSummary {
+                                step_id: Uuid::new_v4(),
+                                key: node_key.clone(),
+                                status: StepStatus::Failed,
+                                exit_code: Some(1),
+                                duration_ms: 0,
+                            }),
+                            workspace_dir: None,
+                            ephemeral_snapshot: false,
+                        }
                     }
                 }
             }
+            .await;
+
+            // This step is done with its parents' workspaces (the COW copy,
+            // if any, happened inside the runner). Decrement each BuildsIn
+            // parent's consumer count; the last child to finish deletes the
+            // parent's kept dir so temp space tracks the live DAG frontier.
+            for ((kind, _, counter), pred_outcome) in preds.iter().zip(&pred_outcomes) {
+                if *kind == EdgeKind::BuildsIn
+                    && counter.fetch_sub(1, Ordering::AcqRel) == 1
+                    && let Some(ws) = pred_outcome.workspace_dir.clone()
+                {
+                    tokio::task::spawn_blocking(move || std::fs::remove_dir_all(ws).ok())
+                        .await
+                        .ok();
+                }
+            }
+            // No BuildsIn children will ever read this step's workspace:
+            // free it now. (Children, if any, observe this outcome only
+            // after this future resolves, so the load cannot race a
+            // decrement.)
+            if own_ws_consumers.load(Ordering::Acquire) == 0
+                && let Some(ws) = outcome.workspace_dir.clone()
+            {
+                tokio::task::spawn_blocking(move || std::fs::remove_dir_all(ws).ok())
+                    .await
+                    .ok();
+            }
+
+            outcome
         }
         .boxed()
         .shared();
@@ -328,21 +396,24 @@ pub(crate) async fn run(
     let steps: Vec<StepResultSummary> = outcomes.iter().filter_map(|o| o.summary.clone()).collect();
 
     // Clean up ephemeral Docker snapshots and kept temp workspace dirs.
-    // These are from uncached steps that used TempDir::keep() so children
-    // could COW-copy from them. Now that all steps have finished, the
-    // temp dirs and one-shot snapshots can be removed.
+    // Workspace state is strictly run-scoped: every `Some(workspace_dir)`
+    // names a tempdir kept alive (TempDir::keep) by a step that executed
+    // this run so children could COW-copy from it. Most dirs were already
+    // freed incrementally by the last-consumer refcount above; this pass is
+    // a backstop (`remove_dir_all` on an already-deleted path is a no-op)
+    // now that all steps have drained.
     for outcome in &outcomes {
         if outcome.ephemeral_snapshot
-            && let (Some(backend), Some(snap)) = (vm_backend.as_ref(), outcome.snapshot.as_ref())
+            && let (Some(vm), Some(snap)) = (vm.as_ref(), outcome.snapshot.as_ref())
         {
-            let snap_id = hm_vm::SnapshotId(snap.0.clone());
-            if let Err(e) = backend.remove_snapshot(&snap_id).await {
-                tracing::warn!(snapshot = %snap.0, error = %e, "failed to remove ephemeral snapshot");
-            }
+            // Guarded removal: a demoted-to-ephemeral `harmont-cache/*` tag
+            // may have been re-registered by a concurrent run since this
+            // step marked it ephemeral; destroying it would kill that run's
+            // live cache entry.
+            vm.remove_snapshot_unless_registered(&hm_vm::SnapshotId(snap.0.clone()))
+                .await;
         }
-        if let Some(ref ws) = outcome.workspace_dir
-            && outcome.ephemeral_snapshot
-        {
+        if let Some(ref ws) = outcome.workspace_dir {
             let ws = ws.clone();
             tokio::task::spawn_blocking(move || std::fs::remove_dir_all(ws).ok())
                 .await
@@ -482,50 +553,85 @@ async fn execute_step(
             )
         })?;
 
+    // Give the runner a step-scoped cancellation token (a child of the
+    // build token, so build-level cancellation still propagates). The
+    // per-step timeout FIRES this token instead of dropping the runner
+    // future: the runner tears down cooperatively — the VM layer destroys
+    // the container and reclaims bind-mount ownership of root-written
+    // files before the workspace tempdir is touched — so a timed-out step
+    // can never leak a workspace dir or race the host-side cleanup.
+    let mut run_ctx = run_ctx;
+    let step_cancel = run_ctx.cancel.child_token();
+    run_ctx.cancel = step_cancel.clone();
+
     let exec = runner.execute(&run_ctx, input);
-    let result: anyhow::Result<StepResult> = match step_timeout_secs {
+    let (result, step_timed_out): (anyhow::Result<StepResult>, bool) = match step_timeout_secs {
         Some(secs) if secs > 0 => {
-            match tokio::time::timeout(Duration::from_secs(u64::from(secs)), exec).await {
-                Ok(r) => r,
-                Err(_elapsed) => {
-                    // Per-step wall-clock budget exceeded. Emit a step-end with the
-                    // conventional timeout exit code (124), fail the chain, and
-                    // cancel siblings — same shape as a non-zero exit below.
-                    let dur_ms = started.elapsed().as_millis() as u64;
-                    bus.emit(BuildEvent::StepEnd {
-                        step_id,
-                        exit_code: 124,
-                        duration_ms: dur_ms,
-                        snapshot: None,
-                    });
-                    bus.emit(BuildEvent::ChainFailed {
-                        chain_idx: chain_id,
-                        failed_step_id: step_id,
-                        failed_step_key: step_key.clone(),
-                        exit_code: 124,
-                        message: format!("step '{step_key}' timed out after {secs}s"),
-                        ts: chrono::Utc::now(),
-                    });
-                    cancel.cancel();
-                    return Ok(StepOutcome {
-                        exit_code: 124,
-                        snapshot: None,
-                        summary: Some(StepResultSummary {
-                            step_id,
-                            key: step_key.clone(),
-                            status: StepStatus::TimedOut,
-                            exit_code: Some(124),
-                            duration_ms: dur_ms,
-                        }),
-                        workspace_dir: None,
-                        ephemeral_snapshot: false,
-                    });
+            tokio::pin!(exec);
+            tokio::select! {
+                r = &mut exec => (r, false),
+                () = tokio::time::sleep(Duration::from_secs(u64::from(secs))) => {
+                    step_cancel.cancel();
+                    // Await the cooperative teardown to completion; never
+                    // drop the in-flight future.
+                    (exec.await, true)
                 }
             }
         }
-        _ => exec.await,
+        _ => (exec.await, false),
     };
 
+    if step_timed_out {
+        // Per-step wall-clock budget exceeded. Emit a step-end with the
+        // conventional timeout exit code (124), fail the chain, and cancel
+        // siblings — same shape as a non-zero exit below. Whatever the
+        // post-cancel teardown returned is superseded by the timeout
+        // verdict, but any resources it reports (a kept workspace dir or
+        // an ephemeral snapshot, if the step happened to finish in the
+        // cancellation race window) are carried into the outcome so the
+        // scheduler's cleanup passes reclaim them.
+        let dur_ms = started.elapsed().as_millis() as u64;
+        let (snapshot, workspace_dir, ephemeral_snapshot) = match result {
+            Ok(sr) => (
+                sr.committed_snapshot,
+                sr.workspace_dir,
+                sr.ephemeral_snapshot,
+            ),
+            Err(_) => (None, None, false),
+        };
+        bus.emit(BuildEvent::StepEnd {
+            step_id,
+            exit_code: 124,
+            duration_ms: dur_ms,
+            snapshot: None,
+        });
+        bus.emit(BuildEvent::ChainFailed {
+            chain_idx: chain_id,
+            failed_step_id: step_id,
+            failed_step_key: step_key.clone(),
+            exit_code: 124,
+            message: format!(
+                "step '{step_key}' timed out after {}s",
+                step_timeout_secs.unwrap_or(0)
+            ),
+            ts: chrono::Utc::now(),
+        });
+        cancel.cancel();
+        return Ok(StepOutcome {
+            exit_code: 124,
+            snapshot,
+            summary: Some(StepResultSummary {
+                step_id,
+                key: step_key.clone(),
+                status: StepStatus::TimedOut,
+                exit_code: Some(124),
+                duration_ms: dur_ms,
+            }),
+            workspace_dir,
+            ephemeral_snapshot,
+        });
+    }
+
     let dur_ms = started.elapsed().as_millis() as u64;
     match result {
         Ok(sr) => {
@@ -661,3 +767,267 @@ fn compute_chain_info(dag: &Dag<Transition, EdgeKind>) -> ChainInfo {
         node_chain_pos,
     }
 }
+
+#[cfg(test)]
+#[allow(clippy::unwrap_used)]
+mod tests {
+    use super::*;
+    use crate::local::runner::{StepContext, StepRunner};
+    use hm_plugin_protocol::{ExecutorInput, StepResult};
+    use std::future::Future;
+    use std::pin::Pin;
+    use std::sync::Mutex;
+
+    /// Runner stub that materializes a kept tempdir per step (mirroring the
+    /// VM runner's workspace contract) and records, at the start of each
+    /// step, which previously produced workspace dirs still exist on disk.
+    /// This makes the scheduler's incremental workspace reclamation
+    /// observable mid-run.
+    #[derive(Debug, Default)]
+    struct WorkspaceProbeRunner {
+        /// `(step key, kept workspace dir)` in execution order.
+        dirs: Mutex<Vec<(String, PathBuf)>>,
+        /// step key -> keys of earlier steps whose dirs were still on disk
+        /// when this step started.
+        observed: Mutex<HashMap<String, Vec<String>>>,
+    }
+
+    impl StepRunner for WorkspaceProbeRunner {
+        fn name(&self) -> &'static str {
+            "probe"
+        }
+
+        fn execute(
+            &self,
+            _ctx: &StepContext,
+            input: ExecutorInput,
+        ) -> Pin<Box<dyn Future<Output = anyhow::Result<StepResult>> + Send + '_>> {
+            Box::pin(async move {
+                let key = input.step.key.clone();
+                let alive: Vec<String> = self
+                    .dirs
+                    .lock()
+                    .unwrap()
+                    .iter()
+                    .filter(|(_, p)| p.exists())
+                    .map(|(k, _)| k.clone())
+                    .collect();
+                self.observed.lock().unwrap().insert(key.clone(), alive);
+
+                let ws = tempfile::tempdir().unwrap().keep();
+                self.dirs.lock().unwrap().push((key, ws.clone()));
+                Ok(StepResult {
+                    exit_code: 0,
+                    committed_snapshot: None,
+                    artifacts: vec![],
+                    workspace_dir: Some(ws.display().to_string()),
+                    ephemeral_snapshot: false,
+                })
+            })
+        }
+    }
+
+    /// Build a [`PipelineGraph`] from step keys plus `(from, to)` `BuildsIn`
+    /// edges (indices into `keys`).
+    fn graph_with_edges(keys: &[&str], edges: &[(usize, usize)]) -> PipelineGraph {
+        let nodes: Vec<serde_json::Value> = keys
+            .iter()
+            .map(|k| serde_json::json!({ "step": { "key": k, "cmd": "true" }, "env": {} }))
+            .collect();
+        let edges: Vec<serde_json::Value> = edges
+            .iter()
+            .map(|(a, b)| serde_json::json!([a, b, "builds_in"]))
+            .collect();
+        serde_json::from_value(serde_json::json!({
+            "version": "0",
+            "graph": {
+                "nodes": nodes,
+                "node_holes": [],
+                "edge_property": "directed",
+                "edges": edges,
+            }
+        }))
+        .unwrap()
+    }
+
+    async fn run_probe(
+        graph: PipelineGraph,
+        runner: Arc<WorkspaceProbeRunner>,
+    ) -> crate::BuildOutcome {
+        let repo = tempfile::tempdir().unwrap();
+        std::fs::write(repo.path().join("marker.txt"), "v1").unwrap();
+        let mut registry = RunnerRegistry::new();
+        registry.register(runner, true);
+        // Hold `_rx` so the event forwarder keeps a live receiver; the
+        // handful of events a tiny pipeline emits fits the channel.
+        let (tx, _rx) = tokio::sync::mpsc::channel(1024);
+        run(
+            graph,
+            repo.path().to_path_buf(),
+            "test-pipeline".into(),
+            1,
+            Arc::new(registry),
+            tx,
+            CancellationToken::new(),
+            None,
+        )
+        .await
+        .unwrap()
+    }
+
+    /// Chain a -> b -> c: a's kept workspace must be deleted as soon as its
+    /// only `BuildsIn` child (b) finishes — i.e. before c starts — not at the
+    /// end of the run. This caps temp-space at the live DAG frontier.
+    #[tokio::test]
+    async fn chain_frees_parent_workspace_when_last_child_finishes() {
+        let runner = Arc::new(WorkspaceProbeRunner::default());
+        let graph = graph_with_edges(&["a", "b", "c"], &[(0, 1), (1, 2)]);
+
+        let outcome = run_probe(graph, Arc::clone(&runner)).await;
+        assert_eq!(outcome.status, crate::BuildStatus::Passed);
+
+        let observed = runner.observed.lock().unwrap().clone();
+        // b starts while a's dir is alive (it COWs from it)...
+        assert_eq!(observed["b"], vec!["a".to_owned()]);
+        // ...but by the time c starts, b (a's last consumer) has finished
+        // and a's dir is already gone. Only b's dir is alive.
+        assert_eq!(observed["c"], vec!["b".to_owned()]);
+
+        // Backstop: nothing survives the run.
+        for (_, dir) in runner.dirs.lock().unwrap().iter() {
+            assert!(!dir.exists(), "workspace dir leaked: {}", dir.display());
+        }
+    }
+
+    /// Fork a -> {b, c}: the first child to finish must NOT free a's dir
+    /// (its sibling still needs it); the last one does. A leaf's own dir
+    /// (b has no `BuildsIn` children) is freed as soon as the leaf finishes.
+    #[tokio::test]
+    async fn fork_frees_parent_workspace_only_after_last_sibling() {
+        let runner = Arc::new(WorkspaceProbeRunner::default());
+        let graph = graph_with_edges(&["a", "b", "c"], &[(0, 1), (0, 2)]);
+
+        let outcome = run_probe(graph, Arc::clone(&runner)).await;
+        assert_eq!(outcome.status, crate::BuildStatus::Passed);
+
+        let observed = runner.observed.lock().unwrap().clone();
+        let exec_order: Vec<String> = runner
+            .dirs
+            .lock()
+            .unwrap()
+            .iter()
+            .map(|(k, _)| k.clone())
+            .collect();
+        assert_eq!(exec_order[0], "a");
+        // The first sibling sees a alive. The second sibling must STILL see
+        // a alive: it is a's remaining consumer, so the first sibling's
+        // completion must not have freed a's dir. (The first sibling's own
+        // leaf dir is reclaimed concurrently with the second sibling's
+        // start — the permit is released before the cleanup runs — so no
+        // assertion is made about it mid-run; the leak check below covers
+        // it.)
+        assert_eq!(observed[&exec_order[1]], vec!["a".to_owned()]);
+        assert!(observed[&exec_order[2]].contains(&"a".to_owned()));
+
+        for (_, dir) in runner.dirs.lock().unwrap().iter() {
+            assert!(!dir.exists(), "workspace dir leaked: {}", dir.display());
+        }
+    }
+
+    /// Runner that hangs until its step-scoped cancellation token fires,
+    /// then performs (simulated) teardown work before returning — mirroring
+    /// the VM runner, whose post-cancel path destroys the container and
+    /// reclaims workspace ownership before resolving.
+    #[derive(Debug, Default)]
+    struct CooperativeHangRunner {
+        /// Keys of steps whose futures ran to completion (were awaited
+        /// through teardown rather than dropped).
+        torn_down: Mutex<Vec<String>>,
+    }
+
+    impl StepRunner for CooperativeHangRunner {
+        fn name(&self) -> &'static str {
+            "hang"
+        }
+
+        fn execute(
+            &self,
+            ctx: &StepContext,
+            input: ExecutorInput,
+        ) -> Pin<Box<dyn Future<Output = anyhow::Result<StepResult>> + Send + '_>> {
+            let cancel = ctx.cancel.clone();
+            Box::pin(async move {
+                #[allow(
+                    clippy::duration_suboptimal_units,
+                    reason = "from_hours is nightly-only"
+                )]
+                const HANG: Duration = Duration::from_secs(3600);
+                tokio::select! {
+                    () = cancel.cancelled() => {}
+                    () = tokio::time::sleep(HANG) => {}
+                }
+                // Teardown must be awaited by the scheduler, never cut
+                // short by a dropped future.
+                tokio::time::sleep(Duration::from_millis(100)).await;
+                self.torn_down.lock().unwrap().push(input.step.key.clone());
+                Ok(StepResult {
+                    exit_code: 130,
+                    committed_snapshot: None,
+                    artifacts: vec![],
+                    workspace_dir: None,
+                    ephemeral_snapshot: false,
+                })
+            })
+        }
+    }
+
+    /// A per-step timeout must cancel the runner COOPERATIVELY and await
+    /// its teardown to completion (dropping the in-flight future would
+    /// race the container's bind-mount ownership reclaim and leak the
+    /// workspace dir), while still reporting the step as timed out.
+    #[tokio::test]
+    async fn step_timeout_awaits_cooperative_teardown() {
+        let runner = Arc::new(CooperativeHangRunner::default());
+        let graph: PipelineGraph = serde_json::from_value(serde_json::json!({
+            "version": "0",
+            "graph": {
+                "nodes": [{
+                    "step": { "key": "slow", "cmd": "true", "timeout_seconds": 1 },
+                    "env": {}
+                }],
+                "node_holes": [],
+                "edge_property": "directed",
+                "edges": [],
+            }
+        }))
+        .unwrap();
+
+        let repo = tempfile::tempdir().unwrap();
+        std::fs::write(repo.path().join("marker.txt"), "v1").unwrap();
+        let mut registry = RunnerRegistry::new();
+        registry.register(Arc::clone(&runner) as Arc<dyn StepRunner>, true);
+        let (tx, _rx) = tokio::sync::mpsc::channel(1024);
+        let outcome = run(
+            graph,
+            repo.path().to_path_buf(),
+            "test-pipeline".into(),
+            1,
+            Arc::new(registry),
+            tx,
+            CancellationToken::new(),
+            None,
+        )
+        .await
+        .unwrap();
+
+        // The runner's future was awaited through its teardown...
+        assert_eq!(
+            runner.torn_down.lock().unwrap().clone(),
+            vec!["slow".to_owned()]
+        );
+        // ...and the step is still reported as timed out.
+        assert_eq!(outcome.steps.len(), 1);
+        assert_eq!(outcome.steps[0].status, StepStatus::TimedOut);
+        assert_eq!(outcome.steps[0].exit_code, Some(124));
+    }
+}
diff --git a/crates/hm-vm/Cargo.toml b/crates/hm-vm/Cargo.toml
index 9a7ddc23..51905e3a 100644
--- a/crates/hm-vm/Cargo.toml
+++ b/crates/hm-vm/Cargo.toml
@@ -9,6 +9,7 @@ publish = false
 anyhow = { workspace = true }
 async-trait = { workspace = true }
 tokio = { workspace = true }
+tokio-util = { workspace = true }
 tracing = { workspace = true }
 derive_more = { workspace = true }
 
diff --git a/crates/hm-vm/src/backend.rs b/crates/hm-vm/src/backend.rs
index 1401e617..74465b43 100644
--- a/crates/hm-vm/src/backend.rs
+++ b/crates/hm-vm/src/backend.rs
@@ -31,6 +31,32 @@ pub trait VmBackend: Send + Sync + fmt::Debug {
 
     /// Delete a snapshot from the backend store.
     async fn remove_snapshot(&self, snapshot: &SnapshotId) -> Result<()>;
+
+    /// Best-effort garbage collection of snapshots matching `reference`
+    /// (a backend-specific reference filter pattern) whose creation time is
+    /// older than `older_than`. Returns the number of snapshots removed.
+    ///
+    /// `keep` is consulted once per matching snapshot tag; tags for which
+    /// it returns `true` are retained. Callers use it to protect snapshots
+    /// that are still referenced (e.g. by the snapshot registry) so that GC
+    /// only ever removes orphans.
+    ///
+    /// Backends without a notion of aged snapshot storage may keep the
+    /// default no-op implementation.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error only when the backend cannot be queried at all;
+    /// per-snapshot removal failures are logged and skipped.
+    async fn gc_snapshots(
+        &self,
+        reference: &str,
+        older_than: std::time::Duration,
+        keep: &(dyn for<'a> Fn(&'a str) -> bool + Send + Sync),
+    ) -> Result<u64> {
+        let _ = (reference, older_than, keep);
+        Ok(0)
+    }
 }
 
 /// Handle to a running virtual machine.
diff --git a/crates/hm-vm/src/docker.rs b/crates/hm-vm/src/docker.rs
index 062feb02..3509b91a 100644
--- a/crates/hm-vm/src/docker.rs
+++ b/crates/hm-vm/src/docker.rs
@@ -5,6 +5,7 @@
 //! image commits.
 
 use std::collections::HashMap;
+use std::sync::atomic::{AtomicBool, Ordering};
 
 use anyhow::{Context, Result};
 use async_trait::async_trait;
@@ -77,6 +78,33 @@ impl DockerBackend {
         Ok(!images.is_empty())
     }
 
+    /// Compute the host-ownership reclaim spec for a bind-mounted workspace.
+    ///
+    /// Steps run as the image's default user (typically root) and write
+    /// straight into the bind-mounted host directory, so on native Linux
+    /// Docker the files land on the host owned by uid 0. The unprivileged
+    /// `hm` user then can neither COW-copy 0600/0700 entries into child
+    /// workspaces nor delete them during cleanup. The reclaim spec records
+    /// the owner of the host tempdir itself (always the `hm` user that
+    /// created it) so [`reclaim_workspace_ownership`] can chown everything
+    /// back before the host touches the directory again.
+    #[cfg(unix)]
+    fn reclaim_spec(workspace: Option<&WorkspaceMount>) -> Option<WorkspaceReclaim> {
+        use std::os::unix::fs::MetadataExt;
+        let ws = workspace?;
+        let meta = std::fs::metadata(&ws.host_path).ok()?;
+        Some(WorkspaceReclaim {
+            guest_path: ws.guest_path.clone(),
+            uid: meta.uid(),
+            gid: meta.gid(),
+        })
+    }
+
+    #[cfg(not(unix))]
+    fn reclaim_spec(_workspace: Option<&WorkspaceMount>) -> Option<WorkspaceReclaim> {
+        None
+    }
+
     #[instrument(skip(self))]
     async fn start_container(
         &self,
@@ -102,10 +130,38 @@ impl DockerBackend {
             .create_container(None::<CreateContainerOptions<String>>, cfg)
             .await
             .context("create container")?;
-        self.client
+        if let Err(e) = self
+            .client
             .start_container(&create.id, None::<StartContainerOptions<String>>)
             .await
-            .context("start container")?;
+        {
+            // The daemon validates bind-mount sources at START time (e.g.
+            // Docker Desktop's file-sharing allowlist), so this path is
+            // reachable on every step. No `Vm` handle exists yet — neither
+            // `HmVm::execute`'s `destroy()` nor `DockerVm`'s `Drop` backstop
+            // will ever see this container, and image GC cannot reclaim an
+            // image pinned by a Created-state container — so remove it here,
+            // best-effort, before propagating the error.
+            if let Err(rm_err) = self
+                .client
+                .remove_container(
+                    &create.id,
+                    Some(RemoveContainerOptions {
+                        force: true,
+                        v: true,
+                        ..Default::default()
+                    }),
+                )
+                .await
+            {
+                tracing::warn!(
+                    container = %create.id,
+                    error = %rm_err,
+                    "failed to remove container that never started"
+                );
+            }
+            return Err(anyhow::Error::new(e).context("start container"));
+        }
         Ok(create.id)
     }
 }
@@ -124,6 +180,8 @@ impl VmBackend for DockerBackend {
         Ok(Box::new(DockerVm {
             client: self.client.clone(),
             container_id: Some(container_id),
+            workspace_reclaim: Self::reclaim_spec(workspace),
+            exec_in_flight: AtomicBool::new(false),
         }))
     }
 
@@ -138,6 +196,8 @@ impl VmBackend for DockerBackend {
         Ok(Box::new(DockerVm {
             client: self.client.clone(),
             container_id: Some(container_id),
+            workspace_reclaim: Self::reclaim_spec(workspace),
+            exec_in_flight: AtomicBool::new(false),
         }))
     }
 
@@ -161,6 +221,170 @@ impl VmBackend for DockerBackend {
             .with_context(|| format!("removing image '{}'", snapshot.0))?;
         Ok(())
     }
+
+    #[instrument(skip(self, keep))]
+    async fn gc_snapshots(
+        &self,
+        reference: &str,
+        older_than: std::time::Duration,
+        keep: &(dyn for<'a> Fn(&'a str) -> bool + Send + Sync),
+    ) -> Result<u64> {
+        let mut filters = HashMap::new();
+        filters.insert("reference".to_string(), vec![reference.to_string()]);
+        let images = self
+            .client
+            .list_images(Some(ListImagesOptions {
+                filters,
+                ..Default::default()
+            }))
+            .await
+            .with_context(|| format!("listing images for GC reference '{reference}'"))?;
+
+        let cutoff: i64 = std::time::SystemTime::now()
+            .duration_since(std::time::UNIX_EPOCH)
+            .unwrap_or_default()
+            .saturating_sub(older_than)
+            .as_secs()
+            .try_into()
+            .unwrap_or(i64::MAX);
+
+        let mut removed: u64 = 0;
+        for image in images {
+            if image.created >= cutoff {
+                continue;
+            }
+            // Remove by tag, never by image id: the daemon's `reference`
+            // filter returns whole images, and a multi-tag image may carry
+            // tags outside the GC pattern (or tags the keeper protects).
+            // Untagging only the matched, unprotected references leaves
+            // every other tag pointing at the image intact; the image data
+            // itself is reclaimed when its last tag goes.
+            for tag in &image.repo_tags {
+                if !reference_matches(reference, tag) {
+                    continue;
+                }
+                if keep(tag) {
+                    tracing::debug!(image = %tag, "GC keeping referenced snapshot");
+                    continue;
+                }
+                match self
+                    .client
+                    .remove_image(
+                        tag,
+                        Some(RemoveImageOptions {
+                            force: true,
+                            noprune: false,
+                        }),
+                        None,
+                    )
+                    .await
+                {
+                    Ok(_) => removed += 1,
+                    Err(e) => {
+                        tracing::warn!(image = %tag, error = %e, "failed to GC snapshot image");
+                    }
+                }
+            }
+        }
+        Ok(removed)
+    }
+}
+
+/// Client-side check that a `repo:tag` reference belongs to a GC pattern.
+///
+/// The daemon's `reference` filter pre-selects *images*, but a multi-tag
+/// image's `repo_tags` can include tags outside the pattern; only matching
+/// tags may be untagged. Supports the two pattern shapes the GC uses:
+///
+/// - an exact repository name (matches every tag of exactly that repo);
+/// - `<repo>/*` (matches repositories exactly one path component below
+///   `<repo>`, mirroring Docker's reference-filter glob semantics).
+fn reference_matches(pattern: &str, repo_tag: &str) -> bool {
+    let repo = repo_tag.rsplit_once(':').map_or(repo_tag, |(r, _)| r);
+    pattern.strip_suffix("/*").map_or_else(
+        || repo == pattern,
+        |prefix| {
+            repo.strip_prefix(prefix).is_some_and(|rest| {
+                rest.strip_prefix('/')
+                    .is_some_and(|name| !name.is_empty() && !name.contains('/'))
+            })
+        },
+    )
+}
+
+/// Ownership-reclaim parameters for a bind-mounted workspace: chown the
+/// guest path back to the host user that owns the workspace tempdir.
+#[derive(Debug, Clone)]
+struct WorkspaceReclaim {
+    guest_path: String,
+    uid: u32,
+    gid: u32,
+}
+
+/// Best-effort: chown the bind-mounted workspace back to the host user.
+///
+/// Runs `chown -R <uid>:<gid> <guest_path>` inside the container as root
+/// (regardless of the image's default user). Because a bind mount shares
+/// inodes with the host, this restores host-side ownership of every file
+/// the step wrote as root, so the unprivileged host user can COW-copy the
+/// workspace into children and delete it during cleanup. On Docker Desktop
+/// (macOS) ownership is already remapped by the file sharing layer and
+/// this is a harmless no-op.
+///
+/// Failures are logged, never propagated: a missing `chown` binary or a
+/// dead container must not fail the build — host-side cleanup will then
+/// warn about anything it cannot remove.
+async fn reclaim_workspace_ownership(client: &Docker, container_id: &str, spec: &WorkspaceReclaim) {
+    let owner = format!("{}:{}", spec.uid, spec.gid);
+    let exec = match client
+        .create_exec(
+            container_id,
+            CreateExecOptions {
+                cmd: Some(vec!["chown", "-R", &owner, &spec.guest_path]),
+                user: Some("0:0"),
+                attach_stdout: Some(true),
+                attach_stderr: Some(true),
+                ..Default::default()
+            },
+        )
+        .await
+    {
+        Ok(exec) => exec,
+        Err(e) => {
+            tracing::warn!(container = %container_id, error = %e,
+                "failed to create workspace ownership-reclaim exec");
+            return;
+        }
+    };
+
+    match client.start_exec(&exec.id, None).await {
+        Ok(StartExecResults::Attached { mut output, .. }) => {
+            // Drain so the exec runs to completion.
+            while let Some(item) = output.next().await {
+                if item.is_err() {
+                    break;
+                }
+            }
+        }
+        Ok(StartExecResults::Detached) => {}
+        Err(e) => {
+            tracing::warn!(container = %container_id, error = %e,
+                "failed to start workspace ownership-reclaim exec");
+            return;
+        }
+    }
+
+    match client.inspect_exec(&exec.id).await {
+        Ok(inspect) if inspect.exit_code.unwrap_or(0) == 0 => {}
+        Ok(inspect) => {
+            tracing::warn!(container = %container_id, exit_code = ?inspect.exit_code,
+                "workspace ownership reclaim (chown) exited non-zero; host-side cleanup may be incomplete");
+        }
+        Err(e) => {
+            tracing::warn!(container = %container_id, error = %e,
+                "failed to inspect workspace ownership-reclaim exec");
+        }
+    }
 }
 
 /// Handle to a running Docker container acting as a VM.
@@ -169,22 +393,86 @@ struct DockerVm {
     #[debug(skip)]
     client: Docker,
     container_id: Option<String>,
+    /// When a workspace is bind-mounted, the chown target used to hand
+    /// root-written files back to the host user before teardown.
+    workspace_reclaim: Option<WorkspaceReclaim>,
+    /// True while a command may still be running inside the container.
+    ///
+    /// Set when `exec` starts and cleared only when it completes cleanly,
+    /// so a cancelled/timed-out (dropped) exec future — or a stream error
+    /// of unknown outcome — leaves it set. Teardown uses it to quiesce the
+    /// container (SIGKILL every process, then restart the idle `sleep`)
+    /// before the ownership-reclaim chown, so no in-container writer can
+    /// race the chown or dirty the bind mount afterwards.
+    exec_in_flight: AtomicBool,
+}
+
+/// Shared container teardown used by both [`DockerVm::destroy`] (awaited)
+/// and [`DockerVm::drop`] (detached backstop).
+///
+/// Order matters: when `exec_in_flight` is set, a cancelled or timed-out
+/// command may still be writing into the bind mount as root, so the
+/// container is stopped first (SIGKILL, quiescing every process) and then
+/// restarted (its command is `sleep infinity`) so the reclaim chown runs
+/// with no concurrent writers. Only after the chown does the final
+/// stop/remove happen — nothing can re-dirty the workspace between the
+/// chown and the host-side removal of the directory.
+async fn teardown_container(
+    client: &Docker,
+    id: &str,
+    reclaim: Option<WorkspaceReclaim>,
+    exec_in_flight: bool,
+) -> Result<()> {
+    if let Some(spec) = reclaim {
+        let mut can_reclaim = true;
+        if exec_in_flight {
+            let _ = client
+                .stop_container(id, Some(StopContainerOptions { t: 0 }))
+                .await;
+            if let Err(e) = client
+                .start_container(id, None::<StartContainerOptions<String>>)
+                .await
+            {
+                tracing::warn!(container = %id, error = %e,
+                    "failed to restart container for workspace ownership reclaim; \
+                     host-side workspace cleanup may be incomplete");
+                can_reclaim = false;
+            }
+        }
+        if can_reclaim {
+            reclaim_workspace_ownership(client, id, &spec).await;
+        }
+    }
+    let _ = client
+        .stop_container(id, Some(StopContainerOptions { t: 0 }))
+        .await;
+    client
+        .remove_container(
+            id,
+            Some(RemoveContainerOptions {
+                force: true,
+                v: true,
+                ..Default::default()
+            }),
+        )
+        .await
+        .with_context(|| format!("removing container '{id}'"))?;
+    Ok(())
 }
 
 impl Drop for DockerVm {
     fn drop(&mut self) {
         if let Some(id) = self.container_id.take() {
             let client = self.client.clone();
+            let reclaim = self.workspace_reclaim.take();
+            let exec_in_flight = self.exec_in_flight.load(Ordering::Acquire);
             tokio::spawn(async move {
-                let opts = StopContainerOptions { t: 0 };
-                let _ = client.stop_container(&id, Some(opts)).await;
-                let rm = RemoveContainerOptions {
-                    force: true,
-                    v: true,
-                    ..Default::default()
-                };
-                let _ = client.remove_container(&id, Some(rm)).await;
-                tracing::debug!(container = %id, "dropped container cleaned up");
+                match teardown_container(&client, &id, reclaim, exec_in_flight).await {
+                    Ok(()) => tracing::debug!(container = %id, "dropped container cleaned up"),
+                    Err(e) => {
+                        tracing::warn!(container = %id, error = %e, "dropped container cleanup failed");
+                    }
+                }
             });
         }
     }
@@ -204,6 +492,12 @@ impl Vm for DockerVm {
             .container_id
             .as_deref()
             .context("container already destroyed")?;
+        // Mark the container as possibly-running-a-command until we have
+        // proof of completion. Cleared only on the clean-return path below;
+        // a dropped (cancelled/timed-out) future or a stream error leaves
+        // it set so teardown quiesces the container before the workspace
+        // ownership reclaim.
+        self.exec_in_flight.store(true, Ordering::Release);
         let env_strings: Vec<String> = env.iter().map(|(k, v)| format!("{k}={v}")).collect();
         let exec = self
             .client
@@ -266,6 +560,9 @@ impl Vm for DockerVm {
             reason = "docker exit codes fit in i32"
         )]
         let exit_code = inspect.exit_code.unwrap_or(0) as i32;
+        // The command finished and its exit code was observed: no process
+        // of ours is writing to the bind mount any more.
+        self.exec_in_flight.store(false, Ordering::Release);
         Ok(exit_code)
     }
 
@@ -309,21 +606,63 @@ impl Vm for DockerVm {
         let Some(id) = self.container_id.take() else {
             return Ok(());
         };
-        let _ = self
-            .client
-            .stop_container(&id, Some(StopContainerOptions { t: 0 }))
-            .await;
-        self.client
-            .remove_container(
-                &id,
-                Some(RemoveContainerOptions {
-                    force: true,
-                    v: true,
-                    ..Default::default()
-                }),
-            )
-            .await
-            .with_context(|| format!("removing container '{id}'"))?;
-        Ok(())
+        // Hand root-written workspace files back to the host user before
+        // teardown: `HmVm::execute` always destroys the VM (awaited, even
+        // on cancellation) before the runner keeps or drops the tempdir,
+        // so the reclaim happens-before every host-side read or removal of
+        // the workspace. When the command was cut short (`exec_in_flight`
+        // still set) the container is quiesced first so no in-container
+        // writer can race or follow the chown.
+        let reclaim = self.workspace_reclaim.take();
+        let exec_in_flight = self.exec_in_flight.load(Ordering::Acquire);
+        teardown_container(&self.client, &id, reclaim, exec_in_flight).await
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::reference_matches;
+
+    #[test]
+    fn exact_repo_pattern_matches_every_tag_of_that_repo() {
+        assert!(reference_matches("ephemeral", "ephemeral:latest"));
+        assert!(reference_matches(
+            "ephemeral",
+            "ephemeral:0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcd"
+        ));
+        assert!(reference_matches(
+            "harmont-ephemeral",
+            "harmont-ephemeral:3c5e0fda-1111-2222-3333-444444444444"
+        ));
+    }
+
+    #[test]
+    fn exact_repo_pattern_rejects_other_repos() {
+        // A user's own image whose repo merely starts with the pattern
+        // must never be swept.
+        assert!(!reference_matches("ephemeral", "ephemeral-test:latest"));
+        assert!(!reference_matches("ephemeral", "my-ephemeral:latest"));
+        assert!(!reference_matches("ephemeral", "ephemeral/sub:latest"));
+    }
+
+    #[test]
+    fn wildcard_pattern_matches_one_path_component() {
+        assert!(reference_matches(
+            "harmont-cache/*",
+            "harmont-cache/build:0123456789abcdef"
+        ));
+        assert!(!reference_matches(
+            "harmont-cache/*",
+            "harmont-cache:latest"
+        ));
+        assert!(!reference_matches(
+            "harmont-cache/*",
+            "harmont-cachex/build:latest"
+        ));
+        assert!(!reference_matches(
+            "harmont-cache/*",
+            "harmont-cache/a/b:latest"
+        ));
+        assert!(!reference_matches("harmont-cache/*", "other/build:latest"));
     }
 }
diff --git a/crates/hm-vm/src/registry.rs b/crates/hm-vm/src/registry.rs
index 960fc922..11c59fa0 100644
--- a/crates/hm-vm/src/registry.rs
+++ b/crates/hm-vm/src/registry.rs
@@ -61,7 +61,8 @@ impl ImageRegistry {
 
         conn.execute_batch(
             "PRAGMA journal_mode = WAL;
-             PRAGMA synchronous = NORMAL;",
+             PRAGMA synchronous = NORMAL;
+             PRAGMA busy_timeout = 5000;",
         )?;
 
         conn.execute_batch(
@@ -127,34 +128,52 @@ impl ImageRegistry {
         result.map(|(snap, ws)| (SnapshotId(snap), ws))
     }
 
-    /// Insert or update a cache entry, optionally with a workspace directory.
+    /// Insert or update a cache entry.
     ///
-    /// Returns the evicted entries (snapshot ID and optional workspace directory)
-    /// to keep the registry within its configured capacity. The caller is
-    /// responsible for cleaning up backend resources and workspace directories
-    /// associated with evicted entries.
+    /// The `workspace_dir` column is always written as `NULL`: the registry
+    /// stores system-state snapshots only. Workspace state is strictly
+    /// run-scoped and never persisted across runs (non-`NULL` values are
+    /// legacy rows kept solely so their directories can be reaped).
     ///
-    /// Upsert + eviction run under a single lock acquisition to prevent
-    /// TOCTOU races between concurrent callers.
+    /// Returns the evicted entries (snapshot ID and optional legacy workspace
+    /// directory) to keep the registry within its configured capacity. The
+    /// caller is responsible for cleaning up backend resources and legacy
+    /// workspace directories associated with evicted entries.
+    ///
+    /// Upsert + eviction run inside a single transaction so concurrent
+    /// writers (including other processes sharing the database file) can
+    /// never observe a partially-applied put.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the registry mutex is poisoned or any statement
+    /// fails; the caller must treat the snapshot as unregistered.
+    #[allow(
+        clippy::significant_drop_tightening,
+        reason = "the transaction borrows the guarded connection until commit"
+    )]
     pub fn put(
         &self,
         key: &str,
         snapshot: &SnapshotId,
-        workspace_dir: Option<&str>,
-    ) -> Vec<(SnapshotId, Option<String>)> {
+    ) -> Result<Vec<(SnapshotId, Option<String>)>> {
         let now = epoch_secs();
 
-        let Ok(conn) = self.conn.lock() else {
-            return Vec::new();
-        };
+        let mut conn = self
+            .conn
+            .lock()
+            .map_err(|_| anyhow::anyhow!("registry mutex poisoned"))?;
+        let tx = conn.transaction()?;
 
-        let _result = conn.execute(
-            "INSERT OR REPLACE INTO snapshots (key, snapshot_id, accessed_at, workspace_dir)
-             VALUES (?1, ?2, ?3, ?4)",
-            rusqlite::params![key, snapshot.0, now, workspace_dir],
-        );
+        tx.execute(
+            "INSERT OR REPLACE INTO snapshots (key, snapshot_id, accessed_at)
+             VALUES (?1, ?2, ?3)",
+            rusqlite::params![key, snapshot.0, now],
+        )?;
 
-        Self::evict_overflow_locked(&conn, self.capacity)
+        let evicted = Self::evict_overflow_tx(&tx, self.capacity)?;
+        tx.commit()?;
+        Ok(evicted)
     }
 
     /// Remove a specific entry.
@@ -182,6 +201,61 @@ impl ImageRegistry {
         row.map(|(snap, ws)| (SnapshotId(snap), ws))
     }
 
+    /// Compare-and-delete: remove the entry for `key` only if it still maps
+    /// to `expected`.
+    ///
+    /// This closes the race where a stale entry is observed, the lock is
+    /// released for an async backend check, and a fresh entry is inserted
+    /// under the same key before the invalidation lands — a plain
+    /// [`Self::invalidate`] would destroy the fresh entry.
+    ///
+    /// Returns `Some(legacy_workspace_dir)` when the observed row was
+    /// deleted, or `None` when the key is absent or now holds a different
+    /// snapshot (the concurrently re-inserted entry survives).
+    #[must_use]
+    #[allow(
+        clippy::significant_drop_tightening,
+        reason = "the prepared statement borrows the guarded connection"
+    )]
+    pub fn invalidate_if(&self, key: &str, expected: &SnapshotId) -> Option<Option<String>> {
+        let conn = self.conn.lock().ok()?;
+        let mut stmt = conn
+            .prepare(
+                "DELETE FROM snapshots WHERE key = ?1 AND snapshot_id = ?2
+                 RETURNING workspace_dir",
+            )
+            .ok()?;
+        let mut rows = stmt
+            .query_map(rusqlite::params![key, expected.0], |row| {
+                row.get::<_, Option<String>>(0)
+            })
+            .ok()?;
+        let legacy_ws = rows.next()?.ok()?;
+        Some(legacy_ws)
+    }
+
+    /// Returns `true` if any entry currently maps to `snapshot`.
+    ///
+    /// Used as a pre-removal guard for deferred eviction cleanup: a tag that
+    /// was evicted earlier may have been re-registered since (by a later step
+    /// in this run or by a concurrent process); Docker re-tagging means the
+    /// tag now names the *fresh* image, so removing it would destroy a live
+    /// cache entry.
+    #[must_use]
+    pub fn contains_snapshot(&self, snapshot: &SnapshotId) -> bool {
+        let Ok(conn) = self.conn.lock() else {
+            // A poisoned lock means we cannot prove the snapshot is unused;
+            // report it as referenced so callers err on the side of keeping it.
+            return true;
+        };
+        conn.query_row(
+            "SELECT COUNT(*) FROM snapshots WHERE snapshot_id = ?1",
+            [&snapshot.0],
+            |row| row.get::<_, i64>(0),
+        )
+        .map_or(true, |n| n > 0)
+    }
+
     /// Returns the number of cached entries.
     #[must_use]
     pub fn len(&self) -> u64 {
@@ -203,53 +277,40 @@ impl ImageRegistry {
     }
 
     /// Evict the oldest entries (by `accessed_at`) when the registry exceeds
-    /// its capacity. Caller must already hold the connection lock.
+    /// its capacity. Runs inside the caller's transaction.
     ///
-    /// SELECT + DELETE run in one lock scope to prevent TOCTOU races
-    /// between concurrent callers.
-    fn evict_overflow_locked(
-        conn: &Connection,
+    /// A single `DELETE .. RETURNING` statement selects and removes the same
+    /// rows, with `key` as a deterministic tie-break for equal timestamps, so
+    /// the returned set can never diverge from the deleted set.
+    fn evict_overflow_tx(
+        tx: &rusqlite::Transaction<'_>,
         capacity: u64,
-    ) -> Vec<(SnapshotId, Option<String>)> {
-        let count: i64 = conn
-            .query_row("SELECT COUNT(*) FROM snapshots", [], |row| row.get(0))
-            .unwrap_or(0);
-        #[allow(clippy::cast_sign_loss)]
-        let count = count as u64;
+    ) -> Result<Vec<(SnapshotId, Option<String>)>> {
+        let count: i64 = tx.query_row("SELECT COUNT(*) FROM snapshots", [], |row| row.get(0))?;
+        let count = u64::try_from(count).unwrap_or(0);
 
         if count <= capacity {
-            return Vec::new();
+            return Ok(Vec::new());
         }
 
         let overflow = count - capacity;
 
-        let Ok(mut stmt) = conn.prepare(
-            "SELECT snapshot_id, workspace_dir FROM snapshots ORDER BY accessed_at ASC LIMIT ?1",
-        ) else {
-            return Vec::new();
-        };
+        let mut stmt = tx.prepare(
+            "DELETE FROM snapshots WHERE key IN (
+                 SELECT key FROM snapshots ORDER BY accessed_at ASC, key ASC LIMIT ?1
+             ) RETURNING snapshot_id, workspace_dir",
+        )?;
 
-        let evicted: Vec<(SnapshotId, Option<String>)> = stmt
+        let evicted = stmt
             .query_map([overflow], |row| {
                 Ok((
                     SnapshotId(row.get::<_, String>(0)?),
                     row.get::<_, Option<String>>(1)?,
                 ))
-            })
-            .ok()
-            .map(|rows| rows.filter_map(Result::ok).collect())
-            .unwrap_or_default();
-
-        drop(stmt);
-
-        let _deleted = conn.execute(
-            "DELETE FROM snapshots WHERE key IN (
-                 SELECT key FROM snapshots ORDER BY accessed_at ASC LIMIT ?1
-             )",
-            [overflow],
-        );
+            })?
+            .collect::<std::result::Result<Vec<_>, _>>()?;
 
-        evicted
+        Ok(evicted)
     }
 }
 
@@ -271,11 +332,23 @@ mod tests {
         assert!(reg.get("nonexistent").is_none());
     }
 
+    /// Insert a legacy-style row with a non-NULL `workspace_dir`, as written
+    /// by pre-fix versions that persisted cached workspaces.
+    fn insert_legacy_row(reg: &ImageRegistry, key: &str, snap: &str, ws: &str) {
+        let conn = reg.conn.lock().unwrap();
+        conn.execute(
+            "INSERT OR REPLACE INTO snapshots (key, snapshot_id, accessed_at, workspace_dir)
+             VALUES (?1, ?2, ?3, ?4)",
+            rusqlite::params![key, snap, epoch_secs(), ws],
+        )
+        .unwrap();
+    }
+
     #[test]
     fn put_then_get_returns_snapshot() {
         let (reg, _dir) = open_temp(10);
         let snap = SnapshotId("snap-abc".into());
-        let evicted = reg.put("my-key", &snap, None);
+        let evicted = reg.put("my-key", &snap).expect("put");
         assert!(evicted.is_empty());
 
         let got = reg.get("my-key");
@@ -287,12 +360,12 @@ mod tests {
         let (reg, _dir) = open_temp(2);
 
         // Insert a, then b. "a" is older by insertion order.
-        reg.put("a", &SnapshotId("snap-a".into()), None);
+        reg.put("a", &SnapshotId("snap-a".into())).expect("put a");
 
         // Tiny sleep so timestamps differ.
         std::thread::sleep(std::time::Duration::from_secs(1));
 
-        reg.put("b", &SnapshotId("snap-b".into()), None);
+        reg.put("b", &SnapshotId("snap-b".into())).expect("put b");
 
         // Touch "a" so it becomes the most recently accessed.
         std::thread::sleep(std::time::Duration::from_secs(1));
@@ -301,7 +374,7 @@ mod tests {
         // Now insert "c" -- capacity is 2, so one must be evicted.
         // "b" should be evicted since "a" was touched more recently.
         std::thread::sleep(std::time::Duration::from_secs(1));
-        let evicted = reg.put("c", &SnapshotId("snap-c".into()), None);
+        let evicted = reg.put("c", &SnapshotId("snap-c".into())).expect("put c");
 
         assert_eq!(evicted.len(), 1);
         assert_eq!(evicted[0].0, SnapshotId("snap-b".into()));
@@ -316,13 +389,13 @@ mod tests {
     fn eviction_returns_overflow_entries() {
         let (reg, _dir) = open_temp(2);
 
-        reg.put("x", &SnapshotId("snap-x".into()), None);
+        reg.put("x", &SnapshotId("snap-x".into())).expect("put x");
         std::thread::sleep(std::time::Duration::from_secs(1));
-        reg.put("y", &SnapshotId("snap-y".into()), None);
+        reg.put("y", &SnapshotId("snap-y".into())).expect("put y");
         std::thread::sleep(std::time::Duration::from_secs(1));
 
         // This third insert should evict the oldest ("x").
-        let evicted = reg.put("z", &SnapshotId("snap-z".into()), None);
+        let evicted = reg.put("z", &SnapshotId("snap-z".into())).expect("put z");
 
         assert_eq!(evicted.len(), 1);
         assert_eq!(evicted[0].0, SnapshotId("snap-x".into()));
@@ -336,7 +409,8 @@ mod tests {
 
         {
             let reg = ImageRegistry::open(&db_path, 10).expect("open");
-            reg.put("persistent", &SnapshotId("snap-persist".into()), None);
+            reg.put("persistent", &SnapshotId("snap-persist".into()))
+                .expect("put");
             assert_eq!(reg.len(), 1);
             // reg is dropped here, closing the connection.
         }
@@ -351,7 +425,7 @@ mod tests {
     fn invalidate_returns_removed_snapshot() {
         let (reg, _dir) = open_temp(10);
         let snap = SnapshotId("snap-rm".into());
-        reg.put("to-remove", &snap, None);
+        reg.put("to-remove", &snap).expect("put");
 
         let removed = reg.invalidate("to-remove");
         assert_eq!(removed, Some((SnapshotId("snap-rm".into()), None)));
@@ -364,32 +438,92 @@ mod tests {
     }
 
     #[test]
-    fn put_with_workspace_then_get_returns_both() {
+    fn put_writes_null_workspace() {
         let (reg, _dir) = open_temp(10);
-        let snap = SnapshotId("snap-ws".into());
-        reg.put("ws-key", &snap, Some("/cache/workspaces/ws-key"));
+        reg.put("plain-key", &SnapshotId("snap-plain".into()))
+            .expect("put");
 
-        let (got_snap, got_ws) = reg.get_with_workspace("ws-key").unwrap();
-        assert_eq!(got_snap, SnapshotId("snap-ws".into()));
-        assert_eq!(got_ws.as_deref(), Some("/cache/workspaces/ws-key"));
+        let (_, got_ws) = reg.get_with_workspace("plain-key").unwrap();
+        assert!(got_ws.is_none());
     }
 
     #[test]
-    fn put_without_workspace_returns_none_workspace() {
+    fn put_overwrites_legacy_workspace_with_null() {
         let (reg, _dir) = open_temp(10);
-        reg.put("plain-key", &SnapshotId("snap-plain".into()), None);
+        insert_legacy_row(&reg, "k", "snap-old", "/ws/legacy");
 
-        let (_, got_ws) = reg.get_with_workspace("plain-key").unwrap();
-        assert!(got_ws.is_none());
+        reg.put("k", &SnapshotId("snap-new".into())).expect("put");
+
+        let (snap, ws) = reg.get_with_workspace("k").unwrap();
+        assert_eq!(snap, SnapshotId("snap-new".into()));
+        assert!(ws.is_none());
+    }
+
+    #[test]
+    fn invalidate_if_matching_snapshot_deletes_row() {
+        let (reg, _dir) = open_temp(10);
+        let snap = SnapshotId("snap-cas".into());
+        reg.put("cas-key", &snap).expect("put");
+
+        let removed = reg.invalidate_if("cas-key", &snap);
+        assert_eq!(removed, Some(None));
+        assert!(reg.get("cas-key").is_none());
+    }
+
+    #[test]
+    fn invalidate_if_mismatched_snapshot_keeps_row() {
+        let (reg, _dir) = open_temp(10);
+        reg.put("cas-key", &SnapshotId("snap-fresh".into()))
+            .expect("put");
+
+        // A stale observer tries to invalidate with the snapshot it saw
+        // earlier; the fresh row must survive.
+        let removed = reg.invalidate_if("cas-key", &SnapshotId("snap-stale".into()));
+        assert!(removed.is_none());
+        assert_eq!(reg.get("cas-key"), Some(SnapshotId("snap-fresh".into())));
+
+        // Absent key is also a no-op.
+        assert!(
+            reg.invalidate_if("missing", &SnapshotId("whatever".into()))
+                .is_none()
+        );
+    }
+
+    #[test]
+    fn invalidate_if_returns_legacy_workspace_for_cleanup() {
+        let (reg, _dir) = open_temp(10);
+        insert_legacy_row(&reg, "legacy", "snap-legacy", "/ws/legacy");
+
+        let removed = reg.invalidate_if("legacy", &SnapshotId("snap-legacy".into()));
+        assert_eq!(removed, Some(Some("/ws/legacy".into())));
+        assert!(reg.get("legacy").is_none());
+    }
+
+    #[test]
+    fn contains_snapshot_tracks_rows() {
+        let (reg, _dir) = open_temp(10);
+        let snap = SnapshotId("snap-ref".into());
+        assert!(!reg.contains_snapshot(&snap));
+
+        reg.put("k", &snap).expect("put");
+        assert!(reg.contains_snapshot(&snap));
+
+        // The same snapshot under a second key still counts.
+        reg.put("k2", &snap).expect("put");
+        let _ = reg.invalidate("k");
+        assert!(reg.contains_snapshot(&snap));
+
+        let _ = reg.invalidate("k2");
+        assert!(!reg.contains_snapshot(&snap));
     }
 
     #[test]
-    fn eviction_includes_workspace_path() {
+    fn eviction_returns_legacy_workspace_path() {
         let (reg, _dir) = open_temp(1);
-        reg.put("a", &SnapshotId("snap-a".into()), Some("/ws/a"));
+        insert_legacy_row(&reg, "a", "snap-a", "/ws/a");
         std::thread::sleep(std::time::Duration::from_secs(1));
 
-        let evicted = reg.put("b", &SnapshotId("snap-b".into()), Some("/ws/b"));
+        let evicted = reg.put("b", &SnapshotId("snap-b".into())).expect("put b");
         assert_eq!(evicted.len(), 1);
         assert_eq!(evicted[0].0, SnapshotId("snap-a".into()));
         assert_eq!(evicted[0].1.as_deref(), Some("/ws/a"));
diff --git a/crates/hm-vm/src/types.rs b/crates/hm-vm/src/types.rs
index f94c2b16..6174af11 100644
--- a/crates/hm-vm/src/types.rs
+++ b/crates/hm-vm/src/types.rs
@@ -26,7 +26,10 @@ pub struct Action {
     pub working_dir: String,
     pub timeout: Option<Duration>,
     /// Host workspace directory to bind-mount into the VM.
-    /// Skipped on cache hits (snapshot already contains prior state).
+    ///
+    /// Bind mounts are EXCLUDED from snapshots (`docker commit` captures
+    /// system state only), so workspace contents never persist into the
+    /// cache and are never consulted on the cache-hit path.
     pub workspace: Option<WorkspaceMount>,
 }
 
@@ -50,7 +53,6 @@ pub struct ExecutionResult {
     pub exit_code: i32,
     pub snapshot: Option<SnapshotId>,
     pub cached: bool,
-    pub workspace_dir: Option<PathBuf>,
     /// True when the snapshot is ephemeral (not registered in the cache)
     /// and must be cleaned up by the caller after downstream steps finish.
     pub ephemeral_snapshot: bool,
@@ -62,7 +64,6 @@ pub struct VmConfig {
     pub cpus: Option<u32>,
     pub memory_mib: Option<u64>,
     pub disk_size_gb: Option<u64>,
-    pub workspace_cache_dir: Option<PathBuf>,
 }
 
 /// Receives stdout/stderr lines during execution.
diff --git a/crates/hm-vm/src/vm.rs b/crates/hm-vm/src/vm.rs
index 0a1dce1e..50d972b3 100644
--- a/crates/hm-vm/src/vm.rs
+++ b/crates/hm-vm/src/vm.rs
@@ -1,15 +1,34 @@
 //! High-level VM orchestrator.
 
-use std::path::PathBuf;
 use std::sync::Arc;
 
 use anyhow::Result;
+use tokio_util::sync::CancellationToken;
 use tracing::{instrument, warn};
 use uuid::Uuid;
 
 use crate::backend::VmBackend;
 use crate::registry::ImageRegistry;
-use crate::types::{Action, CachingPolicy, ExecutionResult, ImageSource, OutputSink, VmConfig};
+use crate::types::{
+    Action, CachingPolicy, ExecutionResult, ImageSource, OutputSink, SnapshotId, VmConfig,
+};
+
+/// Exit code reported when execution is cut short by cooperative
+/// cancellation (Ctrl-C, sibling failure, per-step timeout). Mirrors the
+/// conventional 128+SIGINT shell encoding; the scheduler maps it to a
+/// `Canceled` step status.
+const CANCELLED_EXIT_CODE: i32 = 130;
+
+/// The [`ExecutionResult`] returned when a step is cancelled before or
+/// during execution: nothing was committed, nothing was cached.
+const fn cancelled_result() -> ExecutionResult {
+    ExecutionResult {
+        exit_code: CANCELLED_EXIT_CODE,
+        snapshot: None,
+        cached: false,
+        ephemeral_snapshot: false,
+    }
+}
 
 /// High-level orchestrator that drives the VM lifecycle.
 ///
@@ -21,6 +40,14 @@ pub struct HmVm {
     backend: Arc<dyn VmBackend>,
     registry: ImageRegistry,
     config: VmConfig,
+    /// Snapshots (plus optional legacy workspace dirs) evicted from the
+    /// registry during this run. Their backend images are NOT removed at
+    /// eviction time: an in-flight step may still hold the evicted tag as
+    /// its `parent_snapshot` and restore from it later (cache-hit outcomes
+    /// propagate only the tag). Removal is deferred to
+    /// [`Self::cleanup_deferred_evictions`], which the local backend calls
+    /// strictly after the whole DAG has drained.
+    deferred_evictions: std::sync::Mutex<Vec<(crate::types::SnapshotId, Option<String>)>>,
 }
 
 impl HmVm {
@@ -30,42 +57,117 @@ impl HmVm {
             backend,
             registry,
             config,
+            deferred_evictions: std::sync::Mutex::new(Vec::new()),
         }
     }
 
     /// Check whether a cached result exists for `key` without executing anything.
     ///
-    /// Returns `Some(result)` on a valid hit (snapshot exists in backend and
-    /// workspace dir, if recorded, still exists on disk). Invalidates stale
-    /// entries as a side-effect. Returns `None` on miss.
+    /// Returns `Some(result)` on a valid hit (snapshot still exists in the
+    /// backend). Invalidates stale entries as a side-effect. Returns `None`
+    /// on miss.
     ///
     /// # Errors
     ///
     /// Returns an error if the backend's `snapshot_exists` check fails.
     pub async fn peek_cache(&self, key: &str) -> Result<Option<ExecutionResult>> {
-        if let Some((snap, ws)) = self.registry.get_with_workspace(key) {
-            let ws_valid = ws.as_ref().is_none_or(|p| std::path::Path::new(p).is_dir());
-            if ws_valid && self.backend.snapshot_exists(&snap).await? {
-                return Ok(Some(ExecutionResult {
-                    exit_code: 0,
-                    snapshot: Some(snap),
-                    cached: true,
-                    workspace_dir: ws.map(PathBuf::from),
-                    ephemeral_snapshot: false,
-                }));
+        self.check_cache(key).await
+    }
+
+    /// Single implementation of the cache-hit check shared by
+    /// [`Self::peek_cache`] and [`Self::execute`].
+    ///
+    /// A hit requires only a registry row plus a live backend snapshot: the
+    /// cache stores system state (what `docker commit` captures) and nothing
+    /// else. A stale row (image removed out-of-band, e.g. `docker rmi`) is
+    /// invalidated with a compare-and-delete so a concurrently re-inserted
+    /// fresh entry is never destroyed; any legacy workspace directory the
+    /// row still referenced is reaped lazily.
+    async fn check_cache(&self, key: &str) -> Result<Option<ExecutionResult>> {
+        let Some((snap, _legacy_ws)) = self.registry.get_with_workspace(key) else {
+            return Ok(None);
+        };
+        if self.backend.snapshot_exists(&snap).await? {
+            return Ok(Some(ExecutionResult {
+                exit_code: 0,
+                snapshot: Some(snap),
+                cached: true,
+                ephemeral_snapshot: false,
+            }));
+        }
+        // The image is already gone, so there is nothing to remove in the
+        // backend -- only the registry row (and any legacy workspace dir).
+        warn!(key, snapshot = %snap.0, "cached snapshot missing from backend; invalidating entry");
+        if let Some(Some(legacy_ws)) = self.registry.invalidate_if(key, &snap) {
+            tokio::task::spawn_blocking(move || std::fs::remove_dir_all(legacy_ws).ok())
+                .await
+                .ok();
+        }
+        Ok(None)
+    }
+
+    /// Remove backend images for registry entries evicted during this run.
+    ///
+    /// Must be called strictly after every step of the run has finished
+    /// (the local backend invokes it once the scheduler's DAG has fully
+    /// drained), so no in-flight step can still restore from an evicted
+    /// tag.
+    ///
+    /// A tag that has been re-registered since its eviction is skipped:
+    /// the same key may have been rebuilt later in this run, or a
+    /// concurrent process may have re-inserted it, and backend re-tagging
+    /// means the tag now names the *fresh* image — removing it would
+    /// destroy a live cache entry. (A narrow cross-process window remains:
+    /// a concurrent run that observed the tag before the eviction — as its
+    /// own cache hit, whose children then carry the tag as their
+    /// `parent_snapshot` — and restores from it after this cleanup gets a
+    /// hard "no such image" restore failure and the step FAILS; the
+    /// `parent_snapshot` restore path never goes through `check_cache`, so
+    /// this does not degrade gracefully into re-execution. The stale row is
+    /// invalidated on that process's next `check_cache` of the key. Closing
+    /// the window entirely needs backend-level coordination between
+    /// processes, which the registry cannot provide.)
+    ///
+    /// Legacy workspace directories riding on evicted rows are always
+    /// reaped. Best-effort throughout: failures are logged, never
+    /// propagated.
+    pub async fn cleanup_deferred_evictions(&self) {
+        let pending = std::mem::take(
+            &mut *self
+                .deferred_evictions
+                .lock()
+                .unwrap_or_else(std::sync::PoisonError::into_inner),
+        );
+        for (snap, legacy_ws) in pending {
+            if self.registry.contains_snapshot(&snap) {
+                tracing::debug!(snapshot = %snap.0, "evicted tag was re-registered; keeping image");
+            } else if let Err(e) = self.backend.remove_snapshot(&snap).await {
+                warn!(snapshot = %snap.0, error = %e, "failed to remove evicted snapshot");
             }
-            if let Some((old_snap, old_ws)) = self.registry.invalidate(key) {
-                if let Err(e) = self.backend.remove_snapshot(&old_snap).await {
-                    warn!(snapshot = %old_snap.0, error = %e, "failed to remove stale snapshot");
-                }
-                if let Some(ws_path) = old_ws {
-                    tokio::task::spawn_blocking(move || std::fs::remove_dir_all(ws_path).ok())
-                        .await
-                        .ok();
-                }
+            if let Some(ws_path) = legacy_ws {
+                tokio::task::spawn_blocking(move || std::fs::remove_dir_all(ws_path).ok())
+                    .await
+                    .ok();
             }
         }
-        Ok(None)
+    }
+
+    /// Remove `snapshot` from the backend unless the registry currently
+    /// maps some key to it.
+    ///
+    /// Guarded twin of a bare `VmBackend::remove_snapshot` for reaping
+    /// run-scoped (ephemeral or demoted-to-ephemeral) snapshots: if a
+    /// concurrent run re-registered the same tag after this run marked it
+    /// ephemeral — e.g. a `harmont-cache/*` tag demoted by a failed
+    /// `registry.put` that another process has since successfully cached —
+    /// removing the image would destroy a live cache entry. Best-effort:
+    /// failures are logged, never propagated.
+    pub async fn remove_snapshot_unless_registered(&self, snapshot: &SnapshotId) {
+        if self.registry.contains_snapshot(snapshot) {
+            tracing::debug!(snapshot = %snapshot.0, "snapshot was re-registered; keeping image");
+        } else if let Err(e) = self.backend.remove_snapshot(snapshot).await {
+            warn!(snapshot = %snapshot.0, error = %e, "failed to remove ephemeral snapshot");
+        }
     }
 
     /// Execute an [`Action`] inside a VM, obeying the given [`CachingPolicy`].
@@ -75,130 +177,204 @@ impl HmVm {
     /// When the policy is [`CachingPolicy::Cache`] the registry is consulted
     /// first. A cache hit that still exists in the backend returns immediately.
     /// On a successful (exit-code 0) execution the resulting snapshot is stored
-    /// in the registry; evicted entries are cleaned up in the backend.
+    /// in the registry; entries evicted by the insert are queued for removal
+    /// via [`Self::cleanup_deferred_evictions`] once the run has drained.
+    ///
+    /// # Cancellation
+    ///
+    /// Cancellation is cooperative via `cancel`: when the token fires, the
+    /// in-flight command is abandoned and the VM is destroyed — and that
+    /// teardown (including the bind-mount ownership reclaim on native Linux
+    /// Docker) is *awaited* before this future resolves. Callers must
+    /// therefore never `select!`-drop this future on cancellation; awaiting
+    /// it guarantees the workspace directory is safe to read or remove the
+    /// moment it returns. A cancelled execution yields exit code 130 with
+    /// no snapshot.
     ///
     /// # Errors
     ///
     /// Returns an error if the backend fails to create, restore, or
     /// execute. Best-effort cleanup is performed even on failure paths.
-    #[instrument(skip(self, action, sink), fields(cmd = %action.cmd))]
+    #[instrument(skip(self, action, sink, cancel), fields(cmd = %action.cmd))]
     pub async fn execute(
         &self,
         action: Action,
         policy: CachingPolicy,
         sink: &dyn OutputSink,
+        cancel: &CancellationToken,
     ) -> Result<ExecutionResult> {
-        // 1. Cache check
+        // 1. Cache check. Callers typically `peek_cache` first; this second
+        // check is deliberately retained -- it is cheap (one SQLite read +
+        // one image list) and lets a concurrent fill between peek and
+        // execute still count as a hit, at worst wasting one COW copy.
         if let CachingPolicy::Cache { ref key } = policy
-            && let Some((snap, ws)) = self.registry.get_with_workspace(key)
+            && let Some(result) = self.check_cache(key).await?
         {
-            let ws_valid = ws.as_ref().is_none_or(|p| std::path::Path::new(p).is_dir());
-            if ws_valid && self.backend.snapshot_exists(&snap).await? {
-                return Ok(ExecutionResult {
-                    exit_code: 0,
-                    snapshot: Some(snap),
-                    cached: true,
-                    workspace_dir: ws.map(PathBuf::from),
-                    ephemeral_snapshot: false,
-                });
-            }
-            if let Some((old_snap, old_ws)) = self.registry.invalidate(key) {
-                if let Err(e) = self.backend.remove_snapshot(&old_snap).await {
-                    warn!(snapshot = %old_snap.0, error = %e, "failed to remove stale snapshot");
+            return Ok(result);
+        }
+
+        // 2. Create or restore the VM, bailing cooperatively if the run is
+        // cancelled mid-boot (image pulls can take minutes). No container
+        // handle exists yet on the cancel branch, so there is nothing to
+        // tear down, and the workspace still holds only host-user-owned
+        // files, so the caller may delete it immediately. (`biased` makes
+        // an already-cancelled token win deterministically. Dropping a
+        // creation request mid-roundtrip can, in a sub-millisecond window,
+        // leave a never-started container behind on the daemon; it holds no
+        // processes and writes nothing, so it cannot affect the workspace.)
+        let create_fut = async {
+            match &action.source {
+                ImageSource::Image(image) => {
+                    self.backend
+                        .create(image, &self.config, action.workspace.as_ref())
+                        .await
                 }
-                if let Some(ws_path) = old_ws {
-                    tokio::task::spawn_blocking(move || std::fs::remove_dir_all(ws_path).ok())
+                ImageSource::Snapshot(snap) => {
+                    self.backend
+                        .restore(snap, &self.config, action.workspace.as_ref())
                         .await
-                        .ok();
                 }
             }
-        }
-
-        // 2. Create or restore VM
-        let mut vm = match &action.source {
-            ImageSource::Image(image) => {
-                self.backend
-                    .create(image, &self.config, action.workspace.as_ref())
-                    .await?
-            }
-            ImageSource::Snapshot(snap) => {
-                self.backend
-                    .restore(snap, &self.config, action.workspace.as_ref())
-                    .await?
-            }
+        };
+        let mut vm = tokio::select! {
+            biased;
+            () = cancel.cancelled() => return Ok(cancelled_result()),
+            vm = create_fut => vm?,
         };
 
-        let result = self.run_in_vm(&mut *vm, &action, &policy, sink).await;
+        let result = self
+            .run_in_vm(&mut *vm, &action, &policy, sink, cancel)
+            .await;
 
-        // Always destroy the VM, even on error.
+        // Always destroy the VM -- on success, error AND cancellation --
+        // and await it: teardown reclaims bind-mount ownership and must
+        // happen-before the caller touches or removes the workspace dir.
         vm.destroy().await.ok();
 
         result
     }
 
-    /// Inner lifecycle: exec, snapshot, persist workspace. Separated so the
-    /// caller can guarantee `vm.destroy()` runs regardless of outcome.
+    /// Best-effort sweep of aged snapshot images that neither a registry
+    /// row nor this run's deferred-eviction queue references.
+    ///
+    /// Intended for *run-scoped* tag namespaces (`harmont-ephemeral`, the
+    /// legacy `ephemeral`), whose images are created and removed within a
+    /// single run: an aged tag there is residue of a dead run. It must NOT
+    /// be pointed at the shared cache namespace (`harmont-cache/*`) by
+    /// anything that runs automatically: the registry DB is per-user while
+    /// the backend (Docker daemon) can be shared, so a missing row does not
+    /// prove an image is orphaned — another user's registry may still
+    /// reference it. The `older_than` floor protects a concurrent run's
+    /// freshly committed images that have not been registered yet.
+    ///
+    /// The keep predicate protects, in addition to live registry rows, any
+    /// tag queued in [`Self::deferred_evictions`]: an LRU-evicted row is
+    /// gone from `SQLite` immediately, but in-flight steps of this run may
+    /// still restore from the evicted tag until the DAG drains, so it must
+    /// survive any sweep that runs concurrently with the run.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error only if the backend cannot be queried at all;
+    /// per-image removal failures are logged and skipped.
+    pub async fn gc_orphaned_snapshots(
+        &self,
+        reference: &str,
+        older_than: std::time::Duration,
+    ) -> Result<u64> {
+        self.backend
+            .gc_snapshots(reference, older_than, &|tag| {
+                self.registry.contains_snapshot(&SnapshotId(tag.to_owned()))
+                    || self.is_eviction_deferred(tag)
+            })
+            .await
+    }
+
+    /// Whether `tag` is queued for deferred end-of-run removal — i.e. an
+    /// in-flight step of this run may still restore from it even though its
+    /// registry row is already gone.
+    fn is_eviction_deferred(&self, tag: &str) -> bool {
+        self.deferred_evictions
+            .lock()
+            .unwrap_or_else(std::sync::PoisonError::into_inner)
+            .iter()
+            .any(|(snap, _)| snap.0 == tag)
+    }
+
+    /// Inner lifecycle: exec, snapshot, register. Separated so the caller
+    /// can guarantee `vm.destroy()` runs regardless of outcome.
+    ///
+    /// The workspace bind mount is never persisted: snapshots capture system
+    /// state only, and workspace state is strictly run-scoped (owned by the
+    /// caller's temp directories).
     async fn run_in_vm(
         &self,
         vm: &mut dyn crate::backend::Vm,
         action: &Action,
         policy: &CachingPolicy,
         sink: &dyn OutputSink,
+        cancel: &CancellationToken,
     ) -> Result<ExecutionResult> {
-        let mut workspace_dir: Option<PathBuf> = None;
-
-        // 3. Execute command (with optional timeout)
-        let exec_fut = vm.exec(&action.cmd, &action.env, &action.working_dir, sink);
-        let exit_code = if let Some(timeout) = action.timeout {
-            match tokio::time::timeout(timeout, exec_fut).await {
-                Ok(result) => result?,
-                Err(_) => anyhow::bail!("command timed out after {timeout:?}"),
+        // 3. Execute command (with optional timeout), bailing cooperatively
+        // on cancellation. Dropping the exec future abandons only the
+        // output stream -- the in-container process may keep running until
+        // the caller's `vm.destroy()` quiesces the container (the backend
+        // tracks the interrupted exec and stops every process before
+        // reclaiming workspace ownership).
+        let exec_inner = vm.exec(&action.cmd, &action.env, &action.working_dir, sink);
+        let exec_fut = async {
+            if let Some(timeout) = action.timeout {
+                match tokio::time::timeout(timeout, exec_inner).await {
+                    Ok(result) => result,
+                    Err(_) => anyhow::bail!("command timed out after {timeout:?}"),
+                }
+            } else {
+                exec_inner.await
             }
-        } else {
-            exec_fut.await?
+        };
+        let exit_code = tokio::select! {
+            biased;
+            () = cancel.cancelled() => return Ok(cancelled_result()),
+            exit = exec_fut => exit?,
         };
 
         // 4. Snapshot and cache on success
         let (snapshot, ephemeral) = if exit_code == 0 {
-            let (label, is_ephemeral) = match &policy {
+            let (label, mut is_ephemeral) = match &policy {
                 CachingPolicy::Cache { key } => (key.clone(), false),
-                CachingPolicy::None => (format!("ephemeral-{}", Uuid::new_v4()), true),
+                CachingPolicy::None => (format!("harmont-ephemeral:{}", Uuid::new_v4()), true),
             };
             let snap = vm.snapshot(&label).await?;
 
             if let CachingPolicy::Cache { key } = &policy {
-                // Persist workspace to cache directory.
-                if let (Some(ws), Some(cache_dir)) = (
-                    action.workspace.as_ref(),
-                    self.config.workspace_cache_dir.as_ref(),
-                ) {
-                    let ws_cache = cache_dir.join(key.as_str());
-                    let host_path = ws.host_path.clone();
-                    let ws_cache_clone = ws_cache.clone();
-                    tokio::task::spawn_blocking(move || -> anyhow::Result<()> {
-                        if ws_cache_clone.exists() {
-                            std::fs::remove_dir_all(&ws_cache_clone).ok();
+                match self.registry.put(key, &snap) {
+                    Ok(evicted) => {
+                        // Do NOT remove evicted images here: a cache-hit
+                        // outcome propagates only the snapshot tag to its
+                        // children, and a child may restore from that tag
+                        // long after a sibling's put evicts the row (the
+                        // child can be parked on the parallelism semaphore).
+                        // Queue the entries; the local backend drains them
+                        // via `cleanup_deferred_evictions` strictly after
+                        // the whole DAG has finished.
+                        if !evicted.is_empty() {
+                            self.deferred_evictions
+                                .lock()
+                                .unwrap_or_else(std::sync::PoisonError::into_inner)
+                                .extend(evicted);
                         }
-                        std::fs::create_dir_all(&ws_cache_clone)?;
-                        crate::workspace::cow_copy(&host_path, &ws_cache_clone)?;
-                        Ok(())
-                    })
-                    .await??;
-                    workspace_dir = Some(ws_cache);
-                }
-
-                let ws_str = workspace_dir.as_ref().map(|ws| ws.display().to_string());
-                let evicted = self.registry.put(key, &snap, ws_str.as_deref());
-                for (old_snap, old_ws) in evicted {
-                    if let Err(e) = self.backend.remove_snapshot(&old_snap).await {
-                        warn!(snapshot = %old_snap.0, error = %e, "failed to remove evicted snapshot");
                     }
-                    if let Some(ws_path) = old_ws {
-                        tokio::task::spawn_blocking(move || {
-                            std::fs::remove_dir_all(ws_path).ok();
-                        })
-                        .await
-                        .ok();
+                    Err(e) => {
+                        // The snapshot exists but could not be registered.
+                        // Demote it to ephemeral so the scheduler removes it
+                        // after the run (same-run children still restore from
+                        // it for lineage) instead of orphaning it forever.
+                        warn!(
+                            key,
+                            error = %e,
+                            "failed to record snapshot in registry; demoting to ephemeral"
+                        );
+                        is_ephemeral = true;
                     }
                 }
             }
@@ -212,7 +388,6 @@ impl HmVm {
             exit_code,
             snapshot,
             cached: false,
-            workspace_dir: if exit_code == 0 { workspace_dir } else { None },
             ephemeral_snapshot: ephemeral,
         })
     }
@@ -240,6 +415,11 @@ mod tests {
         exit_code: i32,
         /// Whether `snapshot_exists` should return true.
         snapshot_exists: bool,
+        /// Artificial latency for `MockVm::exec` (lets cancellation tests
+        /// fire mid-execution).
+        exec_delay: Option<std::time::Duration>,
+        /// Aged snapshot tags the mock GC considers removable candidates.
+        gc_candidates: Vec<String>,
     }
 
     impl MockBackend {
@@ -248,6 +428,8 @@ mod tests {
                 calls: Arc::new(Mutex::new(Vec::new())),
                 exit_code,
                 snapshot_exists,
+                exec_delay: None,
+                gc_candidates: Vec::new(),
             }
         }
     }
@@ -266,6 +448,7 @@ mod tests {
             Ok(Box::new(MockVm {
                 calls: Arc::clone(&self.calls),
                 exit_code: self.exit_code,
+                exec_delay: self.exec_delay,
             }))
         }
 
@@ -281,6 +464,7 @@ mod tests {
             Ok(Box::new(MockVm {
                 calls: Arc::clone(&self.calls),
                 exit_code: self.exit_code,
+                exec_delay: self.exec_delay,
             }))
         }
 
@@ -299,11 +483,31 @@ mod tests {
             );
             Ok(())
         }
+
+        async fn gc_snapshots(
+            &self,
+            _reference: &str,
+            _older_than: std::time::Duration,
+            keep: &(dyn for<'a> Fn(&'a str) -> bool + Send + Sync),
+        ) -> Result<u64> {
+            let mut removed = 0;
+            for tag in &self.gc_candidates {
+                if keep(tag) {
+                    continue;
+                }
+                self.calls
+                    .lock()
+                    .map_or_else(|_| {}, |mut c| c.push(format!("gc_remove:{tag}")));
+                removed += 1;
+            }
+            Ok(removed)
+        }
     }
 
     struct MockVm {
         calls: Arc<Mutex<Vec<String>>>,
         exit_code: i32,
+        exec_delay: Option<std::time::Duration>,
     }
 
     #[async_trait]
@@ -318,6 +522,9 @@ mod tests {
             self.calls
                 .lock()
                 .map_or_else(|_| {}, |mut c| c.push(format!("exec:{cmd}")));
+            if let Some(delay) = self.exec_delay {
+                tokio::time::sleep(delay).await;
+            }
             Ok(self.exit_code)
         }
 
@@ -382,6 +589,7 @@ mod tests {
                     key: "step-1".into(),
                 },
                 &NullSink,
+                &CancellationToken::new(),
             )
             .await
             .expect("execute should succeed");
@@ -403,7 +611,9 @@ mod tests {
         let (registry, _dir) = open_temp_registry(10);
 
         // Pre-populate the registry.
-        registry.put("step-1", &SnapshotId("cached-snap".into()), None);
+        registry
+            .put("step-1", &SnapshotId("cached-snap".into()))
+            .expect("put");
 
         let hm = HmVm::new(Arc::new(backend.clone()), registry, VmConfig::default());
 
@@ -414,6 +624,7 @@ mod tests {
                     key: "step-1".into(),
                 },
                 &NullSink,
+                &CancellationToken::new(),
             )
             .await
             .expect("execute should succeed");
@@ -436,7 +647,12 @@ mod tests {
         let hm = HmVm::new(Arc::new(backend.clone()), registry, VmConfig::default());
 
         let result = hm
-            .execute(make_action(), CachingPolicy::None, &NullSink)
+            .execute(
+                make_action(),
+                CachingPolicy::None,
+                &NullSink,
+                &CancellationToken::new(),
+            )
             .await
             .expect("execute should succeed");
 
@@ -464,6 +680,7 @@ mod tests {
                     key: "step-fail".into(),
                 },
                 &NullSink,
+                &CancellationToken::new(),
             )
             .await
             .expect("execute should succeed");
@@ -481,6 +698,167 @@ mod tests {
         assert!(hm.registry.is_empty());
     }
 
+    #[tokio::test]
+    async fn stale_entry_is_invalidated_and_step_reexecutes() {
+        // Registry has a row, but the backend image is gone (e.g. an
+        // out-of-band `docker rmi`). The step must re-execute and the stale
+        // row must be replaced -- with no remove_snapshot call, since the
+        // image is already absent.
+        let backend = MockBackend::new(0, false);
+        let (registry, _dir) = open_temp_registry(10);
+        registry
+            .put("step-1", &SnapshotId("gone-snap".into()))
+            .expect("put");
+
+        let hm = HmVm::new(Arc::new(backend.clone()), registry, VmConfig::default());
+
+        let result = hm
+            .execute(
+                make_action(),
+                CachingPolicy::Cache {
+                    key: "step-1".into(),
+                },
+                &NullSink,
+                &CancellationToken::new(),
+            )
+            .await
+            .expect("execute should succeed");
+
+        assert!(!result.cached);
+        assert_eq!(result.exit_code, 0);
+
+        let log = calls(&backend);
+        assert!(log.iter().any(|c| c.starts_with("exec:")));
+        assert!(!log.iter().any(|c| c.starts_with("remove_snapshot:")));
+
+        // The stale row was replaced by the fresh snapshot.
+        assert_eq!(
+            hm.registry.get("step-1"),
+            Some(SnapshotId("snap-step-1".into()))
+        );
+    }
+
+    #[tokio::test]
+    async fn eviction_defers_snapshot_removal_until_cleanup() {
+        // Capacity 1: caching a second key evicts the first. The evicted
+        // image must NOT be removed inline -- an in-flight step may still
+        // restore from the evicted tag -- only when the run has drained and
+        // `cleanup_deferred_evictions` is invoked.
+        let backend = MockBackend::new(0, false);
+        let (registry, _dir) = open_temp_registry(1);
+        let hm = HmVm::new(Arc::new(backend.clone()), registry, VmConfig::default());
+
+        for key in ["step-a", "step-b"] {
+            hm.execute(
+                make_action(),
+                CachingPolicy::Cache { key: key.into() },
+                &NullSink,
+                &CancellationToken::new(),
+            )
+            .await
+            .expect("execute should succeed");
+        }
+
+        // No inline removal while steps could still be in flight.
+        let log = calls(&backend);
+        assert!(!log.iter().any(|c| c.starts_with("remove_snapshot:")));
+        assert_eq!(hm.registry.len(), 1);
+        assert!(hm.registry.get("step-b").is_some());
+
+        // End of run: the evicted snapshot is removed now.
+        // "step-a" ties with "step-b" on accessed_at; key ASC tie-break
+        // makes the eviction deterministic.
+        hm.cleanup_deferred_evictions().await;
+        let log = calls(&backend);
+        assert!(log.iter().any(|c| c == "remove_snapshot:snap-step-a"));
+        assert!(!log.iter().any(|c| c == "remove_snapshot:snap-step-b"));
+    }
+
+    #[tokio::test]
+    async fn deferred_eviction_skips_reregistered_snapshot() {
+        // An evicted tag that is re-inserted before cleanup (same key
+        // rebuilt later this run, or a concurrent process) now names the
+        // fresh image; removing it would destroy a live cache entry.
+        let backend = MockBackend::new(0, false);
+        let (registry, _dir) = open_temp_registry(1);
+        let hm = HmVm::new(Arc::new(backend.clone()), registry, VmConfig::default());
+
+        for key in ["step-a", "step-b"] {
+            hm.execute(
+                make_action(),
+                CachingPolicy::Cache { key: key.into() },
+                &NullSink,
+                &CancellationToken::new(),
+            )
+            .await
+            .expect("execute should succeed");
+        }
+        // "step-a" was evicted and its removal deferred. Re-register the
+        // same snapshot tag (as a concurrent process or a later rebuild of
+        // the same key would after Docker re-tagging). The sleep gives the
+        // re-inserted row a strictly newer accessed_at so the capacity-1
+        // eviction removes "step-b", not the fresh row (timestamps have
+        // one-second granularity).
+        std::thread::sleep(std::time::Duration::from_secs(1));
+        hm.registry
+            .put("step-a", &SnapshotId("snap-step-a".into()))
+            .expect("put");
+
+        hm.cleanup_deferred_evictions().await;
+
+        // The re-registered tag survives cleanup: it now names a live
+        // cache entry's image.
+        let log = calls(&backend);
+        assert!(!log.iter().any(|c| c == "remove_snapshot:snap-step-a"));
+        assert_eq!(
+            hm.registry.get("step-a"),
+            Some(SnapshotId("snap-step-a".into()))
+        );
+    }
+
+    #[tokio::test]
+    async fn remove_unless_registered_guards_live_entries() {
+        // A demoted-to-ephemeral `harmont-cache/*` tag that a concurrent
+        // run re-registered must survive the scheduler's ephemeral
+        // cleanup; an unregistered tag must be removed.
+        let backend = MockBackend::new(0, false);
+        let (registry, _dir) = open_temp_registry(10);
+        let hm = HmVm::new(Arc::new(backend.clone()), registry, VmConfig::default());
+
+        hm.registry
+            .put("step-a", &SnapshotId("snap-live".into()))
+            .expect("put");
+
+        hm.remove_snapshot_unless_registered(&SnapshotId("snap-live".into()))
+            .await;
+        hm.remove_snapshot_unless_registered(&SnapshotId("snap-orphan".into()))
+            .await;
+
+        let log = calls(&backend);
+        assert!(!log.iter().any(|c| c == "remove_snapshot:snap-live"));
+        assert!(log.iter().any(|c| c == "remove_snapshot:snap-orphan"));
+    }
+
+    #[tokio::test]
+    async fn cleanup_with_no_evictions_is_a_noop() {
+        let backend = MockBackend::new(0, false);
+        let (registry, _dir) = open_temp_registry(10);
+        let hm = HmVm::new(Arc::new(backend.clone()), registry, VmConfig::default());
+
+        hm.execute(
+            make_action(),
+            CachingPolicy::Cache { key: "only".into() },
+            &NullSink,
+            &CancellationToken::new(),
+        )
+        .await
+        .expect("execute should succeed");
+        hm.cleanup_deferred_evictions().await;
+
+        let log = calls(&backend);
+        assert!(!log.iter().any(|c| c.starts_with("remove_snapshot:")));
+    }
+
     #[tokio::test]
     async fn cache_miss_from_snapshot_passes_workspace() {
         let backend = MockBackend::new(0, false);
@@ -498,6 +876,7 @@ mod tests {
                     key: "child-step".into(),
                 },
                 &NullSink,
+                &CancellationToken::new(),
             )
             .await
             .expect("execute should succeed");
@@ -511,4 +890,154 @@ mod tests {
         assert!(log.iter().any(|c| c.starts_with("exec:")));
         assert!(log.iter().any(|c| c.starts_with("snapshot:")));
     }
+
+    #[tokio::test]
+    async fn cancellation_mid_exec_destroys_vm_before_returning() {
+        // Cancellation must be cooperative: execute() resolves only after
+        // the VM has been destroyed (so the bind-mount ownership reclaim
+        // has run) and reports exit 130 with nothing snapshotted or cached.
+        let mut backend = MockBackend::new(0, false);
+        backend.exec_delay = Some(std::time::Duration::from_secs(30));
+        let (registry, _dir) = open_temp_registry(10);
+        let hm = HmVm::new(Arc::new(backend.clone()), registry, VmConfig::default());
+
+        let cancel = CancellationToken::new();
+        let trigger = cancel.clone();
+        tokio::spawn(async move {
+            tokio::time::sleep(std::time::Duration::from_millis(50)).await;
+            trigger.cancel();
+        });
+
+        let result = hm
+            .execute(
+                make_action(),
+                CachingPolicy::Cache {
+                    key: "step-1".into(),
+                },
+                &NullSink,
+                &cancel,
+            )
+            .await
+            .expect("execute should succeed");
+
+        assert_eq!(result.exit_code, 130);
+        assert!(!result.cached);
+        assert!(result.snapshot.is_none());
+        assert!(!result.ephemeral_snapshot);
+
+        let log = calls(&backend);
+        // The exec started, was abandoned, and the VM was still destroyed
+        // before execute() returned.
+        assert!(log.iter().any(|c| c.starts_with("exec:")));
+        assert!(log.iter().any(|c| c == "destroy"));
+        assert!(!log.iter().any(|c| c.starts_with("snapshot:")));
+        assert!(hm.registry.is_empty());
+    }
+
+    #[tokio::test]
+    async fn pre_cancelled_token_skips_vm_creation() {
+        let backend = MockBackend::new(0, false);
+        let (registry, _dir) = open_temp_registry(10);
+        let hm = HmVm::new(Arc::new(backend.clone()), registry, VmConfig::default());
+
+        let cancel = CancellationToken::new();
+        cancel.cancel();
+
+        let result = hm
+            .execute(
+                make_action(),
+                CachingPolicy::Cache {
+                    key: "step-1".into(),
+                },
+                &NullSink,
+                &cancel,
+            )
+            .await
+            .expect("execute should succeed");
+
+        assert_eq!(result.exit_code, 130);
+        // The biased select bails before any container is created.
+        let log = calls(&backend);
+        assert!(!log.iter().any(|c| c.starts_with("create:")));
+        assert!(!log.iter().any(|c| c.starts_with("exec:")));
+    }
+
+    #[tokio::test]
+    async fn gc_keeps_tags_queued_for_deferred_eviction() {
+        // An LRU-evicted row is gone from SQLite immediately, but in-flight
+        // steps of this run may still restore from the evicted tag until
+        // the DAG drains. A GC sweep running concurrently with the run must
+        // therefore keep any tag in the deferred-eviction queue even though
+        // no registry row references it any more.
+        let mut backend = MockBackend::new(0, false);
+        backend.gc_candidates = vec!["snap-step-a".into(), "snap-orphan".into()];
+        let (registry, _dir) = open_temp_registry(1);
+        let hm = HmVm::new(Arc::new(backend.clone()), registry, VmConfig::default());
+
+        // Capacity 1: caching "step-b" evicts "step-a"'s row and queues
+        // "snap-step-a" for deferred removal.
+        for key in ["step-a", "step-b"] {
+            hm.execute(
+                make_action(),
+                CachingPolicy::Cache { key: key.into() },
+                &NullSink,
+                &CancellationToken::new(),
+            )
+            .await
+            .expect("execute should succeed");
+        }
+        assert!(hm.registry.get("step-a").is_none());
+
+        let removed = hm
+            .gc_orphaned_snapshots("harmont-cache/*", std::time::Duration::from_secs(0))
+            .await
+            .expect("gc should succeed");
+
+        // Only the true orphan goes; the deferred-eviction tag survives.
+        assert_eq!(removed, 1);
+        let log = calls(&backend);
+        assert!(log.iter().any(|c| c == "gc_remove:snap-orphan"));
+        assert!(!log.iter().any(|c| c == "gc_remove:snap-step-a"));
+
+        // End-of-run cleanup still removes it once the DAG has drained.
+        hm.cleanup_deferred_evictions().await;
+        let log = calls(&backend);
+        assert!(log.iter().any(|c| c == "remove_snapshot:snap-step-a"));
+    }
+
+    #[tokio::test]
+    async fn gc_orphans_removes_unregistered_and_keeps_registered() {
+        // The registry is the source of truth: an aged tag with a live row
+        // must survive the sweep; an aged tag without one is an orphan and
+        // goes.
+        let mut backend = MockBackend::new(0, false);
+        backend.gc_candidates = vec![
+            "harmont-cache/live:aaaabbbbccccdddd".into(),
+            "harmont-cache/orphan:eeeeffff00001111".into(),
+        ];
+        let (registry, _dir) = open_temp_registry(10);
+        registry
+            .put(
+                "harmont-cache/live:aaaabbbbccccdddd",
+                &SnapshotId("harmont-cache/live:aaaabbbbccccdddd".into()),
+            )
+            .expect("put");
+        let hm = HmVm::new(Arc::new(backend.clone()), registry, VmConfig::default());
+
+        let removed = hm
+            .gc_orphaned_snapshots("harmont-cache/*", std::time::Duration::from_secs(0))
+            .await
+            .expect("gc should succeed");
+
+        assert_eq!(removed, 1);
+        let log = calls(&backend);
+        assert!(
+            log.iter()
+                .any(|c| c == "gc_remove:harmont-cache/orphan:eeeeffff00001111")
+        );
+        assert!(
+            !log.iter()
+                .any(|c| c == "gc_remove:harmont-cache/live:aaaabbbbccccdddd")
+        );
+    }
 }
diff --git a/crates/hm-vm/src/workspace.rs b/crates/hm-vm/src/workspace.rs
index 6ad11d73..8f69f4ce 100644
--- a/crates/hm-vm/src/workspace.rs
+++ b/crates/hm-vm/src/workspace.rs
@@ -7,69 +7,45 @@ use anyhow::{Context, Result, ensure};
 
 /// Create a copy-on-write clone of `src` contents into `dst`.
 ///
-/// macOS APFS: `cp -cR` with full-copy fallback for cross-volume.
-/// Linux: `cp --reflink=auto -a` (COW on btrfs/XFS, full copy on ext4).
+/// macOS: `cp -cpR` (APFS clonefile; `cp -c` itself falls back to
+/// `copyfile(2)` when cloning is unsupported, so no manual retry is
+/// needed). Linux: `cp --reflink=auto -a` (COW on btrfs/XFS, full copy
+/// on ext4). Symlinks are copied as symlinks; mode and mtime are
+/// preserved (incremental build tools depend on mtimes).
 ///
 /// # Errors
 ///
-/// Returns an error if `cp` cannot be spawned or exits with a non-zero status.
+/// Returns an error if `cp` cannot be spawned or exits with a non-zero
+/// status; `cp`'s stderr is captured into the error rather than leaked
+/// to the terminal.
 pub fn cow_copy(src: &Path, dst: &Path) -> Result<()> {
     let src_dot = format!("{}/.", src.display());
 
-    let status = if cfg!(target_os = "macos") {
-        let st = Command::new("cp")
-            .args(["-cpR", &src_dot])
-            .arg(dst)
-            .status();
-        match st {
-            Ok(s) if s.success() => s,
-            _ => {
-                // APFS clonefile failed (cross-volume or non-APFS). Wipe any
-                // partial state left by the failed attempt before falling back
-                // to a regular copy.
-                if dst.exists() {
-                    for entry in std::fs::read_dir(dst)
-                        .with_context(|| format!("cleaning partial COW dst {}", dst.display()))?
-                    {
-                        let p = entry?.path();
-                        if p.is_dir() {
-                            std::fs::remove_dir_all(&p).ok();
-                        } else {
-                            std::fs::remove_file(&p).ok();
-                        }
-                    }
-                }
-                Command::new("cp")
-                    .args(["-pR", &src_dot])
-                    .arg(dst)
-                    .status()
-                    .with_context(|| {
-                        format!(
-                            "spawning cp fallback: {} -> {}",
-                            src.display(),
-                            dst.display()
-                        )
-                    })?
-            }
-        }
+    let mut cmd = Command::new("cp");
+    if cfg!(target_os = "macos") {
+        cmd.args(["-cpR", &src_dot]);
     } else {
-        Command::new("cp")
-            .args(["--reflink=auto", "-a", &src_dot])
-            .arg(dst)
-            .status()
-            .with_context(|| format!("spawning cp: {} -> {}", src.display(), dst.display()))?
-    };
+        cmd.args(["--reflink=auto", "-a", &src_dot]);
+    }
+    cmd.arg(dst);
+
+    let output = cmd
+        .output()
+        .with_context(|| format!("spawning cp: {} -> {}", src.display(), dst.display()))?;
 
     ensure!(
-        status.success(),
-        "cp {} -> {} exited with {status}",
+        output.status.success(),
+        "cp {} -> {} exited with {}: {}",
         src.display(),
-        dst.display()
+        dst.display(),
+        output.status,
+        String::from_utf8_lossy(&output.stderr)
     );
     Ok(())
 }
 
 #[cfg(test)]
+#[allow(clippy::unwrap_used, clippy::expect_used)]
 mod tests {
     use super::*;
     use std::fs;
diff --git a/crates/hm/tests/deep_cache_chain_workspace.rs b/crates/hm/tests/deep_cache_chain_workspace.rs
index 2e48468d..5d146d27 100644
--- a/crates/hm/tests/deep_cache_chain_workspace.rs
+++ b/crates/hm/tests/deep_cache_chain_workspace.rs
@@ -22,11 +22,12 @@ fn write_pipeline(dir: &Path, marker: &str) {
         r#"
 import harmont as hm
 
-def build():
-    a = hm.scratch().run("echo step-a", label="a", cache=hm.forever())
-    b = a.run("echo step-b", label="b", cache=hm.forever())
-    c = b.run("cat /workspace/marker.txt", label="c")
-    return hm.pipeline(c, default_image="alpine:3.20")
+
+@hm.pipeline("deep-cache-chain", default_image="alpine:3.20")
+def build() -> hm.Step:
+    a = hm.scratch().sh("echo step-a", label="a", cache=hm.forever())
+    b = a.sh("echo step-b", label="b", cache=hm.forever())
+    return b.sh("cat /workspace/marker.txt", label="c")
 "#,
     )
     .expect("pipeline.py");
@@ -35,7 +36,7 @@ def build():
 fn run_hm(repo: &Path) -> String {
     let bin = env!("CARGO_BIN_EXE_hm");
     let out = Command::new(bin)
-        .args(["run"])
+        .args(["run", "--format", "human", "--logs"])
         .current_dir(repo)
         .output()
         .expect("spawn hm");
@@ -68,4 +69,14 @@ fn deep_chain_child_sees_fresh_workspace() {
         !out2.contains("deep-v1"),
         "run 2 leaked stale workspace through deep cache chain:\n{out2}"
     );
+    // Freshness must come from rebasing onto current source, not from
+    // silently re-executing the cached ancestors.
+    assert!(
+        out2.contains("[a] cache hit"),
+        "run 2 expected step a to be a cache hit:\n{out2}"
+    );
+    assert!(
+        out2.contains("[b] cache hit"),
+        "run 2 expected step b to be a cache hit:\n{out2}"
+    );
 }
diff --git a/crates/hm/tests/local_fork_cache.rs b/crates/hm/tests/local_fork_cache.rs
index 411a770d..6434f025 100644
--- a/crates/hm/tests/local_fork_cache.rs
+++ b/crates/hm/tests/local_fork_cache.rs
@@ -24,17 +24,18 @@ fn write_pipeline(dir: &Path, marker_contents: &str) {
         r#"
 import harmont as hm
 
-def build():
-    base = hm.scratch().run(
+
+@hm.pipeline("fork-cache", default_image="alpine:3.20")
+def build() -> hm.Step:
+    base = hm.scratch().sh(
         "echo base-ran",
         label="base",
         cache=hm.forever(),
     )
-    child = base.fork(label="child").run(
+    return base.fork(label="fork").sh(
         "cat /workspace/marker.txt",
         label="child",
     )
-    return hm.pipeline(child, default_image="alpine:3.20")
 "#,
     )
     .expect("pipeline.py");
@@ -43,7 +44,7 @@ def build():
 fn run_harmont(repo: &Path) -> String {
     let bin = env!("CARGO_BIN_EXE_hm");
     let output = Command::new(bin)
-        .args(["run"])
+        .args(["run", "--format", "human", "--logs"])
         .current_dir(repo)
         .output()
         .expect("spawn harmont");
diff --git a/crates/hm/tests/local_parallelism.rs b/crates/hm/tests/local_parallelism.rs
index f18175a2..1204a79c 100644
--- a/crates/hm/tests/local_parallelism.rs
+++ b/crates/hm/tests/local_parallelism.rs
@@ -22,10 +22,12 @@ fn write_pipeline(dir: &Path) {
         r#"
 import harmont as hm
 
-def build():
-    a = hm.scratch().run("sleep 3", label="sleep-a")
-    b = hm.scratch().run("sleep 3", label="sleep-b")
-    return hm.pipeline(a, b, default_image="alpine:3.20")
+
+@hm.pipeline("parallelism", default_image="alpine:3.20")
+def build() -> list[hm.Step]:
+    a = hm.scratch().sh("sleep 3", label="sleep-a")
+    b = hm.scratch().sh("sleep 3", label="sleep-b")
+    return [a, b]
 "#,
     )
     .expect("pipeline.py");