From 2255d0e6846e7b30cfbf00eeb7130ffe20512969 Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 26 May 2026 19:31:36 -0400
Subject: [PATCH 01/21] config: introduce top-level `permissions:` block for
 rootfs/initramfs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Users and groups are now declared in a top-level `permissions:` map and
referenced by name from `rootfs.<name>.permissions` / `initramfs.<name>.permissions`
(or inlined), instead of being buried inside a single extension. This puts identity
provisioning at the image layer where a single coherent passwd/shadow/group
makes sense, lets the same block be reused across rootfs and initramfs,
and leaves room to grow into directory perms or sudoers without further
grammar churn.

When no `permissions:` is set on an image, no script section is emitted —
the base packages' generic /etc/passwd/shadow/group are left untouched.

Extensions that still declare `users:` / `groups:` continue to work but
emit a deprecation warning; that path will be removed in a future release.

The script generator was extracted from ext/build.rs into a shared
`utils::permissions::render_users_groups_script` helper so the legacy
extension path and the new rootfs/initramfs path share one implementation.
---
 src/commands/ext/build.rs                    | 479 ++-----------------
 src/commands/initramfs/image.rs              |  19 +
 src/commands/rootfs/image.rs                 |  19 +
 src/commands/runtime/build.rs                |  28 +-
 src/utils/config.rs                          | 243 ++++++++++
 src/utils/mod.rs                             |   1 +
 src/utils/permissions.rs                     | 476 ++++++++++++++++++
 src/utils/runtime.rs                         |   1 +
 src/utils/target.rs                          |   3 +
 tests/fixtures/configs/with-permissions.yaml |  35 ++
 10 files changed, 857 insertions(+), 447 deletions(-)
 create mode 100644 src/utils/permissions.rs
 create mode 100644 tests/fixtures/configs/with-permissions.yaml
diff --git a/src/commands/ext/build.rs b/src/commands/ext/build.rs
index 82a2e511..4346645d 100644
--- a/src/commands/ext/build.rs
+++ b/src/commands/ext/build.rs
@@ -8,6 +8,7 @@ use crate::utils::config::{ComposedConfig, Config, ExtensionLocation};
 use crate::utils::container::{RunConfig, SdkContainer, TuiContext};
 use crate::utils::lockfile::LockFile;
 use crate::utils::output::{print_error, print_info, print_success, print_warning, OutputLevel};
+use crate::utils::permissions::render_users_groups_script;
 use crate::utils::stamps::{
     compute_ext_input_hash, generate_batch_read_stamps_script, generate_write_stamp_script,
     resolve_required_stamps, validate_stamps_batch, Stamp, StampCommand, StampComponent,
@@ -438,9 +439,25 @@ impl ExtBuildCommand {
             .and_then(|v| v.as_bool())
             .unwrap_or(false);
 
-        // Get users and groups configuration
+        // Get users and groups configuration. These are now deprecated on
+        // extensions — they should be declared in a top-level `permissions:`
+        // block and referenced from rootfs/initramfs. Continue to honor
+        // them during the deprecation window, but surface a warning so
+        // users have time to migrate.
         let users_config = ext_config.get("users").and_then(|v| v.as_mapping());
         let groups_config = ext_config.get("groups").and_then(|v| v.as_mapping());
+        if users_config.is_some() || groups_config.is_some() {
+            print_warning(
+                &format!(
+                    "[DEPRECATED] extension '{}' declares `users:` / `groups:`. \
+                     These are deprecated on extensions and will be removed in a future release. \
+                     Declare users and groups in a top-level `permissions:` block and reference \
+                     it from your rootfs/initramfs entries instead.",
+                    self.extension
+                ),
+                OutputLevel::Normal,
+            );
+        }
 
         // Validate that confext is present if enable_services is used
         if !enable_services.is_empty() && !ext_types.contains(&"confext") {
@@ -1284,454 +1301,22 @@ fi
     }
 
     /// Creates a script section for handling user and group configuration
-    /// This will copy passwd/shadow/group files and create/modify users and groups
+    /// by delegating to [`render_users_groups_script`], targeting the
+    /// extension's sysroot `/etc` and seeding it from the rootfs `/etc`. Kept
+    /// for the legacy `extensions.<name>.users`/`groups` path during the
+    /// deprecation period; new code should call the shared helper directly.
     fn create_users_script_section(
         &self,
         users_config: Option<&serde_yaml::Mapping>,
         groups_config: Option<&serde_yaml::Mapping>,
     ) -> String {
-        // If neither users nor groups are configured, return empty string
-        if users_config.is_none() && groups_config.is_none() {
-            return String::new();
-        }
-
-        let mut script_lines = Vec::new();
-        let mut has_valid_users = false;
-        script_lines.push("\n# Copy and manage user authentication files".to_string());
-
-        // Copy authentication files from rootfs
-        script_lines.push(format!(
-            r#"
-# Copy authentication files from rootfs to extension
-echo "Copying /etc/passwd, /etc/shadow, and /etc/group from rootfs to extension"
-mkdir -p "$AVOCADO_EXT_SYSROOTS/{}/etc"
-cp "$AVOCADO_PREFIX/rootfs/etc/passwd" "$AVOCADO_EXT_SYSROOTS/{}/etc/passwd"
-cp "$AVOCADO_PREFIX/rootfs/etc/shadow" "$AVOCADO_EXT_SYSROOTS/{}/etc/shadow"
-cp "$AVOCADO_PREFIX/rootfs/etc/group" "$AVOCADO_EXT_SYSROOTS/{}/etc/group"
-"#,
-            self.extension, self.extension, self.extension, self.extension
-        ));
-
-        // Auto-incrementing counters for uid/gid starting at 1000
-        script_lines.push(
-            "# Auto-incrementing counters for uid/gid\nCURRENT_UID=1000\nCURRENT_GID=1000\n"
-                .to_string(),
-        );
-
-        // Process groups first (they might be referenced by users)
-        if let Some(groups) = groups_config {
-            script_lines.push("\n# Create groups".to_string());
-
-            for (groupname_val, group_config) in groups {
-                // Convert groupname from Value to String
-                let groupname = match groupname_val.as_str() {
-                    Some(name) => name,
-                    None => continue, // Skip if groupname is not a string
-                };
-
-                if let Some(group_table) = group_config.as_mapping() {
-                    // Parse comprehensive group configuration with defaults
-                    let gid = if let Some(gid_value) = group_table.get("gid") {
-                        if let Some(gid_num) = gid_value.as_i64() {
-                            gid_num.to_string()
-                        } else if let Some(gid_num) = gid_value.as_u64() {
-                            gid_num.to_string()
-                        } else {
-                            "$CURRENT_GID".to_string()
-                        }
-                    } else {
-                        "$CURRENT_GID".to_string()
-                    };
-
-                    let system_group = group_table
-                        .get("system")
-                        .and_then(|s| s.as_bool())
-                        .unwrap_or(false);
-
-                    let password = group_table
-                        .get("password")
-                        .and_then(|p| p.as_str())
-                        .unwrap_or(""); // Default: no group password
-
-                    let members = if let Some(members_value) = group_table.get("members") {
-                        if let Some(members_array) = members_value.as_sequence() {
-                            members_array
-                                .iter()
-                                .filter_map(|m| m.as_str())
-                                .collect::<Vec<_>>()
-                                .join(",")
-                        } else {
-                            "".to_string()
-                        }
-                    } else {
-                        "".to_string()
-                    };
-
-                    let _admins = if let Some(admins_value) = group_table.get("admins") {
-                        if let Some(admins_array) = admins_value.as_sequence() {
-                            admins_array
-                                .iter()
-                                .filter_map(|a| a.as_str())
-                                .collect::<Vec<_>>()
-                        } else {
-                            vec![]
-                        }
-                    } else {
-                        vec![]
-                    };
-
-                    // Escape password for potential gshadow entry
-                    let _escaped_group_password = password.replace("/", "\\/").replace("&", "\\&");
-
-                    let system_type = if system_group { " (system group)" } else { "" };
-                    let password_note = if !password.is_empty() {
-                        " with password"
-                    } else {
-                        ""
-                    };
-                    let members_msg = if !members.is_empty() {
-                        format!(" and members: {members}")
-                    } else {
-                        "".to_string()
-                    };
-                    let password_config = if !password.is_empty() {
-                        format!("\n# Set group password for '{groupname}'\necho \"Note: Group password configured for '{groupname}'\"")
-                    } else {
-                        "".to_string()
-                    };
-
-                    script_lines.push(format!(
-                        r#"
-# Create group '{}'{}
-echo "Creating group '{}'"{}
-if ! grep -q "^{}:" "$AVOCADO_EXT_SYSROOTS/{}/etc/group"; then
-    echo "{}:x:{}:{}" >> "$AVOCADO_EXT_SYSROOTS/{}/etc/group"
-    echo "Group '{}' created with GID {}{}"
-    if [ "{}" = "$CURRENT_GID" ]; then
-        CURRENT_GID=$((CURRENT_GID + 1))
-    fi
-else
-    echo "Group '{}' already exists, updating members"
-    # Update members if specified
-    if [ -n "{}" ]; then
-        sed -i "s|^{}:x:{}:.*$|{}:x:{}:{}|" "$AVOCADO_EXT_SYSROOTS/{}/etc/group"
-        echo "Updated members for group '{}'"
-    fi
-fi{}"#,
-                        groupname,
-                        system_type,
-                        groupname,
-                        password_note,
-                        groupname,
-                        self.extension,
-                        groupname,
-                        gid,
-                        members,
-                        self.extension,
-                        groupname,
-                        gid,
-                        members_msg,
-                        gid,
-                        groupname,
-                        members,
-                        groupname,
-                        gid,
-                        groupname,
-                        gid,
-                        members,
-                        self.extension,
-                        groupname,
-                        password_config
-                    ));
-                } else {
-                    // Simple group with just GID auto-assignment
-                    script_lines.push(format!(
-                        r#"
-# Create group '{}'
-echo "Creating group '{}'"
-if ! grep -q "^{}:" "$AVOCADO_EXT_SYSROOTS/{}/etc/group"; then
-    echo "{}:x:$CURRENT_GID:" >> "$AVOCADO_EXT_SYSROOTS/{}/etc/group"
-    echo "Group '{}' created with GID $CURRENT_GID"
-    CURRENT_GID=$((CURRENT_GID + 1))
-else
-    echo "Group '{}' already exists"
-fi"#,
-                        groupname,
-                        groupname,
-                        groupname,
-                        self.extension,
-                        groupname,
-                        self.extension,
-                        groupname,
-                        groupname
-                    ));
-                }
-            }
-        }
-
-        // Process users
-        if let Some(users) = users_config {
-            let mut user_script_lines = Vec::new();
-
-            for (username_val, user_config) in users {
-                // Convert username from Value to String
-                let username = match username_val.as_str() {
-                    Some(name) => name,
-                    None => continue, // Skip if username is not a string
-                };
-
-                if let Some(user_table) = user_config.as_mapping() {
-                    // Check if user has password field - if not, create with disabled login
-                    let password = user_table
-                        .get("password")
-                        .and_then(|p| p.as_str())
-                        .unwrap_or("*"); // Default to no login allowed
-
-                    has_valid_users = true;
-
-                    // Parse comprehensive user configuration with defaults
-                    let uid = if let Some(uid_value) = user_table.get("uid") {
-                        if let Some(uid_num) = uid_value.as_i64() {
-                            uid_num.to_string()
-                        } else {
-                            "$CURRENT_UID".to_string()
-                        }
-                    } else {
-                        "$CURRENT_UID".to_string()
-                    };
-
-                    let gid = if let Some(gid_value) = user_table.get("gid") {
-                        if let Some(gid_num) = gid_value.as_i64() {
-                            gid_num.to_string()
-                        } else {
-                            "$CURRENT_UID".to_string() // Default to same as UID for user private groups
-                        }
-                    } else {
-                        "$CURRENT_UID".to_string()
-                    };
-
-                    let gecos = user_table
-                        .get("gecos")
-                        .and_then(|g| g.as_str())
-                        .unwrap_or(username); // Default to username
-
-                    let default_home = format!("/home/{username}");
-                    let home = user_table
-                        .get("home")
-                        .and_then(|h| h.as_str())
-                        .unwrap_or(&default_home); // Default to /home/username
-
-                    let shell = user_table
-                        .get("shell")
-                        .and_then(|s| s.as_str())
-                        .unwrap_or("/bin/sh"); // Default shell
-
-                    let groups = if let Some(groups_value) = user_table.get("groups") {
-                        if let Some(groups_array) = groups_value.as_sequence() {
-                            groups_array
-                                .iter()
-                                .filter_map(|g| g.as_str())
-                                .map(|s| s.to_string())
-                                .collect::<Vec<_>>()
-                        } else {
-                            vec![username.to_string()] // Default to user's own group
-                        }
-                    } else {
-                        vec![username.to_string()] // Default to user's own group
-                    };
-
-                    let _primary_group = groups.first().map(|s| s.as_str()).unwrap_or(username);
-
-                    // Shadow file attributes with defaults
-                    let last_change = user_table
-                        .get("last_change")
-                        .and_then(|l| l.as_i64())
-                        .unwrap_or(19000); // Default to a reasonable epoch day
-
-                    let min_days = user_table
-                        .get("min_days")
-                        .and_then(|m| m.as_i64())
-                        .unwrap_or(0); // Default: no minimum
-
-                    let max_days = user_table
-                        .get("max_days")
-                        .and_then(|m| m.as_i64())
-                        .unwrap_or(99999); // Default: no maximum
-
-                    let warn_days = user_table
-                        .get("warn_days")
-                        .and_then(|w| w.as_i64())
-                        .unwrap_or(7); // Default: warn 7 days before
-
-                    let inactive_days = user_table
-                        .get("inactive_days")
-                        .and_then(|i| i.as_i64())
-                        .map(|i| i.to_string())
-                        .unwrap_or_else(|| "".to_string()); // Default: no inactive period
-
-                    let expire_date = user_table
-                        .get("expire_date")
-                        .and_then(|e| e.as_i64())
-                        .map(|e| e.to_string())
-                        .unwrap_or_else(|| "".to_string()); // Default: no expiration
-
-                    let disabled = user_table
-                        .get("disabled")
-                        .and_then(|d| d.as_bool())
-                        .unwrap_or(false);
-
-                    let system_user = user_table
-                        .get("system")
-                        .and_then(|s| s.as_bool())
-                        .unwrap_or(false);
-
-                    // Escape special characters in password for sed
-                    // Note: We use | as sed delimiter to avoid conflicts with / in passwords
-                    // We only need to escape characters that have special meaning in sed replacement strings
-                    let escaped_password = password
-                        .replace("\\", "\\\\") // Escape backslashes first
-                        .replace("&", "\\&") // Escape ampersands (sed replacement reference)
-                        .replace("$", "\\$"); // Escape dollar signs (sed end-of-line anchor)
-
-                    let warning_message = if password.is_empty() {
-                        format!("\necho \"[WARNING] User '{username}' will be able to login with NO PASSWORD\"")
-                    } else {
-                        String::new()
-                    };
-
-                    // Create user in passwd file
-                    user_script_lines.push(format!(
-                        r#"
-# Create user '{}'
-echo "Creating user '{}'{}"{}
-if ! grep -q "^{}:" "$AVOCADO_EXT_SYSROOTS/{}/etc/passwd"; then
-    # Add user to passwd file with comprehensive attributes
-    echo "{}:x:{}:{}:{}:{}:{}" >> "$AVOCADO_EXT_SYSROOTS/{}/etc/passwd"
-    echo "User '{}' created with UID {}, GID {}, home '{}', shell '{}'"
-
-    if [ "{}" = "$CURRENT_UID" ]; then
-        CURRENT_UID=$((CURRENT_UID + 1))
-    fi
-else
-    echo "User '{}' already exists, updating attributes"
-fi"#,
-                        username,
-                        username,
-                        if system_user { " (system user)" } else { "" },
-                        warning_message,
-                        username,
-                        self.extension,
-                        username,
-                        uid,
-                        gid,
-                        gecos,
-                        home,
-                        shell,
-                        self.extension,
-                        username,
-                        uid,
-                        gid,
-                        home,
-                        shell,
-                        uid,
-                        username
-                    ));
-
-                    // Create/update user in shadow file with comprehensive attributes
-                    user_script_lines.push(format!(
-                        r#"
-# Set password and shadow attributes for user '{}'
-echo "Setting password and aging policy for user '{}'"
-if grep -q "^{}:" "$AVOCADO_EXT_SYSROOTS/{}/etc/shadow"; then
-    # Update existing user's shadow entry completely
-    sed -i "s|^{}:.*$|{}:{}:{}:{}:{}:{}:{}:{}:|" "$AVOCADO_EXT_SYSROOTS/{}/etc/shadow"
-    echo "Updated shadow entry for existing user '{}'"
-else
-    # Add new user to shadow file with full attributes
-    echo "{}:{}:{}:{}:{}:{}:{}:{}:" >> "$AVOCADO_EXT_SYSROOTS/{}/etc/shadow"
-    echo "Added new user '{}' to shadow file"
-fi{}"#,
-                        username,
-                        username,
-                        username,
-                        self.extension,
-                        username,
-                        username,
-                        escaped_password,
-                        last_change,
-                        min_days,
-                        max_days,
-                        warn_days,
-                        inactive_days,
-                        expire_date,
-                        self.extension,
-                        username,
-                        username,
-                        escaped_password,
-                        last_change,
-                        min_days,
-                        max_days,
-                        warn_days,
-                        inactive_days,
-                        expire_date,
-                        self.extension,
-                        username,
-                        if disabled {
-                            "\necho \"Note: User account is marked as disabled\""
-                        } else {
-                            ""
-                        }
-                    ));
-
-                    // Add user to additional groups if specified
-                    if groups.len() > 1 {
-                        user_script_lines.push(format!(
-                            r#"
-# Add user '{username}' to additional groups"#
-                        ));
-
-                        for group in &groups[1..] {
-                            // Skip primary group
-                            user_script_lines.push(format!(
-                                r#"
-if grep -q "^{}:" "$AVOCADO_EXT_SYSROOTS/{}/etc/group"; then
-    # Add user to group if not already present
-    if ! grep "^{}:" "$AVOCADO_EXT_SYSROOTS/{}/etc/group" | grep -q "{}"; then
-        sed -i "s|^{}:\([^:]*\):\([^:]*\):\(.*\)$|{}:\1:\2:\3,{}|" "$AVOCADO_EXT_SYSROOTS/{}/etc/group"
-        echo "Added user '{}' to group '{}'"
-    fi
-else
-    echo "Warning: Group '{}' not found, cannot add user '{}'"
-fi"#,
-                                group, self.extension, group, self.extension, username, group, group, username, self.extension, username, group, group, username
-                            ));
-                        }
-                    }
-                }
-            }
-
-            // Add user scripts to main script if there are valid users
-            if has_valid_users {
-                script_lines.push("\n# Create and configure users".to_string());
-                script_lines.extend(user_script_lines);
-            }
-        }
-
-        // Set proper permissions only if we processed any users or groups
-        if groups_config.is_some() || has_valid_users {
-            script_lines.push(format!(
-                r#"
-# Set proper ownership and permissions for authentication files
-chown root:root "$AVOCADO_EXT_SYSROOTS/{}/etc/passwd" "$AVOCADO_EXT_SYSROOTS/{}/etc/shadow" "$AVOCADO_EXT_SYSROOTS/{}/etc/group"
-chmod 644 "$AVOCADO_EXT_SYSROOTS/{}/etc/passwd"
-chmod 640 "$AVOCADO_EXT_SYSROOTS/{}/etc/shadow"
-chmod 644 "$AVOCADO_EXT_SYSROOTS/{}/etc/group"
-echo "Set proper permissions on authentication files""#,
-                self.extension, self.extension, self.extension, self.extension, self.extension, self.extension
-            ));
-        }
-
-        script_lines.join("")
+        let etc_dir = format!("$AVOCADO_EXT_SYSROOTS/{}/etc", self.extension);
+        render_users_groups_script(
+            users_config,
+            groups_config,
+            &etc_dir,
+            Some("$AVOCADO_PREFIX/rootfs/etc"),
+        )
     }
 
     /// Run the extension's `post_build` script inside the SDK container.
@@ -3022,8 +2607,10 @@ mod tests {
 
         // Verify the users script section contains the expected commands
         assert!(script.contains("# Copy and manage user authentication files"));
-        assert!(script
-            .contains("Copying /etc/passwd, /etc/shadow, and /etc/group from rootfs to extension"));
+        assert!(script.contains(
+            "Copying /etc/passwd, /etc/shadow, and /etc/group from \
+             $AVOCADO_PREFIX/rootfs/etc to $AVOCADO_EXT_SYSROOTS/avocado-dev/etc"
+        ));
         assert!(script.contains("mkdir -p \"$AVOCADO_EXT_SYSROOTS/avocado-dev/etc\""));
         assert!(script.contains("cp \"$AVOCADO_PREFIX/rootfs/etc/passwd\" \"$AVOCADO_EXT_SYSROOTS/avocado-dev/etc/passwd\""));
         assert!(script.contains("cp \"$AVOCADO_PREFIX/rootfs/etc/shadow\" \"$AVOCADO_EXT_SYSROOTS/avocado-dev/etc/shadow\""));
diff --git a/src/commands/initramfs/image.rs b/src/commands/initramfs/image.rs
index a9a291e0..b256bdac 100644
--- a/src/commands/initramfs/image.rs
+++ b/src/commands/initramfs/image.rs
@@ -11,6 +11,7 @@ use crate::utils::{
     host_copy::copy_volume_path_to_host,
     kab_wrap::generate_kab_wrap_script,
     output::{print_error, print_info, print_success, OutputLevel},
+    permissions::{mapping_from_hashmap, render_users_groups_script},
     runs_on::RunsOnContext,
     target::resolve_target_required,
 };
@@ -62,6 +63,7 @@ pub fn generate_initramfs_build_script(
     namespace_uuid: &str,
     initramfs_filesystem: &str,
     post_install: Option<&str>,
+    permissions_section: &str,
 ) -> String {
     let post = resolve_install_hooks(post_install, DEFAULT_INITRAMFS_POST_INSTALL);
     let post_install_block = render_hook_block("post_install", &post);
@@ -80,6 +82,7 @@ if [ -d "$INITRAMFS_SYSROOT/usr" ]; then
     mkdir -p "$(dirname "$INITRAMFS_WORK")"
     rm -rf "$INITRAMFS_WORK"
     cp -a "$INITRAMFS_SYSROOT" "$INITRAMFS_WORK"
+{permissions_section}
 
 {post_install_block}
 
@@ -130,6 +133,7 @@ fi"#,
         namespace_uuid = namespace_uuid,
         initramfs_filesystem = initramfs_filesystem,
         post_install_block = post_install_block,
+        permissions_section = permissions_section,
     )
 }
 
@@ -216,10 +220,25 @@ impl InitramfsImageCommand {
         let initramfs_filesystem = config.get_initramfs_filesystem();
         let initramfs_node = composed.merged_value.get("initramfs");
         let post_install = get_post_install(initramfs_node);
+        let permissions_section = config
+            .initramfs_default()
+            .and_then(|img| config.resolve_image_permissions(img))
+            .map(|p| {
+                let users = mapping_from_hashmap(p.users.as_ref());
+                let groups = mapping_from_hashmap(p.groups.as_ref());
+                render_users_groups_script(
+                    users.as_ref(),
+                    groups.as_ref(),
+                    "$INITRAMFS_WORK/etc",
+                    None,
+                )
+            })
+            .unwrap_or_default();
         let build_section = generate_initramfs_build_script(
             NAMESPACE_UUID,
             &initramfs_filesystem,
             post_install.as_deref(),
+            &permissions_section,
         );
 
         // Same kab-wrap pipeline as rootfs/image.rs — see comments
diff --git a/src/commands/rootfs/image.rs b/src/commands/rootfs/image.rs
index 9241b721..661dd9ba 100644
--- a/src/commands/rootfs/image.rs
+++ b/src/commands/rootfs/image.rs
@@ -11,6 +11,7 @@ use crate::utils::{
     host_copy::copy_volume_path_to_host,
     kab_wrap::generate_kab_wrap_script,
     output::{print_error, print_info, print_success, OutputLevel},
+    permissions::{mapping_from_hashmap, render_users_groups_script},
     runs_on::RunsOnContext,
     target::resolve_target_required,
 };
@@ -119,6 +120,7 @@ pub fn generate_rootfs_build_script(
     namespace_uuid: &str,
     rootfs_filesystem: &str,
     post_install: Option<&str>,
+    permissions_section: &str,
 ) -> String {
     let post = resolve_install_hooks(post_install, DEFAULT_ROOTFS_POST_INSTALL);
     let post_install_block = render_hook_block("post_install", &post);
@@ -138,6 +140,7 @@ if [ -d "$ROOTFS_SYSROOT/usr" ]; then
     mkdir -p "$(dirname "$ROOTFS_WORK")"
     rm -rf "$ROOTFS_WORK"
     cp -a "$ROOTFS_SYSROOT" "$ROOTFS_WORK"
+{permissions_section}
 
 {post_install_block}
 
@@ -199,6 +202,7 @@ fi"#,
         namespace_uuid = namespace_uuid,
         rootfs_filesystem = rootfs_filesystem,
         post_install_block = post_install_block,
+        permissions_section = permissions_section,
     )
 }
 
@@ -285,10 +289,25 @@ impl RootfsImageCommand {
         let rootfs_filesystem = config.get_rootfs_filesystem();
         let rootfs_node = composed.merged_value.get("rootfs");
         let post_install = get_post_install(rootfs_node);
+        let permissions_section = config
+            .rootfs_default()
+            .and_then(|img| config.resolve_image_permissions(img))
+            .map(|p| {
+                let users = mapping_from_hashmap(p.users.as_ref());
+                let groups = mapping_from_hashmap(p.groups.as_ref());
+                render_users_groups_script(
+                    users.as_ref(),
+                    groups.as_ref(),
+                    "$ROOTFS_WORK/etc",
+                    None,
+                )
+            })
+            .unwrap_or_default();
         let build_section = generate_rootfs_build_script(
             NAMESPACE_UUID,
             &rootfs_filesystem,
             post_install.as_deref(),
+            &permissions_section,
         );
 
         // If the avocado.yaml asks for a kab-wrapped rootfs, validate the
diff --git a/src/commands/runtime/build.rs b/src/commands/runtime/build.rs
index 9de590ed..51da4098 100644
--- a/src/commands/runtime/build.rs
+++ b/src/commands/runtime/build.rs
@@ -3,9 +3,10 @@ use crate::commands::rootfs::image::{generate_rootfs_build_script, NAMESPACE_UUI
 use crate::commands::sdk::SdkCompileCommand;
 use crate::utils::config::get_post_install;
 use crate::utils::{
-    config::{ComposedConfig, Config},
+    config::{ComposedConfig, Config, ImageConfig},
     container::{RunConfig, SdkContainer, TuiContext},
     output::{print_error, print_info, print_success, OutputLevel},
+    permissions::{mapping_from_hashmap, render_users_groups_script},
     runs_on::RunsOnContext,
     stamps::{
         compute_runtime_input_hash, generate_batch_read_stamps_script, generate_write_stamp_script,
@@ -2220,18 +2221,43 @@ echo "Docker image priming complete.""#,
             }
         };
 
+        // Helper closure: given the rootfs/initramfs ImageConfig the runtime
+        // resolves to, render the users/groups script that will edit the
+        // image's work dir /etc/{passwd,shadow,group} in place. Returns an
+        // empty string when no permissions are configured on the image —
+        // the base packages (avocado-pkg-rootfs / avocado-pkg-initramfs)
+        // ship a generic passwd/shadow/group that we leave untouched.
+        let render_perms = |image: Option<&ImageConfig>, etc_dir: &str| -> String {
+            let Some(perms) = image.and_then(|img| config.resolve_image_permissions(img)) else {
+                return String::new();
+            };
+            let users = mapping_from_hashmap(perms.users.as_ref());
+            let groups = mapping_from_hashmap(perms.groups.as_ref());
+            render_users_groups_script(users.as_ref(), groups.as_ref(), etc_dir, None)
+        };
+
         let rootfs_post_install = get_post_install(parsed.get("rootfs"));
+        let rootfs_permissions_section = render_perms(
+            config.resolve_runtime_rootfs(&self.runtime_name),
+            "$ROOTFS_WORK/etc",
+        );
         let rootfs_build_section = generate_rootfs_build_script(
             NAMESPACE_UUID,
             &config.get_rootfs_filesystem(),
             rootfs_post_install.as_deref(),
+            &rootfs_permissions_section,
         );
 
         let initramfs_post_install = get_post_install(parsed.get("initramfs"));
+        let initramfs_permissions_section = render_perms(
+            config.resolve_runtime_initramfs(&self.runtime_name),
+            "$INITRAMFS_WORK/etc",
+        );
         let initramfs_build_section = generate_initramfs_build_script(
             NAMESPACE_UUID,
             &config.get_initramfs_filesystem(),
             initramfs_post_install.as_deref(),
+            &initramfs_permissions_section,
         );
 
         let script = format!(
diff --git a/src/utils/config.rs b/src/utils/config.rs
index c5bf169a..c46c002b 100644
--- a/src/utils/config.rs
+++ b/src/utils/config.rs
@@ -154,6 +154,7 @@ where
             "overlay",
             "image",
             "post_install",
+            "permissions",
         ],
         "rootfs",
     )
@@ -175,11 +176,25 @@ where
             "overlay",
             "image",
             "post_install",
+            "permissions",
         ],
         "initramfs",
     )
 }
 
+/// Custom deserializer for the top-level `permissions:` field. Accepts a
+/// singleton body (`permissions: { users: ..., groups: ... }`, stored under
+/// the implicit `default` name) or a map of named bodies
+/// (`permissions: { main: { users: ... } }`); delegates to the shared helper.
+fn deserialize_permissions_map<'de, D>(
+    deserializer: D,
+) -> Result<Option<HashMap<String, PermissionsConfig>>, D::Error>
+where
+    D: serde::Deserializer<'de>,
+{
+    named_or_single_deserializer::deserialize(deserializer, &["users", "groups"], "permissions")
+}
+
 /// Custom deserializer module for container_args
 mod container_args_deserializer {
     use serde::{Deserialize, Deserializer};
@@ -596,6 +611,16 @@ pub enum ImageRef {
     Inline(Box<ImageConfig>),
 }
 
+/// A permissions reference on a rootfs/initramfs image: a bare string names
+/// a top-level `permissions.<name>` entry, while a nested block is an inline
+/// anonymous definition. Same untagged shape as [`KernelRef`] / [`ImageRef`].
+#[derive(Debug, Clone, Deserialize, Serialize)]
+#[serde(untagged)]
+pub enum PermissionsRef {
+    Named(String), // key into the top-level `permissions` map
+    Inline(Box<PermissionsConfig>), // body declared directly on the image
+}
+
 /// Runtime configuration section
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct RuntimeConfig {
@@ -746,6 +771,22 @@ pub struct ImageConfig {
     /// the defaults run.
     #[serde(skip_serializing_if = "Option::is_none")]
     pub post_install: Option<String>,
+    /// Optional permissions reference. Either a string name pointing at a
+    /// top-level `permissions.<name>` entry, or an inline `PermissionsConfig`.
+    /// When present, users/groups are provisioned into this image's work
+    /// directory (`/etc/passwd`, `/etc/shadow`, `/etc/group`) during build.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub permissions: Option<PermissionsRef>,
+}
+
+/// Permissions block: users and groups to provision into a built image
+/// (rootfs or initramfs). Attribute values stay raw YAML so the dynamic
+/// field parser in [`crate::utils::permissions`] can consume them untyped.
+/// Entries live in a `HashMap`, so YAML declaration order is not retained.
+#[derive(Debug, Clone, Deserialize, Serialize, Default)]
+pub struct PermissionsConfig {
+    pub users: Option<HashMap<String, serde_yaml::Value>>, // username → raw attributes
+    pub groups: Option<HashMap<String, serde_yaml::Value>>, // group name → raw attributes
 }
 
 /// Provision profile configuration
@@ -1315,6 +1356,14 @@ pub struct Config {
     /// applied across runtimes that don't pin their own.
     #[serde(default, deserialize_with = "deserialize_kernels")]
     pub kernel: Option<HashMap<String, KernelConfig>>,
+    /// Top-level permissions definition(s). Accepts either a singleton
+    /// `PermissionsConfig` (synthesized as the implicit `default` entry) or a
+    /// `name → PermissionsConfig` map. Referenced by name from rootfs or
+    /// initramfs entries via their `permissions:` field. Users/groups
+    /// declared here are baked into the corresponding image's
+    /// `/etc/passwd|shadow|group` during build.
+    #[serde(default, deserialize_with = "deserialize_permissions_map")]
+    pub permissions: Option<HashMap<String, PermissionsConfig>>,
 }
 
 impl Config {
@@ -1776,6 +1825,7 @@ impl Config {
                 signing_keys: None,
                 connect: None,
                 kernel: None,
+                permissions: None,
             });
 
         // Resolve target: CLI arg > env var > config default
@@ -3389,9 +3439,65 @@ impl Config {
                 }
             }
         }
+
+        // Validate permissions refs on top-level rootfs/initramfs entries.
+        let check_perms =
+            |owner_kind: &str, entries: Option<&HashMap<String, ImageConfig>>| -> Result<()> {
+                let Some(entries) = entries else {
+                    return Ok(());
+                };
+                for (entry_name, img) in entries {
+                    let Some(PermissionsRef::Named(pname)) = img.permissions.as_ref() else {
+                        continue;
+                    };
+                    let map = self.permissions.as_ref();
+                    if map.is_none_or(|m| !m.contains_key(pname)) {
+                        let available = map
+                            .map(|m| {
+                                let mut names: Vec<&str> = m.keys().map(|s| s.as_str()).collect();
+                                names.sort_unstable();
+                                names.join(", ")
+                            })
+                            .unwrap_or_default();
+                        return Err(if available.is_empty() {
+                            anyhow::anyhow!(
+                                "{owner_kind} '{entry_name}' references permissions '{pname}', \
+                             but no top-level `permissions:` map is defined in avocado.yaml"
+                            )
+                        } else {
+                            anyhow::anyhow!(
+                                "{owner_kind} '{entry_name}' references permissions '{pname}', \
+                             which is not defined in the top-level `permissions:` map. \
+                             Available: {available}"
+                            )
+                        });
+                    }
+                }
+                Ok(())
+            };
+        check_perms("rootfs", self.rootfs.as_ref())?;
+        check_perms("initramfs", self.initramfs.as_ref())?;
+
         Ok(())
     }
 
+    /// Look up the [`PermissionsConfig`] that an [`ImageConfig`] points at
+    /// through its `permissions:` field. An `Inline(_)` reference yields its
+    /// own body, a `Named(_)` reference is fetched from the top-level map,
+    /// and an image without the field yields `None`. Expects
+    /// `validate_runtime_refs` to have run beforehand: a named reference
+    /// that does not resolve also yields `None` instead of an error (build
+    /// paths treat absence as "no permissions to apply").
+    pub fn resolve_image_permissions<'a>(
+        &'a self,
+        image: &'a ImageConfig,
+    ) -> Option<&'a PermissionsConfig> {
+        image.permissions.as_ref().and_then(|perm_ref| match perm_ref {
+            PermissionsRef::Inline(inline) => Some(inline.as_ref()),
+            PermissionsRef::Named(key) => self.permissions.as_ref()?.get(key),
+        })
+    }
+
     /// Load configuration from a YAML string
     /// Used primarily in tests
     #[allow(dead_code)]
@@ -10937,6 +11043,143 @@ default_target: qemux86-64
         assert!(config.runtimes.is_none());
     }
 
+    // --- permissions: top-level + per-image ref tests ---
+
+    #[test]
+    fn test_permissions_singleton_form_synthesizes_default() {
+        let yaml = r#"
+permissions:
+  users:
+    root:
+      password: ""
+"#;
+        let config = Config::load_from_yaml_str(yaml).unwrap();
+        let perms = config.permissions.as_ref().expect("permissions parsed");
+        assert_eq!(perms.len(), 1); // the bare body becomes exactly one entry
+        assert!(perms.contains_key("default")); // stored under the implicit name
+        let entry = perms.get("default").unwrap();
+        let users = entry.users.as_ref().expect("users present");
+        assert!(users.contains_key("root"));
+    }
+
+    #[test]
+    fn test_permissions_named_map_form_with_multiple_entries() {
+        let yaml = r#"
+permissions:
+  main:
+    users:
+      root:
+        password: ""
+  service:
+    users:
+      avocado:
+        uid: 1000
+"#;
+        let config = Config::load_from_yaml_str(yaml).unwrap();
+        let perms = config.permissions.as_ref().unwrap();
+        assert_eq!(perms.len(), 2); // named form: no implicit `default` entry is added
+        assert!(perms.contains_key("main"));
+        assert!(perms.contains_key("service"));
+    }
+
+    #[test]
+    fn test_image_permissions_named_ref_resolves() {
+        let yaml = r#"
+permissions:
+  main:
+    users:
+      root:
+        password: ""
+rootfs:
+  default:
+    packages: { avocado-pkg-rootfs: "*" }
+    permissions: main
+"#;
+        let config = Config::load_from_yaml_str(yaml).unwrap();
+        let rootfs = config.rootfs_default().expect("rootfs default present");
+        let resolved = config
+            .resolve_image_permissions(rootfs) // follows the string ref `main`
+            .expect("permissions resolved");
+        assert!(resolved.users.as_ref().unwrap().contains_key("root"));
+    }
+
+    #[test]
+    fn test_image_permissions_inline_form() {
+        let yaml = r#"
+rootfs:
+  default:
+    packages: { avocado-pkg-rootfs: "*" }
+    permissions:
+      users:
+        root:
+          password: ""
+"#;
+        let config = Config::load_from_yaml_str(yaml).unwrap();
+        let rootfs = config.rootfs_default().expect("rootfs default present");
+        let resolved = config
+            .resolve_image_permissions(rootfs) // no top-level `permissions:` map is declared
+            .expect("inline permissions resolved");
+        assert!(resolved.users.as_ref().unwrap().contains_key("root"));
+    }
+
+    #[test]
+    fn test_validate_runtime_refs_rejects_unresolved_rootfs_permissions() {
+        let yaml = r#"
+permissions:
+  main:
+    users:
+      root:
+        password: ""
+rootfs:
+  base:
+    packages: { avocado-pkg-rootfs: "*" }
+    permissions: nope
+runtimes:
+  prod:
+    rootfs: base
+"#;
+        let err = Config::load_from_yaml_str(yaml).unwrap_err().to_string(); // load must fail
+        assert!(
+            err.contains("rootfs 'base'")
+                && err.contains("'nope'")
+                && err.contains("Available: main"), // defined names are listed as a hint
+            "expected unresolved-permissions-ref error mentioning the rootfs entry; got: {err}"
+        );
+    }
+
+    #[test]
+    fn test_validate_runtime_refs_rejects_unresolved_initramfs_permissions() {
+        let yaml = r#"
+initramfs:
+  base:
+    packages: { avocado-pkg-initramfs: "*" }
+    permissions: ghost
+runtimes:
+  prod:
+    initramfs: base
+"#;
+        let err = Config::load_from_yaml_str(yaml).unwrap_err().to_string(); // load must fail
+        assert!(
+            err.contains("initramfs 'base'")
+                && err.contains("'ghost'")
+                && err.contains("no top-level"), // wording used when no map exists at all
+            "got: {err}"
+        );
+    }
+
+    #[test]
+    fn test_image_permissions_absent_is_none() {
+        let yaml = r#"
+rootfs:
+  default:
+    packages: { avocado-pkg-rootfs: "*" }
+"#;
+        let config = Config::load_from_yaml_str(yaml).unwrap();
+        let rootfs = config.rootfs_default().unwrap();
+        assert!(rootfs.permissions.is_none()); // field omitted in the YAML above
+        assert!(config.resolve_image_permissions(rootfs).is_none()); // nothing to resolve
+    }
+
     #[test]
     fn test_validate_runtime_refs_rejects_unresolved_initramfs_ref() {
         let yaml = r#"
diff --git a/src/utils/mod.rs b/src/utils/mod.rs
index ebd176f2..bd54339b 100644
--- a/src/utils/mod.rs
+++ b/src/utils/mod.rs
@@ -16,6 +16,7 @@ pub mod lockfile;
 pub mod nfs_server;
 pub mod output;
 pub mod output_format;
+pub mod permissions;
 pub mod pkcs11_devices;
 pub mod prerequisites;
 pub mod provision_result;
diff --git a/src/utils/permissions.rs b/src/utils/permissions.rs
new file mode 100644
index 00000000..cc746679
--- /dev/null
+++ b/src/utils/permissions.rs
@@ -0,0 +1,476 @@
+//! Shared shell-script generator for baking users and groups into an
+//! image's `/etc/passwd`, `/etc/shadow`, and `/etc/group`.
+//!
+//! Used by:
+//! - Extension builds (`ext build`) — legacy path, copies passwd/shadow/group
+//!   from `$AVOCADO_PREFIX/rootfs/etc/` into the extension sysroot, then
+//!   adds users/groups. Will be removed once the deprecation period ends.
+//! - Rootfs / initramfs builds (`runtime build`) — new path, edits the
+//!   files in the image's work directory in place (the base-passwd / shadow
+//!   packages have already staged them there).
+//!
+//! The function consumes raw `serde_yaml::Mapping`s for users/groups so the
+//! existing dynamic field handling (uid/gid/gecos/shell/home/groups/shadow
+//! attributes) keeps working without re-typing every field.
+
+use serde_yaml::Mapping;
+
+/// Render the shell-script section that creates/updates users and groups
+/// inside `etc_dir`.
+///
+/// * `users` — the `users:` mapping (username → attribute map), or `None`.
+/// * `groups` — the `groups:` mapping (groupname → attribute map), or `None`.
+/// * `etc_dir` — shell expression pointing at the target `/etc` directory.
+///   Examples: `"$AVOCADO_EXT_SYSROOTS/myext/etc"`, `"$ROOTFS_WORK/etc"`.
+///   Embedded verbatim into the script — the caller is responsible for
+///   ensuring it resolves correctly at script-run time.
+/// * `copy_from` — when `Some(dir)`, the script begins by copying
+///   `passwd`, `shadow`, `group` from `dir` into `etc_dir`. When `None`,
+///   the files are assumed to already exist at `etc_dir` (the package
+///   install staged them).
+///
+/// Returns an empty string when both `users` and `groups` are `None`.
+pub fn render_users_groups_script(
+    users: Option<&Mapping>,
+    groups: Option<&Mapping>,
+    etc_dir: &str,
+    copy_from: Option<&str>,
+) -> String {
+    if users.is_none() && groups.is_none() {
+        return String::new();
+    }
+
+    let mut script_lines = Vec::new();
+    let mut has_valid_users = false; // set once any user entry is a mapping
+    script_lines.push("\n# Copy and manage user authentication files".to_string());
+
+    // Optional copy of base passwd/shadow/group from a source dir
+    // (e.g. the rootfs sysroot's /etc) into the target /etc.
+    if let Some(src) = copy_from {
+        script_lines.push(format!(
+            r#"
+# Copy authentication files into target /etc
+echo "Copying /etc/passwd, /etc/shadow, and /etc/group from {src} to {etc_dir}"
+mkdir -p "{etc_dir}"
+cp "{src}/passwd" "{etc_dir}/passwd"
+cp "{src}/shadow" "{etc_dir}/shadow"
+cp "{src}/group" "{etc_dir}/group"
+"#
+        ));
+    }
+
+    // Auto-incrementing counters for uid/gid starting at 1000
+    script_lines.push(
+        "# Auto-incrementing counters for uid/gid\nCURRENT_UID=1000\nCURRENT_GID=1000\n"
+            .to_string(),
+    );
+
+    // Process groups first (they might be referenced by users)
+    if let Some(groups) = groups {
+        script_lines.push("\n# Create groups".to_string());
+
+        for (groupname_val, group_config) in groups {
+            let groupname = match groupname_val.as_str() {
+                Some(name) => name,
+                None => continue, // non-string keys are ignored
+            };
+
+            if let Some(group_table) = group_config.as_mapping() {
+                let gid = if let Some(gid_value) = group_table.get("gid") {
+                    if let Some(gid_num) = gid_value.as_i64() {
+                        gid_num.to_string()
+                    } else if let Some(gid_num) = gid_value.as_u64() {
+                        gid_num.to_string()
+                    } else {
+                        "$CURRENT_GID".to_string()
+                    }
+                } else {
+                    "$CURRENT_GID".to_string()
+                };
+
+                let system_group = group_table
+                    .get("system")
+                    .and_then(|s| s.as_bool())
+                    .unwrap_or(false);
+
+                let password = group_table
+                    .get("password")
+                    .and_then(|p| p.as_str())
+                    .unwrap_or("");
+
+                let members = if let Some(members_value) = group_table.get("members") {
+                    if let Some(members_array) = members_value.as_sequence() {
+                        members_array
+                            .iter()
+                            .filter_map(|m| m.as_str())
+                            .collect::<Vec<_>>()
+                            .join(",")
+                    } else {
+                        String::new()
+                    }
+                } else {
+                    String::new()
+                };
+
+                let system_type = if system_group { " (system group)" } else { "" };
+                let password_note = if !password.is_empty() {
+                    " with password"
+                } else {
+                    ""
+                };
+                let members_msg = if !members.is_empty() {
+                    format!(" and members: {members}")
+                } else {
+                    String::new()
+                };
+                let password_config = if !password.is_empty() {
+                    format!("\n# Set group password for '{groupname}'\necho \"Note: Group password configured for '{groupname}'\"")
+                } else {
+                    String::new()
+                };
+
+                script_lines.push(format!(
+                    r#"
+# Create group '{groupname}'{system_type}
+echo "Creating group '{groupname}'"{password_note}
+if ! grep -q "^{groupname}:" "{etc_dir}/group"; then
+    echo "{groupname}:x:{gid}:{members}" >> "{etc_dir}/group"
+    echo "Group '{groupname}' created with GID {gid}{members_msg}"
+    if [ "{gid}" = "$CURRENT_GID" ]; then
+        CURRENT_GID=$((CURRENT_GID + 1))
+    fi
+else
+    echo "Group '{groupname}' already exists, updating members"
+    if [ -n "{members}" ]; then
+        sed -i "s|^{groupname}:\([^:]*\):\([^:]*\):.*$|{groupname}:\1:\2:{members}|" "{etc_dir}/group"
+        echo "Updated members for group '{groupname}'"
+    fi
+fi{password_config}"#
+                ));
+            } else {
+                // Simple group with just GID auto-assignment
+                script_lines.push(format!(
+                    r#"
+# Create group '{groupname}'
+echo "Creating group '{groupname}'"
+if ! grep -q "^{groupname}:" "{etc_dir}/group"; then
+    echo "{groupname}:x:$CURRENT_GID:" >> "{etc_dir}/group"
+    echo "Group '{groupname}' created with GID $CURRENT_GID"
+    CURRENT_GID=$((CURRENT_GID + 1))
+else
+    echo "Group '{groupname}' already exists"
+fi"#
+                ));
+            }
+        }
+    }
+
+    // Process users
+    if let Some(users) = users {
+        let mut user_script_lines = Vec::new();
+
+        for (username_val, user_config) in users {
+            let username = match username_val.as_str() {
+                Some(name) => name,
+                None => continue, // non-string keys are ignored
+            };
+
+            if let Some(user_table) = user_config.as_mapping() {
+                let password = user_table
+                    .get("password")
+                    .and_then(|p| p.as_str())
+                    .unwrap_or("*");
+
+                has_valid_users = true;
+
+                let uid = if let Some(uid_value) = user_table.get("uid") {
+                    if let Some(uid_num) = uid_value.as_i64() {
+                        uid_num.to_string()
+                    } else {
+                        "$CURRENT_UID".to_string()
+                    }
+                } else {
+                    "$CURRENT_UID".to_string()
+                };
+
+                let gid = if let Some(gid_value) = user_table.get("gid") {
+                    if let Some(gid_num) = gid_value.as_i64() {
+                        gid_num.to_string()
+                    } else {
+                        "$CURRENT_UID".to_string()
+                    }
+                } else {
+                    "$CURRENT_UID".to_string() // NOTE(review): UID counter as gid; confirm
+                };
+
+                let gecos = user_table
+                    .get("gecos")
+                    .and_then(|g| g.as_str())
+                    .unwrap_or(username);
+
+                let default_home = format!("/home/{username}");
+                let home = user_table
+                    .get("home")
+                    .and_then(|h| h.as_str())
+                    .unwrap_or(&default_home);
+
+                let shell = user_table
+                    .get("shell")
+                    .and_then(|s| s.as_str())
+                    .unwrap_or("/bin/sh");
+
+                let groups_list = if let Some(groups_value) = user_table.get("groups") {
+                    if let Some(groups_array) = groups_value.as_sequence() {
+                        groups_array
+                            .iter()
+                            .filter_map(|g| g.as_str())
+                            .map(|s| s.to_string())
+                            .collect::<Vec<_>>()
+                    } else {
+                        vec![username.to_string()]
+                    }
+                } else {
+                    vec![username.to_string()]
+                };
+
+                let last_change = user_table
+                    .get("last_change")
+                    .and_then(|l| l.as_i64())
+                    .unwrap_or(19000);
+
+                let min_days = user_table
+                    .get("min_days")
+                    .and_then(|m| m.as_i64())
+                    .unwrap_or(0);
+
+                let max_days = user_table
+                    .get("max_days")
+                    .and_then(|m| m.as_i64())
+                    .unwrap_or(99999);
+
+                let warn_days = user_table
+                    .get("warn_days")
+                    .and_then(|w| w.as_i64())
+                    .unwrap_or(7);
+
+                let inactive_days = user_table
+                    .get("inactive_days")
+                    .and_then(|i| i.as_i64())
+                    .map(|i| i.to_string())
+                    .unwrap_or_default();
+
+                let expire_date = user_table
+                    .get("expire_date")
+                    .and_then(|e| e.as_i64())
+                    .map(|e| e.to_string())
+                    .unwrap_or_default();
+
+                let disabled = user_table
+                    .get("disabled")
+                    .and_then(|d| d.as_bool())
+                    .unwrap_or(false);
+
+                let system_user = user_table
+                    .get("system")
+                    .and_then(|s| s.as_bool())
+                    .unwrap_or(false);
+
+                // We use | as sed delimiter to avoid conflicts with / in
+                // password hashes; we still need to escape the chars that
+                // are special inside a sed replacement string itself.
+                let escaped_password = password
+                    .replace("\\", "\\\\")
+                    .replace("&", "\\&")
+                    .replace("$", "\\$");
+
+                let system_label = if system_user { " (system user)" } else { "" };
+                let warning_message = if password.is_empty() {
+                    format!("\necho \"[WARNING] User '{username}' will be able to login with NO PASSWORD\"")
+                } else {
+                    String::new()
+                };
+                let disabled_note = if disabled {
+                    "\necho \"Note: User account is marked as disabled\""
+                } else {
+                    ""
+                };
+
+                // Create user in passwd file
+                user_script_lines.push(format!(
+                    r#"
+# Create user '{username}'
+echo "Creating user '{username}'{system_label}"{warning_message}
+if ! grep -q "^{username}:" "{etc_dir}/passwd"; then
+    echo "{username}:x:{uid}:{gid}:{gecos}:{home}:{shell}" >> "{etc_dir}/passwd"
+    echo "User '{username}' created with UID {uid}, GID {gid}, home '{home}', shell '{shell}'"
+
+    if [ "{uid}" = "$CURRENT_UID" ]; then
+        CURRENT_UID=$((CURRENT_UID + 1))
+    fi
+else
+    echo "User '{username}' already exists, updating attributes"
+fi"#
+                ));
+
+                // Create/update user in shadow file with comprehensive attributes
+                user_script_lines.push(format!(
+                    r#"
+# Set password and shadow attributes for user '{username}'
+echo "Setting password and aging policy for user '{username}'"
+if grep -q "^{username}:" "{etc_dir}/shadow"; then
+    sed -i "s|^{username}:.*$|{username}:{escaped_password}:{last_change}:{min_days}:{max_days}:{warn_days}:{inactive_days}:{expire_date}:|" "{etc_dir}/shadow"
+    echo "Updated shadow entry for existing user '{username}'"
+else
+    echo "{username}:{escaped_password}:{last_change}:{min_days}:{max_days}:{warn_days}:{inactive_days}:{expire_date}:" >> "{etc_dir}/shadow"
+    echo "Added new user '{username}' to shadow file"
+fi{disabled_note}"#
+                ));
+
+                // Append the user to every listed group after the first entry
+                if groups_list.len() > 1 {
+                    user_script_lines.push(format!(
+                        r#"
+# Add user '{username}' to additional groups"#
+                    ));
+
+                    for group in &groups_list[1..] {
+                        user_script_lines.push(format!(
+                            r#"
+if grep -q "^{group}:" "{etc_dir}/group"; then
+    if ! grep -qE "^{group}:[^:]*:[^:]*:(.*,)?{username}(,.*)?$" "{etc_dir}/group"; then
+        sed -i -e "s|^{group}:\(.*\)$|{group}:\1,{username}|" -e "s|^\({group}:[^:]*:[^:]*:\),|\1|" "{etc_dir}/group"
+        echo "Added user '{username}' to group '{group}'"
+    fi
+else
+    echo "Warning: Group '{group}' not found, cannot add user '{username}'"
+fi"#
+                        ));
+                    }
+                }
+            }
+        }
+
+        if has_valid_users {
+            script_lines.push("\n# Create and configure users".to_string());
+            script_lines.extend(user_script_lines);
+        }
+    }
+
+    // Set proper permissions only if we processed any users or groups
+    if groups.is_some() || has_valid_users {
+        script_lines.push(format!(
+            r#"
+# Set proper ownership and permissions for authentication files
+chown root:root "{etc_dir}/passwd" "{etc_dir}/shadow" "{etc_dir}/group"
+chmod 644 "{etc_dir}/passwd"
+chmod 640 "{etc_dir}/shadow"
+chmod 644 "{etc_dir}/group"
+echo "Set proper permissions on authentication files""#
+        ));
+    }
+
+    script_lines.join("")
+}
+
+/// Convert an `Option<&HashMap<String, serde_yaml::Value>>` (the shape
+/// stored in [`crate::utils::config::PermissionsConfig`]) into an owned
+/// `serde_yaml::Mapping` for [`render_users_groups_script`]. Keys are
+/// inserted in sorted order so the script (and auto-assigned uid/gid
+/// values) is reproducible; `HashMap` iteration order is not.
+/// Returns `None` if the input is `None` or empty.
+pub fn mapping_from_hashmap(
+    src: Option<&std::collections::HashMap<String, serde_yaml::Value>>,
+) -> Option<Mapping> {
+    let map = src.filter(|m| !m.is_empty())?;
+    let mut entries: Vec<_> = map.iter().collect();
+    entries.sort_unstable_by(|a, b| a.0.cmp(b.0));
+    let mut out = Mapping::new();
+    for (k, v) in entries {
+        out.insert(serde_yaml::Value::String(k.clone()), v.clone());
+    }
+    Some(out)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn user(password: &str) -> serde_yaml::Value {
+        let mut m = Mapping::new();
+        m.insert(
+            serde_yaml::Value::String("password".to_string()),
+            serde_yaml::Value::String(password.to_string()),
+        );
+        serde_yaml::Value::Mapping(m)
+    }
+
+    #[test]
+    fn empty_inputs_produce_empty_script() {
+        assert_eq!(render_users_groups_script(None, None, "/etc", None), "");
+    }
+
+    #[test]
+    fn target_etc_is_substituted_verbatim() {
+        let mut users = Mapping::new();
+        users.insert(serde_yaml::Value::String("root".to_string()), user(""));
+        let script = render_users_groups_script(Some(&users), None, "$ROOTFS_WORK/etc", None);
+        assert!(script.contains("$ROOTFS_WORK/etc/passwd"));
+        assert!(script.contains("$ROOTFS_WORK/etc/shadow"));
+        assert!(script.contains("$ROOTFS_WORK/etc/group"));
+        // No copy preamble when copy_from is None.
+        assert!(!script.contains("cp \""));
+    }
+
+    #[test]
+    fn copy_from_emits_preamble() {
+        let mut users = Mapping::new();
+        users.insert(serde_yaml::Value::String("root".to_string()), user(""));
+        let script = render_users_groups_script(
+            Some(&users),
+            None,
+            "$AVOCADO_EXT_SYSROOTS/myext/etc",
+            Some("$AVOCADO_PREFIX/rootfs/etc"),
+        );
+        assert!(script.contains("cp \"$AVOCADO_PREFIX/rootfs/etc/passwd\""));
+        assert!(script.contains("cp \"$AVOCADO_PREFIX/rootfs/etc/shadow\""));
+        assert!(script.contains("cp \"$AVOCADO_PREFIX/rootfs/etc/group\""));
+        assert!(script.contains("$AVOCADO_EXT_SYSROOTS/myext/etc"));
+    }
+
+    #[test]
+    fn empty_password_emits_passwordless_login_warning() {
+        let mut users = Mapping::new();
+        users.insert(serde_yaml::Value::String("root".to_string()), user(""));
+        let script = render_users_groups_script(Some(&users), None, "/etc", None);
+        assert!(script.contains("[WARNING] User 'root' will be able to login with NO PASSWORD"));
+    }
+
+    #[test]
+    fn hashed_password_does_not_warn() {
+        let mut users = Mapping::new();
+        users.insert(
+            serde_yaml::Value::String("alice".to_string()),
+            user("$6$salt$hash"),
+        );
+        let script = render_users_groups_script(Some(&users), None, "/etc", None);
+        assert!(!script.contains("[WARNING]"));
+        assert!(script.contains("alice:\\$6\\$salt\\$hash")); // `$` is backslash-escaped in the script
+    }
+
+    #[test]
+    fn groups_only_still_runs_chown() {
+        let mut groups = Mapping::new();
+        let mut docker = Mapping::new();
+        docker.insert(
+            serde_yaml::Value::String("gid".to_string()),
+            serde_yaml::Value::Number(999.into()),
+        );
+        groups.insert(
+            serde_yaml::Value::String("docker".to_string()),
+            serde_yaml::Value::Mapping(docker),
+        );
+        let script = render_users_groups_script(None, Some(&groups), "/etc", None);
+        assert!(script.contains("Creating group 'docker'"));
+        assert!(script.contains("chown root:root \"/etc/passwd\""));
+    }
+}
diff --git a/src/utils/runtime.rs b/src/utils/runtime.rs
index b01b5a64..019b0fd0 100644
--- a/src/utils/runtime.rs
+++ b/src/utils/runtime.rs
@@ -228,6 +228,7 @@ mod tests {
             rootfs: None,
             initramfs: None,
             kernel: None,
+            permissions: None,
         }
     }
 
diff --git a/src/utils/target.rs b/src/utils/target.rs
index 0e4cf051..e9969c57 100644
--- a/src/utils/target.rs
+++ b/src/utils/target.rs
@@ -255,6 +255,7 @@ mod tests {
             rootfs: None,
             initramfs: None,
             kernel: None,
+            permissions: None,
         }
     }
 
@@ -277,6 +278,7 @@ mod tests {
             rootfs: None,
             initramfs: None,
             kernel: None,
+            permissions: None,
         }
     }
 
@@ -299,6 +301,7 @@ mod tests {
             rootfs: None,
             initramfs: None,
             kernel: None,
+            permissions: None,
         }
     }
 
diff --git a/tests/fixtures/configs/with-permissions.yaml b/tests/fixtures/configs/with-permissions.yaml
new file mode 100644
index 00000000..b8458f64
--- /dev/null
+++ b/tests/fixtures/configs/with-permissions.yaml
@@ -0,0 +1,35 @@
+default_target: qemux86-64
+sdk:
+  image: ghcr.io/avocado-framework/avocado-sdk:latest
+runtimes:
+  default:
+    target: x86_64-unknown-linux-gnu
+
+permissions:
+  main:
+    users:
+      root:
+        password: ''
+      avocado:
+        uid: 1000
+        groups:
+        - avocado
+        password: $6$9YieAo4LtYEIqB6K$og/ykbnIiXP21yc6WHAVKkkIMNE5jaho8Ijj6zFo0UlOxWGpH9xrduFc0P9UYBtQXz2LrJjx7DK7/XAObLoqh0
+        gecos: Avocado service user
+        home: /home/avocado
+        shell: /bin/bash
+    groups:
+      avocado:
+        gid: 1000
+
+rootfs:
+  default:
+    packages:
+      avocado-pkg-rootfs: '*'
+    permissions: main
+
+initramfs:
+  default:
+    packages:
+      avocado-pkg-initramfs: '*'
+    permissions: main

From c7e165e862b44093e9cc4744a2aa7a5534432c45 Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 26 May 2026 20:23:21 -0400
Subject: [PATCH 02/21] stamps: split input hashes per build step to fix
 over-invalidation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, ext install/build/image all shared a single
`compute_ext_input_hash` and runtime install/build shared a single
`compute_runtime_input_hash`. Editing a field that only affects the
build (e.g. ext `image:` kabtool args, `var_files:`, runtime `var:`,
runtime `post_build:`) invalidated the install stamp too, which cascaded
into the install step being re-run via the dependency chain.

Per-step hash functions now cover exactly what each step uses:

  ext install -> packages, types, source
  ext build   -> install inputs + image, overlay, post_build (path + content)
  ext image   -> build inputs + var_files, subvolumes, filesystem
  runtime install -> packages, target
  runtime build   -> install inputs + narrowed kernel, var, var_files,
                     post_build (path + content), rootfs/initramfs filesystem,
                     ext docker_images
  sdk install     -> sdk.packages/image/repo_url/repo_release (no longer
                     includes rootfs/initramfs.packages — those have their
                     own install stamps)
  rootfs install  -> rootfs.packages, rootfs.overlay, narrowed kernel,
                     post_install (path + content)
  initramfs install -> same shape as rootfs

The `kernel:` block is now hashed via a narrow {package, version, compile,
install} mapping at every call site, so cosmetic edits (metadata, new
fields) don't invalidate stamps that don't actually consume them. The
`post_build` / `post_install` hooks now hash script *contents* in addition
to the path, so editing the script body invalidates the stamp without
`--no-stamps`.

`validate_stamps_batch` now accepts a slice of (component, command,
hash) triples so each requirement is compared against the matching
step's hash instead of one shared hash applied to all stamps for a
component.

STAMP_VERSION bumped 1 -> 2; older stamps invalidate on first run after
upgrade, then the new narrower hashes apply going forward.

Adds 14 negative-invalidation tests locking the new shape in place ("X
must NOT invalidate Y" for each step+field pair we untangled).
---
 src/commands/build.rs             |   3 +-
 src/commands/ext/build.rs         |  39 +-
 src/commands/ext/checkout.rs      |  10 +-
 src/commands/ext/clean.rs         |   3 +-
 src/commands/ext/image.rs         |  52 +-
 src/commands/ext/install.rs       |   4 +-
 src/commands/hitl/server.rs       |  12 +-
 src/commands/rootfs/install.rs    |   4 +-
 src/commands/runtime/build.rs     |  38 +-
 src/commands/runtime/clean.rs     |   6 +-
 src/commands/runtime/deploy.rs    |   3 +-
 src/commands/runtime/install.rs   |   4 +-
 src/commands/runtime/provision.rs |   2 +-
 src/commands/runtime/sign.rs      |   3 +-
 src/commands/sdk/clean.rs         |   2 +-
 src/commands/sdk/compile.rs       |  10 +-
 src/commands/sdk/package.rs       |   2 +-
 src/utils/config.rs               |  14 +
 src/utils/prerequisites.rs        |   2 +-
 src/utils/stamps.rs               | 893 +++++++++++++++++++++++++-----
 20 files changed, 895 insertions(+), 211 deletions(-)

diff --git a/src/commands/build.rs b/src/commands/build.rs
index 30cfde0f..c7af3269 100644
--- a/src/commands/build.rs
+++ b/src/commands/build.rs
@@ -197,8 +197,7 @@ impl BuildCommand {
             let output = container_helper
                 .run_in_container_with_output(run_config)
                 .await?;
-            let validation =
-                validate_stamps_batch(&required, output.as_deref().unwrap_or(""), None);
+            let validation = validate_stamps_batch(&required, output.as_deref().unwrap_or(""), &[]);
 
             if !validation.is_satisfied() {
                 let error =
diff --git a/src/commands/ext/build.rs b/src/commands/ext/build.rs
index 4346645d..efffaced 100644
--- a/src/commands/ext/build.rs
+++ b/src/commands/ext/build.rs
@@ -10,9 +10,9 @@ use crate::utils::lockfile::LockFile;
 use crate::utils::output::{print_error, print_info, print_success, print_warning, OutputLevel};
 use crate::utils::permissions::render_users_groups_script;
 use crate::utils::stamps::{
-    compute_ext_input_hash, generate_batch_read_stamps_script, generate_write_stamp_script,
-    resolve_required_stamps, validate_stamps_batch, Stamp, StampCommand, StampComponent,
-    StampOutputs,
+    compute_ext_build_input_hash, compute_ext_install_input_hash,
+    generate_batch_read_stamps_script, generate_write_stamp_script, resolve_required_stamps,
+    validate_stamps_batch, Stamp, StampCommand, StampComponent, StampOutputs,
 };
 use crate::utils::target::resolve_target_required;
 use crate::utils::tui::{TaskId, TuiGuard};
@@ -266,17 +266,22 @@ impl ExtBuildCommand {
                 .run_in_container_with_output(run_config)
                 .await?;
 
-            // Compute current inputs from composed config for staleness detection.
-            // This ensures that changes to path-based extension packages are detected.
-            // Only compare against Extension stamps — SDK/compile-deps stamps use their own hash.
-            let current_inputs = compute_ext_input_hash(parsed, &self.extension).ok();
-            let validation = validate_stamps_batch(
-                &required,
-                output.as_deref().unwrap_or(""),
-                current_inputs
-                    .as_ref()
-                    .map(|i| (&StampComponent::Extension, i)),
-            );
+            // Compute step-scoped current inputs for staleness detection.
+            // Install + build stamps have different input hashes — each requirement
+            // is matched against the entry for its (component, command) pair.
+            let project_root = config.project_root(&self.config_path);
+            let install_inputs = compute_ext_install_input_hash(parsed, &self.extension).ok();
+            let build_inputs =
+                compute_ext_build_input_hash(parsed, &self.extension, &project_root).ok();
+            let mut current_inputs: Vec<crate::utils::stamps::CurrentInput<'_>> = Vec::new();
+            if let Some(ref i) = install_inputs {
+                current_inputs.push((StampComponent::Extension, StampCommand::Install, i));
+            }
+            if let Some(ref i) = build_inputs {
+                current_inputs.push((StampComponent::Extension, StampCommand::Build, i));
+            }
+            let validation =
+                validate_stamps_batch(&required, output.as_deref().unwrap_or(""), &current_inputs);
 
             if !validation.is_satisfied() {
                 let err =
@@ -706,7 +711,11 @@ impl ExtBuildCommand {
             // Use the composed/merged config (which includes remote extension configs)
             // rather than re-reading the raw local file, so that path-based extension
             // packages are included in the hash for proper staleness detection.
-            let inputs = compute_ext_input_hash(parsed, &self.extension)?;
+            let inputs = compute_ext_build_input_hash(
+                parsed,
+                &self.extension,
+                &config.project_root(&self.config_path),
+            )?;
             let outputs = StampOutputs::default();
             let stamp = Stamp::ext_build(&self.extension, &target, inputs, outputs);
             let stamp_script = generate_write_stamp_script(&stamp)?;
diff --git a/src/commands/ext/checkout.rs b/src/commands/ext/checkout.rs
index 672a4942..b121ff28 100644
--- a/src/commands/ext/checkout.rs
+++ b/src/commands/ext/checkout.rs
@@ -136,7 +136,7 @@ impl ExtCheckoutCommand {
                     .await?;
 
                 let validation =
-                    validate_stamps_batch(&requirements, output.as_deref().unwrap_or(""), None);
+                    validate_stamps_batch(&requirements, output.as_deref().unwrap_or(""), &[]);
 
                 if !validation.is_satisfied() {
                     validation
@@ -676,7 +676,7 @@ mod tests {
             install_json
         );
 
-        let result = validate_stamps_batch(&requirements, &output, None);
+        let result = validate_stamps_batch(&requirements, &output, &[]);
 
         // Should pass without needing build stamp
         assert!(result.is_satisfied());
@@ -708,7 +708,7 @@ mod tests {
             sdk_json
         );
 
-        let result = validate_stamps_batch(&requirements, &output, None);
+        let result = validate_stamps_batch(&requirements, &output, &[]);
 
         assert!(!result.is_satisfied());
         assert_eq!(result.missing.len(), 1);
@@ -752,7 +752,7 @@ mod tests {
             install_json
         );
 
-        let result_before = validate_stamps_batch(&requirements, &output_before, None);
+        let result_before = validate_stamps_batch(&requirements, &output_before, &[]);
         assert!(result_before.is_satisfied(), "Should pass before clean");
 
         // After ext clean: SDK still there, ext stamp gone
@@ -762,7 +762,7 @@ mod tests {
             sdk_json
         );
 
-        let result_after = validate_stamps_batch(&requirements, &output_after, None);
+        let result_after = validate_stamps_batch(&requirements, &output_after, &[]);
         assert!(!result_after.is_satisfied(), "Should fail after ext clean");
         assert_eq!(result_after.missing.len(), 1);
     }
diff --git a/src/commands/ext/clean.rs b/src/commands/ext/clean.rs
index 9682d703..6fd13c7a 100644
--- a/src/commands/ext/clean.rs
+++ b/src/commands/ext/clean.rs
@@ -235,8 +235,7 @@ impl ExtCleanCommand {
             .run_in_container_with_output(run_config)
             .await?;
 
-        let validation =
-            validate_stamps_batch(&requirements, output.as_deref().unwrap_or(""), None);
+        let validation = validate_stamps_batch(&requirements, output.as_deref().unwrap_or(""), &[]);
 
         if !validation.is_satisfied() {
             validation
diff --git a/src/commands/ext/image.rs b/src/commands/ext/image.rs
index 8d0fb715..d16075c1 100644
--- a/src/commands/ext/image.rs
+++ b/src/commands/ext/image.rs
@@ -9,9 +9,9 @@ use crate::utils::container::{RunConfig, SdkContainer, TuiContext};
 use crate::utils::lockfile::LockFile;
 use crate::utils::output::{print_info, print_success, print_warning, OutputLevel};
 use crate::utils::stamps::{
-    compute_ext_input_hash, compute_ext_input_hash_with_fs, generate_batch_read_stamps_script,
-    generate_write_stamp_script, resolve_required_stamps, validate_stamps_batch, Stamp,
-    StampCommand, StampComponent, StampOutputs,
+    compute_ext_build_input_hash, compute_ext_image_input_hash, compute_ext_install_input_hash,
+    generate_batch_read_stamps_script, generate_write_stamp_script, resolve_required_stamps,
+    validate_stamps_batch, CurrentInput, Stamp, StampCommand, StampComponent, StampOutputs,
 };
 use crate::utils::target::resolve_target_required;
 use crate::utils::tui::{TaskId, TuiGuard};
@@ -267,17 +267,33 @@ impl ExtImageCommand {
                 .run_in_container_with_output(run_config)
                 .await?;
 
-            // Compute current inputs from composed config for staleness detection.
-            // Use the base hash (without filesystem) to match what install/build stamps wrote.
-            // The filesystem-aware hash is only used when writing/reading the image stamp itself.
-            let current_inputs = compute_ext_input_hash(parsed, &self.extension).ok();
-            let validation = validate_stamps_batch(
-                &required,
-                output.as_deref().unwrap_or(""),
-                current_inputs
-                    .as_ref()
-                    .map(|i| (&StampComponent::Extension, i)),
-            );
+            // Compute step-scoped current inputs. Each Extension stamp
+            // (install / build / image) has its own narrow hash; pass all
+            // three so validate_stamps_batch can match each requirement
+            // against the matching step's hash.
+            let project_root = config.project_root(&self.config_path);
+            let install_inputs = compute_ext_install_input_hash(parsed, &self.extension).ok();
+            let build_inputs =
+                compute_ext_build_input_hash(parsed, &self.extension, &project_root).ok();
+            let image_inputs = compute_ext_image_input_hash(
+                parsed,
+                &self.extension,
+                Some(effective_fs),
+                &project_root,
+            )
+            .ok();
+            let mut current_inputs: Vec<CurrentInput<'_>> = Vec::new();
+            if let Some(ref i) = install_inputs {
+                current_inputs.push((StampComponent::Extension, StampCommand::Install, i));
+            }
+            if let Some(ref i) = build_inputs {
+                current_inputs.push((StampComponent::Extension, StampCommand::Build, i));
+            }
+            if let Some(ref i) = image_inputs {
+                current_inputs.push((StampComponent::Extension, StampCommand::Image, i));
+            }
+            let validation =
+                validate_stamps_batch(&required, output.as_deref().unwrap_or(""), &current_inputs);
 
             if !validation.is_satisfied() {
                 validation
@@ -631,8 +647,12 @@ impl ExtImageCommand {
 
             // Write extension image stamp (unless --no-stamps)
             if !self.no_stamps {
-                let inputs =
-                    compute_ext_input_hash_with_fs(parsed, &self.extension, Some(filesystem))?;
+                let inputs = compute_ext_image_input_hash(
+                    parsed,
+                    &self.extension,
+                    Some(filesystem),
+                    &config.project_root(&self.config_path),
+                )?;
                 let outputs = StampOutputs::default();
                 let stamp = Stamp::ext_image(&self.extension, &target, inputs, outputs);
                 let stamp_script = generate_write_stamp_script(&stamp)?;
diff --git a/src/commands/ext/install.rs b/src/commands/ext/install.rs
index 790a485e..e025ba31 100644
--- a/src/commands/ext/install.rs
+++ b/src/commands/ext/install.rs
@@ -13,7 +13,7 @@ use crate::utils::lockfile::{build_package_spec_with_lock, LockFile, SysrootType
 use crate::utils::output::{print_debug, print_error, print_info, print_success, OutputLevel};
 use crate::utils::runs_on::RunsOnContext;
 use crate::utils::stamps::{
-    compute_ext_input_hash, generate_write_stamp_script, Stamp, StampOutputs,
+    compute_ext_install_input_hash, generate_write_stamp_script, Stamp, StampOutputs,
 };
 use crate::utils::target::resolve_target_required;
 use crate::utils::tui::{TaskId, TuiGuard};
@@ -412,7 +412,7 @@ impl ExtInstallCommand {
                     ctx.renderer
                         .append_output(&ctx.task_id, "Writing install stamp...".to_string());
                 }
-                let inputs = compute_ext_input_hash(parsed, ext_name)?;
+                let inputs = compute_ext_install_input_hash(parsed, ext_name)?;
                 let outputs = StampOutputs::default();
                 let stamp = Stamp::ext_install(ext_name, target, inputs, outputs);
                 let stamp_script = generate_write_stamp_script(&stamp)?;
diff --git a/src/commands/hitl/server.rs b/src/commands/hitl/server.rs
index 3cbc3dc6..579ac439 100644
--- a/src/commands/hitl/server.rs
+++ b/src/commands/hitl/server.rs
@@ -130,7 +130,7 @@ impl HitlServerCommand {
 
             // Validate all stamps from batch output
             let validation =
-                validate_stamps_batch(&requirements, output.as_deref().unwrap_or(""), None);
+                validate_stamps_batch(&requirements, output.as_deref().unwrap_or(""), &[]);
 
             if !validation.is_satisfied() {
                 validation
@@ -526,7 +526,7 @@ mod tests {
             build_json
         );
 
-        let result = validate_stamps_batch(&requirements, &output, None);
+        let result = validate_stamps_batch(&requirements, &output, &[]);
         assert!(result.is_satisfied());
     }
 
@@ -567,7 +567,7 @@ mod tests {
             install_json
         );
 
-        let result = validate_stamps_batch(&requirements, &output, None);
+        let result = validate_stamps_batch(&requirements, &output, &[]);
         assert!(!result.is_satisfied());
         assert_eq!(result.missing.len(), 1);
         assert_eq!(result.missing[0].relative_path(), "ext/app/build.stamp");
@@ -618,7 +618,7 @@ mod tests {
             build_json
         );
 
-        let result_before = validate_stamps_batch(&requirements, &output_before, None);
+        let result_before = validate_stamps_batch(&requirements, &output_before, &[]);
         assert!(result_before.is_satisfied(), "Should pass before clean");
 
         // After ext clean network-driver: SDK still there, ext stamps gone
@@ -628,7 +628,7 @@ mod tests {
             sdk_json
         );
 
-        let result_after = validate_stamps_batch(&requirements, &output_after, None);
+        let result_after = validate_stamps_batch(&requirements, &output_after, &[]);
         assert!(!result_after.is_satisfied(), "Should fail after ext clean");
         assert_eq!(
             result_after.missing.len(),
@@ -698,7 +698,7 @@ mod tests {
             ext_b_build_json
         );
 
-        let result = validate_stamps_batch(&requirements, &output_partial, None);
+        let result = validate_stamps_batch(&requirements, &output_partial, &[]);
         assert!(
             !result.is_satisfied(),
             "Should fail when one extension is cleaned"
diff --git a/src/commands/rootfs/install.rs b/src/commands/rootfs/install.rs
index e7207f2f..5095a5dc 100644
--- a/src/commands/rootfs/install.rs
+++ b/src/commands/rootfs/install.rs
@@ -816,12 +816,12 @@ $DNF_SDK_HOST $DNF_SDK_TARGET_REPO_CONF \
             if let Some(parsed) = params.parsed {
                 let stamp_result = match params.sysroot_type {
                     SysrootType::Rootfs => {
-                        let inputs = compute_rootfs_input_hash(parsed)?;
+                        let inputs = compute_rootfs_input_hash(parsed, params.src_dir)?;
                         let outputs = StampOutputs::default();
                         Ok(Stamp::rootfs_install(params.target, inputs, outputs))
                     }
                     SysrootType::Initramfs => {
-                        let inputs = compute_initramfs_input_hash(parsed)?;
+                        let inputs = compute_initramfs_input_hash(parsed, params.src_dir)?;
                         let outputs = StampOutputs::default();
                         Ok(Stamp::initramfs_install(params.target, inputs, outputs))
                     }
diff --git a/src/commands/runtime/build.rs b/src/commands/runtime/build.rs
index 51da4098..b247653a 100644
--- a/src/commands/runtime/build.rs
+++ b/src/commands/runtime/build.rs
@@ -9,9 +9,10 @@ use crate::utils::{
     permissions::{mapping_from_hashmap, render_users_groups_script},
     runs_on::RunsOnContext,
     stamps::{
-        compute_runtime_input_hash, generate_batch_read_stamps_script, generate_write_stamp_script,
-        resolve_required_stamps_for_runtime_build, validate_stamps_batch, Stamp, StampComponent,
-        StampOutputs,
+        compute_runtime_build_input_hash, compute_runtime_install_input_hash,
+        generate_batch_read_stamps_script, generate_write_stamp_script,
+        resolve_required_stamps_for_runtime_build, validate_stamps_batch, CurrentInput, Stamp,
+        StampCommand, StampComponent, StampOutputs,
     },
     target::resolve_target_required,
     tui::{TaskId, TuiGuard},
@@ -271,16 +272,22 @@ impl RuntimeBuildCommand {
                 .get_merged_runtime_config(&self.runtime_name, target_arch, &self.config_path)
                 .ok()
                 .flatten();
-            let current_inputs = merged_runtime
+            let project_root = config.project_root(&self.config_path);
+            let install_inputs = merged_runtime
                 .as_ref()
-                .and_then(|mr| compute_runtime_input_hash(mr, &self.runtime_name, parsed).ok());
-            let validation = validate_stamps_batch(
-                &required,
-                output.as_deref().unwrap_or(""),
-                current_inputs
-                    .as_ref()
-                    .map(|i| (&StampComponent::Runtime, i)),
-            );
+                .and_then(|mr| compute_runtime_install_input_hash(mr, &self.runtime_name).ok());
+            let build_inputs = merged_runtime.as_ref().and_then(|mr| {
+                compute_runtime_build_input_hash(mr, &self.runtime_name, parsed, &project_root).ok()
+            });
+            let mut current_inputs: Vec<CurrentInput<'_>> = Vec::new();
+            if let Some(ref i) = install_inputs {
+                current_inputs.push((StampComponent::Runtime, StampCommand::Install, i));
+            }
+            if let Some(ref i) = build_inputs {
+                current_inputs.push((StampComponent::Runtime, StampCommand::Build, i));
+            }
+            let validation =
+                validate_stamps_batch(&required, output.as_deref().unwrap_or(""), &current_inputs);
 
             if !validation.is_satisfied() {
                 validation
@@ -736,7 +743,12 @@ impl RuntimeBuildCommand {
             let merged_runtime = config
                 .get_merged_runtime_config(&self.runtime_name, target_arch, &self.config_path)?
                 .unwrap_or_default();
-            let inputs = compute_runtime_input_hash(&merged_runtime, &self.runtime_name, parsed)?;
+            let inputs = compute_runtime_build_input_hash(
+                &merged_runtime,
+                &self.runtime_name,
+                parsed,
+                &config.project_root(&self.config_path),
+            )?;
             let outputs = StampOutputs::default();
             let stamp = Stamp::runtime_build(&self.runtime_name, target_arch, inputs, outputs);
             let stamp_script = generate_write_stamp_script(&stamp)?;
diff --git a/src/commands/runtime/clean.rs b/src/commands/runtime/clean.rs
index 6b6bb0d0..c36129be 100644
--- a/src/commands/runtime/clean.rs
+++ b/src/commands/runtime/clean.rs
@@ -359,7 +359,7 @@ mod tests {
             rt_json
         );
 
-        let result_before = validate_stamps_batch(&requirements, &output_before, None);
+        let result_before = validate_stamps_batch(&requirements, &output_before, &[]);
         assert!(result_before.is_satisfied());
 
         // After runtime clean: SDK still there, runtime stamps gone
@@ -369,7 +369,7 @@ mod tests {
             sdk_json
         );
 
-        let result_after = validate_stamps_batch(&requirements, &output_after, None);
+        let result_after = validate_stamps_batch(&requirements, &output_after, &[]);
         assert!(!result_after.is_satisfied());
         assert_eq!(result_after.missing.len(), 1);
         assert_eq!(
@@ -434,7 +434,7 @@ mod tests {
             ext_build_json
         );
 
-        let result = validate_stamps_batch(&requirements, &output_after, None);
+        let result = validate_stamps_batch(&requirements, &output_after, &[]);
         assert!(!result.is_satisfied());
         // Only runtime stamp should be missing
         assert_eq!(result.satisfied.len(), 3);
diff --git a/src/commands/runtime/deploy.rs b/src/commands/runtime/deploy.rs
index 8e08b1f4..b234c75c 100644
--- a/src/commands/runtime/deploy.rs
+++ b/src/commands/runtime/deploy.rs
@@ -270,8 +270,7 @@ impl RuntimeDeployCommand {
                 }
             };
 
-            let validation =
-                validate_stamps_batch(&required, output.as_deref().unwrap_or(""), None);
+            let validation = validate_stamps_batch(&required, output.as_deref().unwrap_or(""), &[]);
 
             if !validation.is_satisfied() {
                 let msg = format!("Cannot deploy runtime '{}'", self.runtime_name);
diff --git a/src/commands/runtime/install.rs b/src/commands/runtime/install.rs
index 3312b32e..38ca4cf9 100644
--- a/src/commands/runtime/install.rs
+++ b/src/commands/runtime/install.rs
@@ -13,7 +13,7 @@ use crate::utils::lockfile::{build_package_spec_with_lock, LockFile, SysrootType
 use crate::utils::output::{print_debug, print_error, print_info, print_success, OutputLevel};
 use crate::utils::runs_on::RunsOnContext;
 use crate::utils::stamps::{
-    compute_runtime_input_hash, generate_write_stamp_script, Stamp, StampOutputs,
+    compute_runtime_install_input_hash, generate_write_stamp_script, Stamp, StampOutputs,
 };
 use crate::utils::target::resolve_target_required;
 use crate::utils::tui::{TaskId, TuiGuard};
@@ -314,7 +314,7 @@ impl RuntimeInstallCommand {
                     &target_arch,
                     &self.config_path,
                 )? {
-                    let inputs = compute_runtime_input_hash(&merged_runtime, runtime_name, parsed)?;
+                    let inputs = compute_runtime_install_input_hash(&merged_runtime, runtime_name)?;
                     let outputs = StampOutputs::default();
                     let stamp = Stamp::runtime_install(runtime_name, &target_arch, inputs, outputs);
                     let stamp_script = generate_write_stamp_script(&stamp)?;
diff --git a/src/commands/runtime/provision.rs b/src/commands/runtime/provision.rs
index fde9c1b1..8b681155 100644
--- a/src/commands/runtime/provision.rs
+++ b/src/commands/runtime/provision.rs
@@ -180,7 +180,7 @@ impl RuntimeProvisionCommand {
             };
 
             // Validate all stamps from batch output
-            let validation = validate_stamps_batch(&required, output_str, None);
+            let validation = validate_stamps_batch(&required, output_str, &[]);
 
             if !validation.is_satisfied() {
                 // Include the --runs-on target in error message for SDK install hints
diff --git a/src/commands/runtime/sign.rs b/src/commands/runtime/sign.rs
index 29e45fec..032b5e59 100644
--- a/src/commands/runtime/sign.rs
+++ b/src/commands/runtime/sign.rs
@@ -122,8 +122,7 @@ impl RuntimeSignCommand {
                 .await?;
 
             // Validate all stamps from batch output
-            let validation =
-                validate_stamps_batch(&required, output.as_deref().unwrap_or(""), None);
+            let validation = validate_stamps_batch(&required, output.as_deref().unwrap_or(""), &[]);
 
             if !validation.is_satisfied() {
                 validation
diff --git a/src/commands/sdk/clean.rs b/src/commands/sdk/clean.rs
index d6acfce2..903d146e 100644
--- a/src/commands/sdk/clean.rs
+++ b/src/commands/sdk/clean.rs
@@ -132,7 +132,7 @@ impl SdkCleanCommand {
                 .await?;
 
             let validation =
-                validate_stamps_batch(&requirements, output.as_deref().unwrap_or(""), None);
+                validate_stamps_batch(&requirements, output.as_deref().unwrap_or(""), &[]);
 
             if !validation.is_satisfied() {
                 validation
diff --git a/src/commands/sdk/compile.rs b/src/commands/sdk/compile.rs
index 2cbb8272..6efc2857 100644
--- a/src/commands/sdk/compile.rs
+++ b/src/commands/sdk/compile.rs
@@ -154,7 +154,7 @@ impl SdkCompileCommand {
                 .await?;
 
             let validation =
-                validate_stamps_batch(&requirements, output.as_deref().unwrap_or(""), None);
+                validate_stamps_batch(&requirements, output.as_deref().unwrap_or(""), &[]);
 
             if !validation.is_satisfied() {
                 validation
@@ -533,7 +533,7 @@ dependencies = { gcc = "*" }
 
         // SDK stamp missing
         let output = format!("sdk/{}/install.stamp:::null", get_local_arch());
-        let result = validate_stamps_batch(&requirements, &output, None);
+        let result = validate_stamps_batch(&requirements, &output, &[]);
 
         assert!(!result.is_satisfied());
         assert_eq!(result.missing.len(), 1);
@@ -558,7 +558,7 @@ dependencies = { gcc = "*" }
         let sdk_json = serde_json::to_string(&sdk_stamp).unwrap();
 
         let output = format!("sdk/{}/install.stamp:::{}", get_local_arch(), sdk_json);
-        let result = validate_stamps_batch(&requirements, &output, None);
+        let result = validate_stamps_batch(&requirements, &output, &[]);
 
         assert!(result.is_satisfied());
         assert_eq!(result.satisfied.len(), 1);
@@ -582,12 +582,12 @@ dependencies = { gcc = "*" }
         let sdk_json = serde_json::to_string(&sdk_stamp).unwrap();
 
         let output_before = format!("sdk/{}/install.stamp:::{}", get_local_arch(), sdk_json);
-        let result_before = validate_stamps_batch(&requirements, &output_before, None);
+        let result_before = validate_stamps_batch(&requirements, &output_before, &[]);
         assert!(result_before.is_satisfied(), "Should pass before clean");
 
         // After clean --stamps: SDK stamp gone (simulating rm -rf .stamps/)
         let output_after = format!("sdk/{}/install.stamp:::null", get_local_arch());
-        let result_after = validate_stamps_batch(&requirements, &output_after, None);
+        let result_after = validate_stamps_batch(&requirements, &output_after, &[]);
         assert!(
             !result_after.is_satisfied(),
             "Should fail after clean --stamps"
diff --git a/src/commands/sdk/package.rs b/src/commands/sdk/package.rs
index 1c732b28..9aea2aec 100644
--- a/src/commands/sdk/package.rs
+++ b/src/commands/sdk/package.rs
@@ -123,7 +123,7 @@ impl SdkPackageCommand {
                 .await?;
 
             let validation =
-                validate_stamps_batch(&requirements, output.as_deref().unwrap_or(""), None);
+                validate_stamps_batch(&requirements, output.as_deref().unwrap_or(""), &[]);
 
             if !validation.is_satisfied() {
                 validation
diff --git a/src/utils/config.rs b/src/utils/config.rs
index c46c002b..44a73a2e 100644
--- a/src/utils/config.rs
+++ b/src/utils/config.rs
@@ -3867,6 +3867,20 @@ impl Config {
     }
 
+    /// Best-effort project root for resolving project-relative paths
+    /// (`post_install` / `post_build` scripts, etc.) when reading them off
+    /// disk on the host. Uses the resolved `src_dir` when set, otherwise
+    /// falls back to the directory containing the config file.
+    pub fn project_root<P: AsRef<Path>>(&self, config_path: P) -> PathBuf {
+        self.get_resolved_src_dir(&config_path).unwrap_or_else(|| {
+            config_path
+                .as_ref()
+                .parent()
+                .unwrap_or_else(|| Path::new("."))
+                .to_path_buf()
+        })
+    }
+
     /// Get the resolved source directory path
     /// If src_dir is configured, it resolves relative paths relative to the config file
     /// If not configured, returns None (use default behavior)
     pub fn get_resolved_src_dir<P: AsRef<Path>>(&self, config_path: P) -> Option<PathBuf> {
diff --git a/src/utils/prerequisites.rs b/src/utils/prerequisites.rs
index 375c2fe4..6b79c1e3 100644
--- a/src/utils/prerequisites.rs
+++ b/src/utils/prerequisites.rs
@@ -58,7 +58,7 @@ pub async fn check_prerequisites<T: TaskPrerequisites>(
         .context("Failed to run prerequisite stamp check")?
         .unwrap_or_default();
 
-    let validation = validate_stamps_batch(&requirements, &stdout, None);
+    let validation = validate_stamps_batch(&requirements, &stdout, &[]);
 
     if !validation.is_satisfied() {
         validation
diff --git a/src/utils/stamps.rs b/src/utils/stamps.rs
index f0828757..9ca1090e 100644
--- a/src/utils/stamps.rs
+++ b/src/utils/stamps.rs
@@ -7,14 +7,12 @@
 //! 2. Detects staleness via content-addressable hashing (config + package list)
 //! 3. Enforces command ordering with dependency resolution from config
 
-// Allow deprecated variants for backward compatibility during migration
-#![allow(deprecated)]
-
 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 use sha2::{Digest, Sha256};
 use std::fmt;
+use std::path::Path;
 
 /// Get the local machine's CPU architecture
 ///
@@ -35,8 +33,11 @@ pub fn get_local_arch() -> &'static str {
     }
 }
 
-/// Current stamp format version
-pub const STAMP_VERSION: u32 = 1;
+/// Current stamp format version. Bumped from 1 → 2 in the per-step input-hash
+/// rework: each component step now has its own narrow hash, so old stamps
+/// written under the broader shared hashes cannot be compared with current
+/// inputs. `Stamp::is_current` rejects any stamp whose version differs.
+pub const STAMP_VERSION: u32 = 2;
 
 /// Command types that can have stamps
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
@@ -357,6 +358,13 @@ impl Stamp {
 
     /// Check if the stamp inputs match the current inputs
     pub fn is_current(&self, current_inputs: &StampInputs) -> bool {
+        // Stamp format version must match — older stamps were written
+        // against the pre-split shared hash functions and cannot be
+        // compared against the new narrower per-step hashes.
+        if self.version != STAMP_VERSION {
+            return false;
+        }
+
         // Config hash must always match
         if self.inputs.config_hash != current_inputs.config_hash {
             return false;
@@ -814,12 +822,90 @@ pub fn compute_config_hash(value: &serde_yaml::Value) -> Result<String> {
     Ok(compute_hash(&json))
 }
 
+// ─── Per-step input-hash helpers ────────────────────────────────────────
+//
+// The hash functions below split each component's inputs into narrow,
+// step-scoped subsets. Adding a field to `runtime build`'s hash should NOT
+// invalidate `runtime install`'s stamp; this is enforced via separate
+// `compute_<component>_<step>_input_hash` functions, each pulling only the
+// keys that actually affect that step.
+//
+// `narrow_kernel_for_hash` and `hash_script_at` are shared building blocks
+// to keep the hash-data construction consistent across components.
+
+/// Reduce a `kernel:` YAML block to the keys that drive what gets installed
+/// or built: `package`, `version`, `compile` and `install`. Each key is
+/// copied into a fresh mapping only when present; every other field is
+/// dropped on purpose, so edits that do not affect kernel selection
+/// (metadata, future unrelated additions) leave stamps valid. A value
+/// without any of these keys narrows to an empty mapping.
+fn narrow_kernel_for_hash(kernel: &serde_yaml::Value) -> serde_yaml::Value {
+    // Fixed allow-list, visited in order; absent keys are skipped via `?`.
+    let kept_keys = ["package", "version", "compile", "install"];
+    let narrowed: serde_yaml::Mapping = kept_keys
+        .iter()
+        .filter_map(|&key| Some((serde_yaml::Value::from(key), kernel.get(key)?.clone())))
+        .collect();
+    serde_yaml::Value::Mapping(narrowed)
+}
+
+/// Hash the contents of a script file addressed relative to `project_root`.
+/// The result is meant to sit next to the relative path inside a hash
+/// mapping, so a stamp invalidates on (a) a path change or (b) an edit to
+/// the script's contents.
+///
+/// Returns `sha256:<lowercase hex digest>` when the file can be read. Any
+/// read failure — a missing file included — yields the literal `"missing"`
+/// sentinel, so removing a previously hashed script, or adding one where
+/// none existed (path unchanged), changes the value and invalidates.
+fn hash_script_at(project_root: &Path, rel_path: &str) -> String {
+    use std::fmt::Write;
+
+    let contents = match std::fs::read(project_root.join(rel_path)) {
+        Ok(contents) => contents,
+        // Every read failure collapses into the same sentinel value.
+        Err(_) => return "missing".to_string(),
+    };
+    let digest = Sha256::digest(&contents);
+    let mut rendered = String::with_capacity("sha256:".len() + digest.len() * 2);
+    rendered.push_str("sha256:");
+    for byte in digest.iter() {
+        // Writing into a `String` cannot fail, so the result is dropped.
+        let _ = write!(rendered, "{byte:02x}");
+    }
+    rendered
+}
+
+/// Produce the two-entry mapping embedded into input hashes for a
+/// `post_build` / `post_install` hook: `path` holds `rel_path` verbatim and
+/// `content_sha256` holds the `hash_script_at` result for that file.
+fn script_hash_value(project_root: &Path, rel_path: &str) -> serde_yaml::Value {
+    let content_hash = hash_script_at(project_root, rel_path);
+    let entries = [
+        ("path", rel_path.to_string()),
+        ("content_sha256", content_hash),
+    ];
+    let mut mapping = serde_yaml::Mapping::new();
+    for (key, value) in entries {
+        mapping.insert(key.into(), value.into());
+    }
+    serde_yaml::Value::Mapping(mapping)
+}
+
 /// Compute input hash for SDK install
-/// Includes: sdk.dependencies, sdk.image, repo URLs
+///
+/// Includes only inputs that affect the SDK toolchain install itself:
+/// `sdk.packages`, `sdk.image`, `sdk.repo_url`, `sdk.repo_release`.
+///
+/// **Does NOT include `rootfs.packages` / `initramfs.packages`** —
+/// the rootfs and initramfs sysroots are populated by separate
+/// `rootfs install` / `initramfs install` steps with their own stamps.
+/// The orchestrating `avocado sdk install` command writes each of those
+/// stamps independently, so a rootfs-package change invalidates only
+/// the rootfs-install stamp and not the entire SDK toolchain install.
 pub fn compute_sdk_input_hash(config: &serde_yaml::Value) -> Result<StampInputs> {
     let mut hash_data = serde_yaml::Mapping::new();
 
-    // Include sdk.dependencies
     if let Some(sdk) = config.get("sdk") {
         if let Some(deps) = sdk.get("packages") {
             hash_data.insert(
@@ -847,26 +933,6 @@ pub fn compute_sdk_input_hash(config: &serde_yaml::Value) -> Result<StampInputs>
         }
     }
 
-    // Include rootfs.packages (affects rootfs sysroot installed during sdk install)
-    if let Some(rootfs) = config.get("rootfs") {
-        if let Some(packages) = rootfs.get("packages") {
-            hash_data.insert(
-                serde_yaml::Value::String("rootfs.packages".to_string()),
-                packages.clone(),
-            );
-        }
-    }
-
-    // Include initramfs.packages (affects initramfs sysroot installed during sdk install)
-    if let Some(initramfs) = config.get("initramfs") {
-        if let Some(packages) = initramfs.get("packages") {
-            hash_data.insert(
-                serde_yaml::Value::String("initramfs.packages".to_string()),
-                packages.clone(),
-            );
-        }
-    }
-
     let config_hash = compute_config_hash(&serde_yaml::Value::Mapping(hash_data))?;
     Ok(StampInputs::new(config_hash))
 }
@@ -918,22 +984,22 @@ pub fn compute_compile_deps_input_hash(
     Ok(StampInputs::new(config_hash))
 }
 
-pub fn compute_ext_input_hash(config: &serde_yaml::Value, ext_name: &str) -> Result<StampInputs> {
-    compute_ext_input_hash_with_fs(config, ext_name, None)
-}
-
-/// Compute input hash for an extension, including an optional resolved filesystem format.
-/// When `filesystem` is `Some`, it is included in the hash so that changing the image
-/// format (e.g. squashfs → erofs-lz4) invalidates the stamp.  The caller is responsible
-/// for resolving the effective value (explicit per-extension override or rootfs default).
-pub fn compute_ext_input_hash_with_fs(
+/// Compute input hash for **extension install**.
+///
+/// Includes only inputs that affect the package-install step:
+/// - `ext.<name>.packages` (what gets installed)
+/// - `ext.<name>.types` (sysext/confext drives a small set of auto-included packages)
+/// - `ext.<name>.source` (where the extension is fetched from)
+///
+/// Deliberately excludes `image`, `var_files`, `subvolumes`, `post_build`,
+/// `filesystem`, `permissions`, `overlay`, `version`, and all merge/service
+/// fields — those affect build/image output, not what gets installed.
+pub fn compute_ext_install_input_hash(
     config: &serde_yaml::Value,
     ext_name: &str,
-    filesystem: Option<&str>,
 ) -> Result<StampInputs> {
     let mut hash_data = serde_yaml::Mapping::new();
 
-    // Include ext.<name>.dependencies
     if let Some(ext) = config.get("extensions").and_then(|e| e.get(ext_name)) {
         if let Some(deps) = ext.get("packages") {
             hash_data.insert(
@@ -941,67 +1007,145 @@ pub fn compute_ext_input_hash_with_fs(
                 deps.clone(),
             );
         }
-        // Also include types as they affect build
         if let Some(types) = ext.get("types") {
             hash_data.insert(
                 serde_yaml::Value::String(format!("ext.{ext_name}.types")),
                 types.clone(),
             );
         }
-        // Include var_files as they affect which files are excluded from the .raw image
+        if let Some(source) = ext.get("source") {
+            hash_data.insert(
+                serde_yaml::Value::String(format!("ext.{ext_name}.source")),
+                source.clone(),
+            );
+        }
+    }
+
+    let config_hash = compute_config_hash(&serde_yaml::Value::Mapping(hash_data))?;
+    Ok(StampInputs::new(config_hash))
+}
+
+/// Compute input hash for **extension build**.
+///
+/// Hashes the mapping assembled by `ext_build_hash_data`: the install
+/// inputs (`packages`, `types`, `source`), so a package change invalidates
+/// build too, plus the build-only `image`, `overlay`, and `post_build`
+/// hook (relative path and file content, read under `project_root`).
+/// `var_files`, `subvolumes`, and the resolved `filesystem` are left out
+/// here; they are hashed by `compute_ext_image_input_hash` instead.
+pub fn compute_ext_build_input_hash(
+    config: &serde_yaml::Value,
+    ext_name: &str,
+    project_root: &Path,
+) -> Result<StampInputs> {
+    let build_inputs =
+        serde_yaml::Value::Mapping(ext_build_hash_data(config, ext_name, project_root));
+    compute_config_hash(&build_inputs).map(StampInputs::new)
+}
+
+/// Compute input hash for **extension image**.
+///
+/// Build inputs (`ext_build_hash_data`) plus the image-only `var_files`,
+/// `subvolumes`, and — when `Some` is passed — the `filesystem` format.
+pub fn compute_ext_image_input_hash(
     config: &serde_yaml::Value,
     ext_name: &str,
+    filesystem: Option<&str>,
+    project_root: &Path,
 ) -> Result<StampInputs> {
+    let mut hash_data = ext_build_hash_data(config, ext_name, project_root);
+
+    if let Some(ext) = config.get("extensions").and_then(|e| e.get(ext_name)) {
         if let Some(var_files) = ext.get("var_files") {
             hash_data.insert(
                 serde_yaml::Value::String(format!("ext.{ext_name}.var_files")),
                 var_files.clone(),
             );
         }
-        // Include subvolumes as they affect var image creation flags
         if let Some(subvolumes) = ext.get("subvolumes") {
             hash_data.insert(
                 serde_yaml::Value::String(format!("ext.{ext_name}.subvolumes")),
                 subvolumes.clone(),
             );
         }
-        // Include image config as it determines output format and kabtool args
+    }
+    if let Some(fs) = filesystem {
+        hash_data.insert(
+            serde_yaml::Value::String(format!("ext.{ext_name}.filesystem")),
+            serde_yaml::Value::String(fs.to_string()),
+        );
+    }
+
+    let config_hash = compute_config_hash(&serde_yaml::Value::Mapping(hash_data))?;
+    Ok(StampInputs::new(config_hash))
+}
+
+/// Shared mapping construction for `ext build`, reused by `ext image` as
+/// the base it extends with image-only keys. Keeping the shared inputs in
+/// one place avoids drift between the two hash functions.
+fn ext_build_hash_data(
+    config: &serde_yaml::Value,
+    ext_name: &str,
+    project_root: &Path,
+) -> serde_yaml::Mapping {
+    let mut hash_data = serde_yaml::Mapping::new();
+
+    if let Some(ext) = config.get("extensions").and_then(|e| e.get(ext_name)) {
+        // Install-time inputs are also build-time inputs — a package change
+        // invalidates everything downstream.
+        if let Some(deps) = ext.get("packages") {
+            hash_data.insert(
+                serde_yaml::Value::String(format!("ext.{ext_name}.dependencies")),
+                deps.clone(),
+            );
+        }
+        if let Some(types) = ext.get("types") {
+            hash_data.insert(
+                serde_yaml::Value::String(format!("ext.{ext_name}.types")),
+                types.clone(),
+            );
+        }
+        if let Some(source) = ext.get("source") {
+            hash_data.insert(
+                serde_yaml::Value::String(format!("ext.{ext_name}.source")),
+                source.clone(),
+            );
+        }
+        // Build-only inputs.
         if let Some(image) = ext.get("image") {
             hash_data.insert(
                 serde_yaml::Value::String(format!("ext.{ext_name}.image")),
                 image.clone(),
             );
         }
-        // Include post_build so adding/removing/changing the hook re-runs the build.
-        // Note: this hashes the *path*, not the script's contents — re-run with
-        // --no-stamps to pick up edits to the script itself.
-        if let Some(post_build) = ext.get("post_build") {
+        if let Some(overlay) = ext.get("overlay") {
+            hash_data.insert(
+                serde_yaml::Value::String(format!("ext.{ext_name}.overlay")),
+                overlay.clone(),
+            );
+        }
+        if let Some(post_build) = ext.get("post_build").and_then(|v| v.as_str()) {
             hash_data.insert(
                 serde_yaml::Value::String(format!("ext.{ext_name}.post_build")),
-                post_build.clone(),
+                script_hash_value(project_root, post_build),
             );
         }
     }
 
-    // Include the resolved filesystem format when provided — determines the image
-    // format (.raw contents) and must invalidate the stamp when it changes.
-    if let Some(fs) = filesystem {
-        hash_data.insert(
-            serde_yaml::Value::String(format!("ext.{ext_name}.filesystem")),
-            serde_yaml::Value::String(fs.to_string()),
-        );
-    }
-
-    let config_hash = compute_config_hash(&serde_yaml::Value::Mapping(hash_data))?;
-    Ok(StampInputs::new(config_hash))
+    hash_data
 }
 
-/// Compute input hash for rootfs install
-/// Includes: rootfs.packages, top-level kernel config
+/// Compute input hash for **rootfs install**.
 ///
-/// The kernel block matters because rootfs install auto-appends
-/// `kernel-image-<kver>` and `packagegroup-avocado-rootfs-modules-<kver>`
-/// based on the resolved kernel version. Changing `kernel.version`
-/// changes what gets installed even though `rootfs.packages` is unchanged,
-/// so the stamp must invalidate when the kernel block changes.
-pub fn compute_rootfs_input_hash(config: &serde_yaml::Value) -> Result<StampInputs> {
+/// Includes `rootfs.packages`, `rootfs.overlay`, and the narrowed kernel
+/// selection (`package`/`version`/`compile`/`install` only — adding an
+/// unrelated `kernel.metadata` field does NOT invalidate). Also includes
+/// the `post_install` hook path and its file contents so an in-place
+/// script edit invalidates without `--no-stamps`.
+pub fn compute_rootfs_input_hash(
+    config: &serde_yaml::Value,
+    project_root: &Path,
+) -> Result<StampInputs> {
     let mut hash_data = serde_yaml::Mapping::new();
 
     if let Some(rootfs) = config.get("rootfs") {
@@ -1017,12 +1161,18 @@ pub fn compute_rootfs_input_hash(config: &serde_yaml::Value) -> Result<StampInpu
                 overlay.clone(),
             );
         }
+        if let Some(post_install) = rootfs.get("post_install").and_then(|v| v.as_str()) {
+            hash_data.insert(
+                serde_yaml::Value::String("rootfs.post_install".to_string()),
+                script_hash_value(project_root, post_install),
+            );
+        }
     }
 
     if let Some(kernel) = config.get("kernel") {
         hash_data.insert(
             serde_yaml::Value::String("kernel".to_string()),
-            kernel.clone(),
+            narrow_kernel_for_hash(kernel),
         );
     }
 
@@ -1030,13 +1180,14 @@ pub fn compute_rootfs_input_hash(config: &serde_yaml::Value) -> Result<StampInpu
     Ok(StampInputs::new(config_hash))
 }
 
-/// Compute input hash for initramfs install
-/// Includes: initramfs.packages, top-level kernel config
+/// Compute input hash for **initramfs install**.
 ///
-/// Same rationale as `compute_rootfs_input_hash`: initramfs install
-/// auto-appends `packagegroup-avocado-initramfs-modules-<kver>` based
-/// on the resolved kernel version.
-pub fn compute_initramfs_input_hash(config: &serde_yaml::Value) -> Result<StampInputs> {
+/// Same shape as [`compute_rootfs_input_hash`] — narrowed kernel block,
+/// `post_install` content hashed alongside its path.
+pub fn compute_initramfs_input_hash(
+    config: &serde_yaml::Value,
+    project_root: &Path,
+) -> Result<StampInputs> {
     let mut hash_data = serde_yaml::Mapping::new();
 
     if let Some(initramfs) = config.get("initramfs") {
@@ -1052,12 +1203,18 @@ pub fn compute_initramfs_input_hash(config: &serde_yaml::Value) -> Result<StampI
                 overlay.clone(),
             );
         }
+        if let Some(post_install) = initramfs.get("post_install").and_then(|v| v.as_str()) {
+            hash_data.insert(
+                serde_yaml::Value::String("initramfs.post_install".to_string()),
+                script_hash_value(project_root, post_install),
+            );
+        }
     }
 
     if let Some(kernel) = config.get("kernel") {
         hash_data.insert(
             serde_yaml::Value::String("kernel".to_string()),
-            kernel.clone(),
+            narrow_kernel_for_hash(kernel),
         );
     }
 
@@ -1065,25 +1222,60 @@ pub fn compute_initramfs_input_hash(config: &serde_yaml::Value) -> Result<StampI
     Ok(StampInputs::new(config_hash))
 }
 
-/// Compute input hash for runtime install
-/// Includes: runtime.<name>.dependencies (merged with target), kernel config,
-/// extension docker_images (affects var partition priming)
-pub fn compute_runtime_input_hash(
+/// Compute input hash for **runtime install**.
+///
+/// Includes only the inputs that affect the package-install step for the
+/// runtime sysroot: the merged runtime's `packages` (hashed under the
+/// `runtime.<name>.dependencies` key) and its `target`. Excludes kernel,
+/// var, var_files, post_build, rootfs/initramfs filesystem, and extension
+/// docker_images — those affect the build step, not what gets installed
+/// for the runtime itself.
+pub fn compute_runtime_install_input_hash(
     merged_runtime: &serde_yaml::Value,
     runtime_name: &str,
-    parsed: &serde_yaml::Value,
 ) -> Result<StampInputs> {
     let mut hash_data = serde_yaml::Mapping::new();
 
-    // Include the merged dependencies section
     if let Some(deps) = merged_runtime.get("packages") {
         hash_data.insert(
             serde_yaml::Value::String(format!("runtime.{runtime_name}.dependencies")),
             deps.clone(),
         );
     }
+    if let Some(target) = merged_runtime.get("target") {
+        hash_data.insert(
+            serde_yaml::Value::String(format!("runtime.{runtime_name}.target")),
+            target.clone(),
+        );
+    }
 
-    // Include target if specified
+    let config_hash = compute_config_hash(&serde_yaml::Value::Mapping(hash_data))?;
+    Ok(StampInputs::new(config_hash))
+}
+
+/// Compute input hash for **runtime build**.
+///
+/// Includes the install inputs plus build-only inputs: the narrowed
+/// kernel selection (`package`/`version`/`compile`/`install` only), the
+/// runtime-level `var` and `var_files` config, the `post_build` hook
+/// (path + content), the rootfs/initramfs filesystem formats this
+/// runtime consumes, and any extension `docker_images` that this runtime
+/// needs primed at build time.
+pub fn compute_runtime_build_input_hash(
+    merged_runtime: &serde_yaml::Value,
+    runtime_name: &str,
+    parsed: &serde_yaml::Value,
+    project_root: &Path,
+) -> Result<StampInputs> {
+    let mut hash_data = serde_yaml::Mapping::new();
+
+    // Install inputs are also build inputs.
+    if let Some(deps) = merged_runtime.get("packages") {
+        hash_data.insert(
+            serde_yaml::Value::String(format!("runtime.{runtime_name}.dependencies")),
+            deps.clone(),
+        );
+    }
     if let Some(target) = merged_runtime.get("target") {
         hash_data.insert(
             serde_yaml::Value::String(format!("runtime.{runtime_name}.target")),
@@ -1091,16 +1283,14 @@ pub fn compute_runtime_input_hash(
         );
     }
 
-    // Include kernel config if specified (changes to kernel config should trigger rebuild)
+    // Build-only inputs.
     if let Some(kernel) = merged_runtime.get("kernel") {
         hash_data.insert(
             serde_yaml::Value::String(format!("runtime.{runtime_name}.kernel")),
-            kernel.clone(),
+            narrow_kernel_for_hash(kernel),
         );
     }
 
-    // Include docker_images from extensions in this runtime
-    // (changes to extension docker_images should trigger runtime rebuild to re-prime images)
     if let Some(ext_list) = merged_runtime
         .get("extensions")
         .and_then(|e| e.as_sequence())
@@ -1124,33 +1314,25 @@ pub fn compute_runtime_input_hash(
         }
     }
 
-    // Include runtime-level var_files if specified
     if let Some(var_files) = merged_runtime.get("var_files") {
         hash_data.insert(
             serde_yaml::Value::String(format!("runtime.{runtime_name}.var_files")),
             var_files.clone(),
         );
     }
-
-    // Include runtime-level var config (subvolumes, compression) if specified
     if let Some(var) = merged_runtime.get("var") {
         hash_data.insert(
             serde_yaml::Value::String(format!("runtime.{runtime_name}.var")),
             var.clone(),
         );
     }
-
-    // Include post_build so adding/removing/changing the hook re-runs the build.
-    // Note: this hashes the *path*, not the script's contents — re-run with
-    // --no-stamps to pick up edits to the script itself.
-    if let Some(post_build) = merged_runtime.get("post_build") {
+    if let Some(post_build) = merged_runtime.get("post_build").and_then(|v| v.as_str()) {
         hash_data.insert(
             serde_yaml::Value::String(format!("runtime.{runtime_name}.post_build")),
-            post_build.clone(),
+            script_hash_value(project_root, post_build),
         );
     }
 
-    // Include rootfs/initramfs filesystem formats (changes should trigger rebuild)
     if let Some(rootfs) = parsed.get("rootfs") {
         if let Some(fs) = rootfs.get("filesystem") {
             hash_data.insert(
@@ -1271,16 +1453,23 @@ pub fn parse_batch_stamps_output(
     result
 }
 
+/// A (component, command) key paired with the freshly computed input
+/// hash for that specific step. Passed into [`validate_stamps_batch`]
+/// so each requirement is compared against the correct step-scoped hash.
+pub type CurrentInput<'a> = (StampComponent, StampCommand, &'a StampInputs);
+
 /// Validate all stamp requirements from batch output in a single pass.
 ///
-/// `current_inputs` is an optional (component, hash) pair used for staleness detection.
-/// The hash is only compared against stamps matching the specified component type.
-/// Dependency stamps (e.g., SDK stamps when building an extension) are validated
-/// for existence only — their content hash was verified when they were created.
+/// `current_inputs` is a slice of (component, command, hash) triples
+/// used for staleness detection. A requirement is matched against the
+/// triple whose component AND command both match it. Requirements with
+/// no matching entry are validated for existence only — appropriate for
+/// dependency stamps (e.g. SDK stamps when building an extension) whose
+/// content hash was verified when they were created.
 pub fn validate_stamps_batch(
     requirements: &[StampRequirement],
     batch_output: &str,
-    current_inputs: Option<(&StampComponent, &StampInputs)>,
+    current_inputs: &[CurrentInput<'_>],
 ) -> StampValidationResult {
     let stamp_data = parse_batch_stamps_output(batch_output);
     let mut validation = StampValidationResult::new();
@@ -1289,10 +1478,10 @@ pub fn validate_stamps_batch(
         let stamp_path = req.relative_path();
         let json_content = stamp_data.get(&stamp_path).and_then(|v| v.as_ref());
 
-        // Only apply current_inputs to stamps matching the specified component type.
         let inputs_for_req = current_inputs
-            .filter(|(component, _)| req.component == **component)
-            .map(|(_, inputs)| inputs);
+            .iter()
+            .find(|(component, command, _)| req.component == *component && req.command == *command)
+            .map(|(_, _, inputs)| *inputs);
 
         check_stamp_requirement(
             req,
@@ -2099,7 +2288,7 @@ ext/my-ext/build.stamp:::null"#
             ext_json
         );
 
-        let result = validate_stamps_batch(&requirements, &output, None);
+        let result = validate_stamps_batch(&requirements, &output, &[]);
 
         assert!(result.is_satisfied());
         assert_eq!(result.satisfied.len(), 2);
@@ -2129,7 +2318,7 @@ ext/my-ext/build.stamp:::null"#
             sdk_json
         );
 
-        let result = validate_stamps_batch(&requirements, &output, None);
+        let result = validate_stamps_batch(&requirements, &output, &[]);
 
         assert!(!result.is_satisfied());
         assert_eq!(result.satisfied.len(), 1);
@@ -2144,7 +2333,7 @@ ext/my-ext/build.stamp:::null"#
             StampRequirement::ext_install("my-ext"),
         ];
 
-        let result = validate_stamps_batch(&requirements, "", None);
+        let result = validate_stamps_batch(&requirements, "", &[]);
 
         assert!(!result.is_satisfied());
         assert!(result.satisfied.is_empty());
@@ -2364,7 +2553,7 @@ ext/my-ext/build.stamp:::null"#
             sdk_json,
             ext_json
         );
-        let result_before = validate_stamps_batch(&requirements, &output_before, None);
+        let result_before = validate_stamps_batch(&requirements, &output_before, &[]);
         assert!(result_before.is_satisfied());
 
         // After ext clean: SDK still there, ext stamps gone
@@ -2373,7 +2562,7 @@ ext/my-ext/build.stamp:::null"#
             get_local_arch(),
             sdk_json
         );
-        let result_after = validate_stamps_batch(&requirements, &output_after_ext_clean, None);
+        let result_after = validate_stamps_batch(&requirements, &output_after_ext_clean, &[]);
         assert!(!result_after.is_satisfied());
         assert_eq!(result_after.missing.len(), 1);
         assert_eq!(
@@ -2403,7 +2592,7 @@ runtime/my-runtime/build.stamp:::null"#,
             get_local_arch()
         );
 
-        let result = validate_stamps_batch(&requirements, &output, None);
+        let result = validate_stamps_batch(&requirements, &output, &[]);
 
         assert!(!result.is_satisfied());
         assert!(result.satisfied.is_empty());
@@ -2435,7 +2624,11 @@ runtime/my-runtime/build.stamp:::null"#,
         let result = validate_stamps_batch(
             &requirements,
             &output,
-            Some((&StampComponent::Extension, &changed_inputs)),
+            &[(
+                StampComponent::Extension,
+                StampCommand::Install,
+                &changed_inputs,
+            )],
         );
 
         assert!(!result.is_satisfied());
@@ -2483,7 +2676,11 @@ runtime/my-runtime/build.stamp:::null"#,
         let result = validate_stamps_batch(
             &requirements,
             &output,
-            Some((&StampComponent::Extension, &changed_inputs)),
+            &[(
+                StampComponent::Extension,
+                StampCommand::Install,
+                &changed_inputs,
+            )],
         );
 
         assert!(!result.is_satisfied());
@@ -2675,9 +2872,20 @@ kernel:
         .unwrap();
 
         let empty_parsed = serde_yaml::Value::Mapping(serde_yaml::Mapping::new());
-        let hash_without =
-            compute_runtime_input_hash(&without_kernel, "dev", &empty_parsed).unwrap();
-        let hash_with = compute_runtime_input_hash(&with_kernel, "dev", &empty_parsed).unwrap();
+        let hash_without = compute_runtime_build_input_hash(
+            &without_kernel,
+            "dev",
+            &empty_parsed,
+            std::path::Path::new("."),
+        )
+        .unwrap();
+        let hash_with = compute_runtime_build_input_hash(
+            &with_kernel,
+            "dev",
+            &empty_parsed,
+            std::path::Path::new("."),
+        )
+        .unwrap();
 
         // Hashes should differ when kernel config is added
         assert_ne!(hash_without.config_hash, hash_with.config_hash);
@@ -2708,10 +2916,20 @@ kernel:
         .unwrap();
 
         let empty_parsed = serde_yaml::Value::Mapping(serde_yaml::Mapping::new());
-        let hash_package =
-            compute_runtime_input_hash(&kernel_package, "dev", &empty_parsed).unwrap();
-        let hash_compile =
-            compute_runtime_input_hash(&kernel_compile, "dev", &empty_parsed).unwrap();
+        let hash_package = compute_runtime_build_input_hash(
+            &kernel_package,
+            "dev",
+            &empty_parsed,
+            std::path::Path::new("."),
+        )
+        .unwrap();
+        let hash_compile = compute_runtime_build_input_hash(
+            &kernel_compile,
+            "dev",
+            &empty_parsed,
+            std::path::Path::new("."),
+        )
+        .unwrap();
 
         // Switching kernel mode should produce a different hash
         assert_ne!(hash_package.config_hash, hash_compile.config_hash);
@@ -2745,8 +2963,16 @@ extensions:
         )
         .unwrap();
 
-        let hash_without = compute_ext_input_hash(&config_without, "my-ext").unwrap();
-        let hash_with = compute_ext_input_hash(&config_with, "my-ext").unwrap();
+        let hash_without = compute_ext_image_input_hash(
+            &config_without,
+            "my-ext",
+            None,
+            std::path::Path::new("."),
+        )
+        .unwrap();
+        let hash_with =
+            compute_ext_image_input_hash(&config_with, "my-ext", None, std::path::Path::new("."))
+                .unwrap();
 
         assert_ne!(
             hash_without.config_hash, hash_with.config_hash,
@@ -2790,8 +3016,20 @@ extensions:
         )
         .unwrap();
 
-        let hash_without = compute_runtime_input_hash(&runtime, "dev", &parsed_without).unwrap();
-        let hash_with = compute_runtime_input_hash(&runtime, "dev", &parsed_with).unwrap();
+        let hash_without = compute_runtime_build_input_hash(
+            &runtime,
+            "dev",
+            &parsed_without,
+            std::path::Path::new("."),
+        )
+        .unwrap();
+        let hash_with = compute_runtime_build_input_hash(
+            &runtime,
+            "dev",
+            &parsed_with,
+            std::path::Path::new("."),
+        )
+        .unwrap();
 
         assert_ne!(
             hash_without.config_hash, hash_with.config_hash,
@@ -2821,9 +3059,20 @@ var_files:
         .unwrap();
 
         let empty_parsed = serde_yaml::Value::Mapping(serde_yaml::Mapping::new());
-        let hash_without =
-            compute_runtime_input_hash(&runtime_without, "dev", &empty_parsed).unwrap();
-        let hash_with = compute_runtime_input_hash(&runtime_with, "dev", &empty_parsed).unwrap();
+        let hash_without = compute_runtime_build_input_hash(
+            &runtime_without,
+            "dev",
+            &empty_parsed,
+            std::path::Path::new("."),
+        )
+        .unwrap();
+        let hash_with = compute_runtime_build_input_hash(
+            &runtime_with,
+            "dev",
+            &empty_parsed,
+            std::path::Path::new("."),
+        )
+        .unwrap();
 
         assert_ne!(
             hash_without.config_hash, hash_with.config_hash,
@@ -2861,8 +3110,16 @@ extensions:
         )
         .unwrap();
 
-        let hash_without = compute_ext_input_hash(&config_without, "my-ext").unwrap();
-        let hash_with = compute_ext_input_hash(&config_with, "my-ext").unwrap();
+        let hash_without = compute_ext_image_input_hash(
+            &config_without,
+            "my-ext",
+            None,
+            std::path::Path::new("."),
+        )
+        .unwrap();
+        let hash_with =
+            compute_ext_image_input_hash(&config_with, "my-ext", None, std::path::Path::new("."))
+                .unwrap();
 
         assert_ne!(
             hash_without.config_hash, hash_with.config_hash,
@@ -2894,13 +3151,389 @@ var:
         .unwrap();
 
         let empty_parsed = serde_yaml::Value::Mapping(serde_yaml::Mapping::new());
-        let hash_without =
-            compute_runtime_input_hash(&runtime_without, "dev", &empty_parsed).unwrap();
-        let hash_with = compute_runtime_input_hash(&runtime_with, "dev", &empty_parsed).unwrap();
+        let hash_without = compute_runtime_build_input_hash(
+            &runtime_without,
+            "dev",
+            &empty_parsed,
+            std::path::Path::new("."),
+        )
+        .unwrap();
+        let hash_with = compute_runtime_build_input_hash(
+            &runtime_with,
+            "dev",
+            &empty_parsed,
+            std::path::Path::new("."),
+        )
+        .unwrap();
 
         assert_ne!(
             hash_without.config_hash, hash_with.config_hash,
             "Adding var config should change the runtime input hash"
         );
     }
+
+    // ────────────────────────────────────────────────────────────────────
+    // Negative-invalidation tests
+    //
+    // Each test asserts that mutating a field that the step does NOT care
+    // about leaves the step's input hash unchanged. Without these, the
+    // per-step split is one refactor away from regressing back to the
+    // shared-hash over-invalidation behavior.
+    // ────────────────────────────────────────────────────────────────────
+
+    fn ext_with_extras(extras: &str) -> serde_yaml::Value {
+        let yaml = format!(
+            r#"
+extensions:
+  my-ext:
+    packages:
+      foo: "*"
+    types: [sysext]
+{extras}
+"#
+        );
+        serde_yaml::from_str(&yaml).unwrap()
+    }
+
+    fn ext_install_hash(value: &serde_yaml::Value) -> String {
+        compute_ext_install_input_hash(value, "my-ext")
+            .unwrap()
+            .config_hash
+    }
+
+    fn ext_build_hash(value: &serde_yaml::Value) -> String {
+        compute_ext_build_input_hash(value, "my-ext", std::path::Path::new("."))
+            .unwrap()
+            .config_hash
+    }
+
+    fn ext_image_hash(value: &serde_yaml::Value) -> String {
+        compute_ext_image_input_hash(value, "my-ext", None, std::path::Path::new("."))
+            .unwrap()
+            .config_hash
+    }
+
+    #[test]
+    fn ext_install_unaffected_by_image_field() {
+        let base = ext_with_extras("");
+        let with_image = ext_with_extras("    image:\n      type: kab\n      args: \"-v 1.0.0\"");
+        assert_eq!(ext_install_hash(&base), ext_install_hash(&with_image));
+    }
+
+    #[test]
+    fn ext_install_unaffected_by_var_files() {
+        let base = ext_with_extras("");
+        let with_var = ext_with_extras("    var_files:\n      - \"var/lib/docker/**\"");
+        assert_eq!(ext_install_hash(&base), ext_install_hash(&with_var));
+    }
+
+    #[test]
+    fn ext_install_unaffected_by_subvolumes_and_post_build() {
+        let base = ext_with_extras("");
+        let with = ext_with_extras(
+            "    subvolumes:\n      lib/docker:\n        nodatacow: true\n    post_build: scripts/build.sh",
+        );
+        assert_eq!(ext_install_hash(&base), ext_install_hash(&with));
+    }
+
+    #[test]
+    fn ext_install_unaffected_by_metadata_and_runtime_fields() {
+        let base = ext_with_extras("");
+        let with = ext_with_extras(
+            "    version: \"1.0.0\"\n    scopes: [system]\n    enable_services: [foo.service]\n    \
+             on_merge: [\"echo hi\"]\n    on_unmerge: [\"echo bye\"]",
+        );
+        assert_eq!(ext_install_hash(&base), ext_install_hash(&with));
+    }
+
+    #[test]
+    fn ext_build_unaffected_by_var_files_and_subvolumes() {
+        let base = ext_with_extras("");
+        let with = ext_with_extras(
+            "    var_files:\n      - \"var/lib/docker/**\"\n    subvolumes:\n      lib/x:\n        nodatacow: true",
+        );
+        assert_eq!(ext_build_hash(&base), ext_build_hash(&with));
+    }
+
+    #[test]
+    fn ext_build_unaffected_by_filesystem_override() {
+        // The filesystem field is image-only — build must not see it.
+        let base = ext_with_extras("");
+        let with_fs = ext_with_extras("    filesystem: erofs-zst");
+        assert_eq!(ext_build_hash(&base), ext_build_hash(&with_fs));
+    }
+
+    #[test]
+    fn ext_image_includes_var_files_and_subvolumes() {
+        let base = ext_with_extras("");
+        let with = ext_with_extras(
+            "    var_files:\n      - \"var/lib/docker/**\"\n    subvolumes:\n      lib/x:\n        nodatacow: true",
+        );
+        assert_ne!(ext_image_hash(&base), ext_image_hash(&with));
+    }
+
+    #[test]
+    fn ext_build_content_changes_invalidate_when_post_build_set() {
+        let tmp = tempfile::TempDir::new().unwrap();
+        let script = tmp.path().join("build.sh");
+        std::fs::write(&script, b"#!/bin/sh\necho original\n").unwrap();
+
+        let config = ext_with_extras("    post_build: build.sh");
+        let h1 = compute_ext_build_input_hash(&config, "my-ext", tmp.path())
+            .unwrap()
+            .config_hash;
+
+        std::fs::write(&script, b"#!/bin/sh\necho edited\n").unwrap();
+        let h2 = compute_ext_build_input_hash(&config, "my-ext", tmp.path())
+            .unwrap()
+            .config_hash;
+
+        assert_ne!(
+            h1, h2,
+            "editing post_build script body should invalidate the build hash"
+        );
+    }
+
+    fn runtime(yaml: &str) -> serde_yaml::Value {
+        serde_yaml::from_str(yaml).unwrap()
+    }
+
+    #[test]
+    fn runtime_install_unaffected_by_build_only_fields() {
+        let base = runtime(
+            r#"
+packages:
+  avocado-runtime: "*"
+target: "x86_64"
+"#,
+        );
+        let with_build_only = runtime(
+            r#"
+packages:
+  avocado-runtime: "*"
+target: "x86_64"
+kernel:
+  version: "6.6.*"
+var:
+  compression: zstd
+var_files:
+  - source: "files/x"
+    dest: "lib/x"
+post_build: scripts/post.sh
+"#,
+        );
+        let h1 = compute_runtime_install_input_hash(&base, "dev")
+            .unwrap()
+            .config_hash;
+        let h2 = compute_runtime_install_input_hash(&with_build_only, "dev")
+            .unwrap()
+            .config_hash;
+        assert_eq!(h1, h2);
+    }
+
+    #[test]
+    fn runtime_install_unaffected_by_top_level_rootfs_initramfs_filesystem() {
+        let runtime_node = runtime(
+            r#"
+packages:
+  avocado-runtime: "*"
+target: "x86_64"
+"#,
+        );
+        let parsed_a: serde_yaml::Value = serde_yaml::from_str(
+            r#"
+rootfs:
+  filesystem: erofs-lz4
+initramfs:
+  filesystem: cpio.zst
+"#,
+        )
+        .unwrap();
+        let parsed_b: serde_yaml::Value = serde_yaml::from_str(
+            r#"
+rootfs:
+  filesystem: erofs-zst
+initramfs:
+  filesystem: cpio
+"#,
+        )
+        .unwrap();
+        // install hash ignores the parsed/top-level filesystem entirely.
+        let h_a = compute_runtime_install_input_hash(&runtime_node, "dev")
+            .unwrap()
+            .config_hash;
+        let h_b = compute_runtime_install_input_hash(&runtime_node, "dev")
+            .unwrap()
+            .config_hash;
+        assert_eq!(h_a, h_b);
+        // sanity: build hash DOES include filesystem
+        let b_a = compute_runtime_build_input_hash(
+            &runtime_node,
+            "dev",
+            &parsed_a,
+            std::path::Path::new("."),
+        )
+        .unwrap()
+        .config_hash;
+        let b_b = compute_runtime_build_input_hash(
+            &runtime_node,
+            "dev",
+            &parsed_b,
+            std::path::Path::new("."),
+        )
+        .unwrap()
+        .config_hash;
+        assert_ne!(
+            b_a, b_b,
+            "runtime build SHOULD invalidate on filesystem swap"
+        );
+    }
+
+    #[test]
+    fn sdk_install_unaffected_by_rootfs_initramfs_packages() {
+        let base: serde_yaml::Value = serde_yaml::from_str(
+            r#"
+sdk:
+  image: my-sdk:1
+  packages:
+    sdk-deps: "*"
+rootfs:
+  packages:
+    pkg-a: "*"
+initramfs:
+  packages:
+    pkg-b: "*"
+"#,
+        )
+        .unwrap();
+        let bumped: serde_yaml::Value = serde_yaml::from_str(
+            r#"
+sdk:
+  image: my-sdk:1
+  packages:
+    sdk-deps: "*"
+rootfs:
+  packages:
+    pkg-a: ">=2.0"
+initramfs:
+  packages:
+    pkg-b: ">=3.0"
+"#,
+        )
+        .unwrap();
+        let h_base = compute_sdk_input_hash(&base).unwrap().config_hash;
+        let h_bumped = compute_sdk_input_hash(&bumped).unwrap().config_hash;
+        assert_eq!(
+            h_base, h_bumped,
+            "rootfs/initramfs package bumps must not invalidate the SDK install stamp"
+        );
+    }
+
+    #[test]
+    fn rootfs_install_ignores_unrelated_kernel_fields() {
+        let base: serde_yaml::Value = serde_yaml::from_str(
+            r#"
+rootfs:
+  packages:
+    avocado-pkg-rootfs: "*"
+kernel:
+  version: "6.6.*"
+  package: kernel-image
+"#,
+        )
+        .unwrap();
+        let with_metadata: serde_yaml::Value = serde_yaml::from_str(
+            r#"
+rootfs:
+  packages:
+    avocado-pkg-rootfs: "*"
+kernel:
+  version: "6.6.*"
+  package: kernel-image
+  metadata: cosmetic
+  description: "added later"
+"#,
+        )
+        .unwrap();
+        let h_base = compute_rootfs_input_hash(&base, std::path::Path::new("."))
+            .unwrap()
+            .config_hash;
+        let h_extra = compute_rootfs_input_hash(&with_metadata, std::path::Path::new("."))
+            .unwrap()
+            .config_hash;
+        assert_eq!(
+            h_base, h_extra,
+            "adding unrelated keys under `kernel:` must not invalidate the rootfs install stamp"
+        );
+    }
+
+    #[test]
+    fn rootfs_install_invalidates_on_kernel_version_change() {
+        let v1: serde_yaml::Value = serde_yaml::from_str(
+            r#"
+rootfs:
+  packages:
+    avocado-pkg-rootfs: "*"
+kernel:
+  version: "6.6.*"
+"#,
+        )
+        .unwrap();
+        let v2: serde_yaml::Value = serde_yaml::from_str(
+            r#"
+rootfs:
+  packages:
+    avocado-pkg-rootfs: "*"
+kernel:
+  version: "6.7.*"
+"#,
+        )
+        .unwrap();
+        let h_v1 = compute_rootfs_input_hash(&v1, std::path::Path::new("."))
+            .unwrap()
+            .config_hash;
+        let h_v2 = compute_rootfs_input_hash(&v2, std::path::Path::new("."))
+            .unwrap()
+            .config_hash;
+        assert_ne!(h_v1, h_v2);
+    }
+
+    #[test]
+    fn rootfs_install_post_install_content_change_invalidates() {
+        let tmp = tempfile::TempDir::new().unwrap();
+        let script = tmp.path().join("post.sh");
+        std::fs::write(&script, b"#!/bin/sh\necho v1\n").unwrap();
+
+        let config: serde_yaml::Value = serde_yaml::from_str(
+            r#"
+rootfs:
+  packages:
+    avocado-pkg-rootfs: "*"
+  post_install: post.sh
+"#,
+        )
+        .unwrap();
+        let h1 = compute_rootfs_input_hash(&config, tmp.path())
+            .unwrap()
+            .config_hash;
+
+        std::fs::write(&script, b"#!/bin/sh\necho v2\n").unwrap();
+        let h2 = compute_rootfs_input_hash(&config, tmp.path())
+            .unwrap()
+            .config_hash;
+
+        assert_ne!(h1, h2);
+    }
+
+    #[test]
+    fn stamp_version_bump_invalidates_old_stamps() {
+        let inputs = StampInputs::new("sha256:abc".to_string());
+        let mut stamp = Stamp::sdk_install("x86_64", inputs.clone(), StampOutputs::default());
+        // Forge an older version.
+        stamp.version = STAMP_VERSION - 1;
+        assert!(
+            !stamp.is_current(&inputs),
+            "older stamp version should be reported as stale"
+        );
+    }
 }

From d7d7fc617a5aa59a3fbd8190576d4c751f0715da Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 26 May 2026 21:39:07 -0400
Subject: [PATCH 03/21] runtime build: fall back to default rootfs/initramfs
 for permissions resolution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a runtime has no explicit `rootfs:` / `initramfs:` ref (the common
case for projects that define images at the top level), the resolver
returned None and the permissions section came out empty. As a result,
the root user's shadow entry was never rewritten and root login was
silently broken on the resulting image.

Fix: in runtime/build.rs, fall back to `config.rootfs_default()` /
`config.initramfs_default()` when the runtime-level ref is unset. This
is the same fallback the image build itself uses to find its
filesystem/post_install settings.

Adds a regression test in `utils::config::tests` that mirrors the test
project shape (top-level rootfs/initramfs with `permissions: dev`,
runtime declares no rootfs/initramfs of its own) and asserts the
fallback path picks up the permissions block.

Verified end-to-end: after rebuild, the rootfs erofs image's
/etc/shadow now carries `root::19000:...` (empty password) instead of
the inherited `root:*:...` from the sysroot.
---
 src/commands/runtime/build.rs | 23 +++++++++++-------
 src/utils/config.rs           | 44 +++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/src/commands/runtime/build.rs b/src/commands/runtime/build.rs
index b247653a..5e4ee1bf 100644
--- a/src/commands/runtime/build.rs
+++ b/src/commands/runtime/build.rs
@@ -2248,11 +2248,21 @@ echo "Docker image priming complete.""#,
             render_users_groups_script(users.as_ref(), groups.as_ref(), etc_dir, None)
         };
 
+        // When the runtime doesn't declare its own `rootfs:` / `initramfs:`
+        // ref, fall back to the top-level default entry — same fallback the
+        // image build itself uses to find filesystem/post_install settings.
+        // Without this fallback, projects that only set permissions at the
+        // top level (the common case) silently get no permissions baked
+        // into the work dir.
+        let resolved_rootfs = config
+            .resolve_runtime_rootfs(&self.runtime_name)
+            .or_else(|| config.rootfs_default());
+        let resolved_initramfs = config
+            .resolve_runtime_initramfs(&self.runtime_name)
+            .or_else(|| config.initramfs_default());
+
         let rootfs_post_install = get_post_install(parsed.get("rootfs"));
-        let rootfs_permissions_section = render_perms(
-            config.resolve_runtime_rootfs(&self.runtime_name),
-            "$ROOTFS_WORK/etc",
-        );
+        let rootfs_permissions_section = render_perms(resolved_rootfs, "$ROOTFS_WORK/etc");
         let rootfs_build_section = generate_rootfs_build_script(
             NAMESPACE_UUID,
             &config.get_rootfs_filesystem(),
@@ -2261,10 +2271,7 @@ echo "Docker image priming complete.""#,
         );
 
         let initramfs_post_install = get_post_install(parsed.get("initramfs"));
-        let initramfs_permissions_section = render_perms(
-            config.resolve_runtime_initramfs(&self.runtime_name),
-            "$INITRAMFS_WORK/etc",
-        );
+        let initramfs_permissions_section = render_perms(resolved_initramfs, "$INITRAMFS_WORK/etc");
         let initramfs_build_section = generate_initramfs_build_script(
             NAMESPACE_UUID,
             &config.get_initramfs_filesystem(),
diff --git a/src/utils/config.rs b/src/utils/config.rs
index 44a73a2e..0e4e3fe1 100644
--- a/src/utils/config.rs
+++ b/src/utils/config.rs
@@ -11136,6 +11136,50 @@ rootfs:
         assert!(resolved.users.as_ref().unwrap().contains_key("root"));
     }
 
+    /// Regression: when a runtime has no explicit `rootfs:` / `initramfs:`
+    /// ref, runtime build must still pick up the top-level default entry's
+    /// `permissions:`. Previously `resolve_runtime_rootfs` returned None
+    /// here, the permissions section came out empty, and root login was
+    /// silently broken on the resulting image.
+    #[test]
+    fn test_rootfs_default_fallback_carries_permissions() {
+        let yaml = r#"
+default_target: qemuarm64
+permissions:
+  dev:
+    users:
+      root:
+        password: ""
+rootfs:
+  permissions: dev
+initramfs:
+  permissions: dev
+runtimes:
+  dev:
+    target: aarch64-unknown-linux-gnu
+    packages: { avocado-runtime: "*" }
+"#;
+        let config = Config::load_from_yaml_str(yaml).unwrap();
+        // Runtime has no explicit rootfs/initramfs ref — must fall back
+        // to the top-level default entry to pick up permissions.
+        assert!(config.resolve_runtime_rootfs("dev").is_none());
+        assert!(config.resolve_runtime_initramfs("dev").is_none());
+        let default_rootfs = config
+            .rootfs_default()
+            .expect("rootfs default present from singleton form");
+        let perms = config
+            .resolve_image_permissions(default_rootfs)
+            .expect("permissions resolve via default rootfs");
+        assert!(perms.users.as_ref().unwrap().contains_key("root"));
+        let default_initramfs = config
+            .initramfs_default()
+            .expect("initramfs default present from singleton form");
+        let perms = config
+            .resolve_image_permissions(default_initramfs)
+            .expect("permissions resolve via default initramfs");
+        assert!(perms.users.as_ref().unwrap().contains_key("root"));
+    }
+
     #[test]
     fn test_validate_runtime_refs_rejects_unresolved_rootfs_permissions() {
         let yaml = r#"

From 4209e7f54fdc021f791a1add266353bd07aa4ad9 Mon Sep 17 00:00:00 2001
From: nicksinas <nicksinas@gmail.com>
Date: Sun, 31 May 2026 03:27:42 -0500
Subject: [PATCH 04/21] docs: plan for avocado deploy port forwarding on macOS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Investigate why `avocado runtime deploy` fails on macOS and design the
fix. Root cause: the deploy script runs inside the SDK container, which
runs inside the slirp-NAT'd avocado-vm, so the TUF repo HTTP server
(:8585) it starts is unreachable by the target device, and the script's
host-IP autodetect returns container/VM addresses.

Plan: open a per-deploy QMP hostfwd (bound to 0.0.0.0, open only for
the duration of the deploy), publish the container's repo port to the
VM, and set AVOCADO_DEPLOY_REPO_HOST to the macOS LAN IP (via
get_local_ip_for_remote). The forward is surfaced as a reusable
`avocado vm port-forward` primitive. No desktop change — the CLI owns
the qemu lifecycle.

Plan only; no behavior change yet.
---
 docs/features/macos-deploy-port-forwarding.md | 190 ++++++++++++++++++
 1 file changed, 190 insertions(+)
 create mode 100644 docs/features/macos-deploy-port-forwarding.md

diff --git a/docs/features/macos-deploy-port-forwarding.md b/docs/features/macos-deploy-port-forwarding.md
new file mode 100644
index 00000000..acac37b8
--- /dev/null
+++ b/docs/features/macos-deploy-port-forwarding.md
@@ -0,0 +1,190 @@
+# `avocado deploy` on macOS: VM port forwarding
+
+Status: **proposal / plan**. Investigation + design for making
+`avocado runtime deploy` work on macOS, where the build/deploy runs
+inside the slirp-NAT'd avocado-vm.
+
+## 1. Symptom
+
+`avocado deploy <device>` on macOS fails: the device can't fetch the
+TUF repo the deploy serves, so the final
+`avocadoctl runtime add --url http://<ip>:8585` step on the device errors
+out (connection refused / timeout / wrong host).
+
+## 2. How deploy works (today)
+
+`runtime/deploy.rs` builds a shell script and runs it **inside the SDK
+container** (`run_in_container`, `create_deploy_script`):
+
+1. Assembles a TUF repo under `/tmp/avocado-deploy-repo` (metadata +
+   symlinked image `.raw` targets).
+2. `python3 -m http.server 8585 --bind 0.0.0.0` to serve it.
+3. Auto-detects the IP the device should fetch from
+   (`ip route get <device>` / `ip -4 addr show scope global`), or honors
+   `AVOCADO_DEPLOY_REPO_HOST` if set.
+4. SSHes to the device and runs
+   `avocadoctl runtime add --url http://<HOST_IP>:8585`.
+
+It already anticipates this problem — the script comment says
+`AVOCADO_DEPLOY_REPO_HOST` is "useful for QEMU user-mode networking where
+the host is at 10.0.2.2" — but nothing wires it up on macOS.
+
+## 3. Why it breaks on macOS (the topology)
+
+```
+macOS host  (LAN: 192.168.x.y, reachable by the device)
+   │  qemu, slirp user-mode NAT
+   ▼
+avocado-vm  (guest 10.0.2.15; host alias 10.0.2.2; NOT inbound-reachable from LAN)
+   │  dockerd
+   ▼
+SDK container  ← the deploy script runs HERE
+   • python3 http.server :8585   (bound 0.0.0.0 *inside the container*)
+   • ip addr / ip route          → container/VM addresses (docker bridge, 10.0.2.15)
+   • ssh → device                (outbound via slirp NAT: OK)
+```
+
+Three independent gaps, all from the server living inside the NAT'd VM:
+
+1. **Repo host IP is wrong.** The script's autodetect runs in the
+   container and returns a docker-bridge / `10.0.2.15` address. The
+   device is handed `http://10.0.2.15:8585`, which is meaningless on the
+   LAN. (`10.0.2.2` is only meaningful *inside* the guest, so that's
+   wrong too.)
+2. **No inbound path to the server.** slirp does not let a LAN device
+   reach the guest. Even with the right IP, nothing forwards the device's
+   request into the VM/container. The existing qemu `hostfwd` is
+   `tcp:127.0.0.1:<port>-:22` — **loopback-only, SSH-only**.
+3. **Container port isn't exposed to the VM.** The http.server binds
+   `0.0.0.0` *in the container's* netns (docker bridge), not the VM's
+   `:8585`. (`deploy.rs` adds no `--net=host` / `-p`, unlike the HITL
+   server which does.)
+
+Outbound SSH from the container to the device works (slirp NAT), so the
+control path is fine; only the device→repo fetch is broken.
+
+## 4. Proposed design
+
+Reuse the in-container HTTP server (keeps the repo files where they're
+staged) and bridge the device→server path with a **per-deploy port
+forward** plus the correct repo host. Three pieces:
+
+### 4a. A reusable VM port-forward primitive (QMP)
+
+Add dynamic slirp forwarding via the existing QMP client
+([`src/utils/vm/qmp.rs`](../../src/utils/vm/qmp.rs)), using
+`human-monitor-command`:
+
+```
+hostfwd_add  net0 tcp:0.0.0.0:8585-:8585     # open  (bind 0.0.0.0 → LAN-reachable)
+hostfwd_remove net0 tcp:0.0.0.0:8585          # close
+```
+
+- `0.0.0.0` (not `127.0.0.1`) so a LAN device can reach `macOS:8585`.
+  This is the key difference from the SSH forward.
+- Forwards `macOS:8585 → guest 10.0.2.15:8585`.
+- Surface it as `avocado vm port-forward add|remove|list <host>:<port>-:<port>`
+  for general use, and have deploy call it internally. (A general
+  primitive is the "properly support port forwarding" the feature asks
+  for; deploy is its first consumer.)
+- Alternative: a **static** `hostfwd=tcp:0.0.0.0:8585-:8585` baked into
+  `qemu.rs` at VM start. Simpler, but leaves a LAN port open for the
+  VM's whole lifetime — rejected in favor of open-only-during-deploy.
+
+### 4b. Expose the container's repo port to the VM
+
+In `deploy.rs`, when routing through the VM, publish the repo port from
+the SDK container to the VM host so the qemu forward lands on it:
+
+- Add `-p 8585:8585` to the deploy container args (or `--net=host`,
+  matching the HITL server's pattern). Then
+  `macOS:8585 → (hostfwd) → VM:8585 → (docker -p) → container:8585`.
+
+### 4c. Hand the device the right host IP
+
+Set `AVOCADO_DEPLOY_REPO_HOST` to the macOS host's LAN IP, reusing
+[`get_local_ip_for_remote`](../../src/utils/remote.rs) (resolves the
+local interface IP that can reach a given device). `deploy.rs` already
+forwards `AVOCADO_DEPLOY_REPO_HOST` into the container env — so the
+device is told `http://<macOS-LAN-IP>:8585`, which routes back through
+the qemu forward.
+
+### End-to-end (external device on the LAN — the primary case)
+
+```
+device ── http GET ──► macOS-LAN-IP:8585
+                          └─(qemu hostfwd 0.0.0.0:8585→10.0.2.15:8585)
+                              └─ VM:8585 ─(docker -p)─► container http.server
+container ── ssh ──► device   (outbound slirp NAT)
+```
+
+## 5. Orchestration (where the glue lives)
+
+`avocado deploy` runs on the **host**, so the host-side command wraps the
+deploy with the forward lifecycle. On macOS + VM-routing active:
+
+1. Resolve macOS LAN IP via `get_local_ip_for_remote(device_host)`.
+2. QMP `hostfwd_add net0 tcp:0.0.0.0:8585-:8585` against the VM's
+   `qmp.sock` (path from `VmPaths`).
+3. Run the deploy container with `AVOCADO_DEPLOY_REPO_HOST=<LAN IP>` and
+   `-p 8585:8585`.
+4. On completion (success or error), QMP `hostfwd_remove` to close the
+   LAN port. Best-effort; also reconcile stale forwards on next `vm
+   start`.
+
+Gate all of this on macOS + `route::resolve_mode() == Apply` (the same
+signal that says "we're talking to the avocado-vm's docker"). On Linux
+with a real local docker / `--runs-on`, deploy already works as-is —
+leave it untouched.
+
+## 6. Scenarios
+
+- **External board on the LAN (primary):** fully solved by §4.
+- **Deploy to the avocado-vm itself as the device (testing):**
+  degenerate — the deploy script runs in a container *inside* the same
+  VM and would SSH to the VM and fetch from itself. Out of scope here;
+  document that the device must be a reachable address and the VM-as-
+  target needs a separate path (e.g. SSH to the bridge gateway), if we
+  want it at all.
+
+## 7. Alternatives considered
+
+- **Run the TUF http server on the macOS host** instead of the
+  container. No qemu forward needed (host is already on the LAN). But the
+  repo targets (manifest + `.raw` images) are staged in the container's
+  avocado prefix (an NFS/virtiofs-backed volume in the VM) and symlinked;
+  exposing them to a host-side server is more invasive than forwarding a
+  port. Rejected for now.
+- **Static hostfwd at VM start** (§4a) — simpler, but a permanently-open
+  LAN port. Rejected.
+
+## 8. Implementation steps
+
+1. `src/utils/vm/qmp.rs`: add `hostfwd_add` / `hostfwd_remove` /
+   `hostfwd_list` helpers (wrap `human-monitor-command`).
+2. New `avocado vm port-forward` subcommand (`src/commands/vm/`) using
+   those, for general use + tests.
+3. `src/commands/runtime/deploy.rs`: on macOS+VM-routing, resolve LAN IP,
+   open the forward, inject `AVOCADO_DEPLOY_REPO_HOST` + `-p 8585:8585`,
+   and close the forward in a guaranteed-cleanup path.
+4. Honor `AVOCADO_DEPLOY_REPO_PORT` end-to-end (forward + publish + URL)
+   so a non-default port still works.
+5. Tests: QMP command formatting; deploy wiring sets the env + container
+   args on macOS and leaves Linux/`--runs-on` paths unchanged.
+
+## 9. Open questions / risks
+
+- **macOS firewall:** opening `0.0.0.0:8585` on the host may prompt the
+  application firewall. qemu is the listener; confirm the prompt/behavior.
+- **Security:** the forward exposes the repo to the LAN for the deploy's
+  duration. Acceptable (TUF metadata is signed; it's transient), but
+  document it. Could bind to the specific LAN interface instead of
+  `0.0.0.0` if we want to narrow it.
+- **Port conflict:** if `8585` is taken on the host, the forward fails —
+  pick a free port and thread it through `AVOCADO_DEPLOY_REPO_PORT`.
+- **QMP `hostfwd_add` availability:** confirm the bundled qemu's slirp
+  build supports runtime `hostfwd_add` (it's standard, but verify on the
+  pinned qemu).
+- **No desktop change needed:** the CLI owns the qemu lifecycle on macOS;
+  the desktop app drives `avocado` and is unaffected. A future Devices/UI
+  affordance could call `vm port-forward`, but it's out of scope.

From 2f2d776b35f659c9ec9f04c2a8ef6f36a5edb37d Mon Sep 17 00:00:00 2001
From: nicksinas <nicksinas@gmail.com>
Date: Sun, 31 May 2026 20:16:31 -0500
Subject: [PATCH 05/21] deploy: make `avocado deploy` work on macOS via VM port
 forwarding

On macOS the deploy container runs inside the slirp-NAT'd avocado-vm, so
the TUF repo HTTP server it starts (:8585) was unreachable by the target
device and the in-container host-IP autodetect returned VM-internal
addresses. Bridge the device->repo path:

- qmp: add human_monitor_command + hostfwd_add/hostfwd_remove (runtime
  slirp port forwarding via the QEMU monitor), with unit tests.
- deploy: on macOS/Windows (is_docker_desktop), set AVOCADO_DEPLOY_REPO_HOST
  to this host's LAN IP and publish the repo port; on the avocado-vm
  (is_vm_routing_active) also open a `hostfwd 0.0.0.0:PORT->guest:PORT`
  for the deploy and tear it down afterward. Skip `-p` when the SDK
  container uses host networking (docker discards it and the hostfwd
  already reaches the VM-bound port). Linux (native docker) is untouched.

Validated end-to-end against a Raspberry Pi 4 on the LAN: the device
fetched the repo metadata over the forward (HTTP 200). See
docs/features/macos-deploy-port-forwarding.md.
---
 docs/features/macos-deploy-port-forwarding.md | 107 +++++++---
 src/commands/runtime/deploy.rs                | 185 +++++++++++++++++-
 src/utils/vm/qmp.rs                           | 132 +++++++++++++
 3 files changed, 393 insertions(+), 31 deletions(-)

diff --git a/docs/features/macos-deploy-port-forwarding.md b/docs/features/macos-deploy-port-forwarding.md
index acac37b8..8b58ad8d 100644
--- a/docs/features/macos-deploy-port-forwarding.md
+++ b/docs/features/macos-deploy-port-forwarding.md
@@ -1,8 +1,8 @@
 # `avocado deploy` on macOS: VM port forwarding
 
-Status: **proposal / plan**. Investigation + design for making
+Status: **implemented + validated** (avocado-vm path). Makes
 `avocado runtime deploy` work on macOS, where the build/deploy runs
-inside the slirp-NAT'd avocado-vm.
+inside the slirp-NAT'd avocado-vm. Validation notes in §12.
 
 ## 1. Symptom
 
@@ -91,14 +91,16 @@ hostfwd_remove net0 tcp:0.0.0.0:8585          # close
   `qemu.rs` at VM start. Simpler, but leaves a LAN port open for the
   VM's whole lifetime — rejected in favor of open-only-during-deploy.
 
-### 4b. Expose the container's repo port to the VM
+### 4b. Expose the container's repo port out of its VM
 
-In `deploy.rs`, when routing through the VM, publish the repo port from
-the SDK container to the VM host so the qemu forward lands on it:
+On macOS (both contexts) publish the repo port from the SDK container so
+it escapes the container netns:
 
-- Add `-p 8585:8585` to the deploy container args (or `--net=host`,
-  matching the HITL server's pattern). Then
-  `macOS:8585 → (hostfwd) → VM:8585 → (docker -p) → container:8585`.
+- Add `-p 8585:8585` to the deploy container args.
+- **avocado-vm:** `-p` publishes onto the VM's interfaces; the qemu
+  `hostfwd` (§4a) then carries `macOS:8585 → VM:8585 → container:8585`.
+- **Docker Desktop:** `-p` is forwarded straight to the macOS host by
+  Docker Desktop's vpnkit — no qemu step.
 
 ### 4c. Hand the device the right host IP
 
@@ -118,24 +120,40 @@ device ── http GET ──► macOS-LAN-IP:8585
 container ── ssh ──► device   (outbound slirp NAT)
 ```
 
-## 5. Orchestration (where the glue lives)
-
-`avocado deploy` runs on the **host**, so the host-side command wraps the
-deploy with the forward lifecycle. On macOS + VM-routing active:
-
-1. Resolve macOS LAN IP via `get_local_ip_for_remote(device_host)`.
-2. QMP `hostfwd_add net0 tcp:0.0.0.0:8585-:8585` against the VM's
-   `qmp.sock` (path from `VmPaths`).
-3. Run the deploy container with `AVOCADO_DEPLOY_REPO_HOST=<LAN IP>` and
-   `-p 8585:8585`.
-4. On completion (success or error), QMP `hostfwd_remove` to close the
-   LAN port. Best-effort; also reconcile stale forwards on next `vm
-   start`.
-
-Gate all of this on macOS + `route::resolve_mode() == Apply` (the same
-signal that says "we're talking to the avocado-vm's docker"). On Linux
-with a real local docker / `--runs-on`, deploy already works as-is —
-leave it untouched.
+## 5. Detection & orchestration (where the glue lives)
+
+Deploy fails on **both** macOS contexts, because in either one the deploy
+container runs inside a Linux VM — the avocado-vm *or* Docker Desktop's
+LinuxKit VM — so its in-container `ip route`/`ip addr` autodetect returns
+a VM-internal address the device can't reach. Linux runs the container on
+native docker with no VM in between, so it already works and must stay
+untouched.
+
+So the gate is **two-tier**, mirroring the split the HITL server already
+uses (`is_docker_desktop()` → publish vs host-net):
+
+- **`is_docker_desktop()`** (`cfg!(macos) || cfg!(windows)`) — the deploy
+  container is inside a Linux VM. Apply the fixes common to both Mac
+  contexts:
+  1. `AVOCADO_DEPLOY_REPO_HOST` = macOS LAN IP via
+     `get_local_ip_for_remote(device_host)` (overrides the broken
+     in-container autodetect). Respect an explicit user-set value.
+  2. Publish the repo port from the container (`-p <port>:<port>`).
+  - **+ `is_vm_routing_active()`** (DOCKER_HOST → avocado-vm socket):
+    *also* open a qemu `hostfwd` (`tcp:0.0.0.0:<port>-:<port>`) via QMP
+    against the VM's `qmp.sock` (`VmPaths`), because raw slirp doesn't
+    auto-expose the published port to the host LAN. Removed on completion
+    (success or error); reconcile stale forwards on next `vm start`.
+  - **else (Docker Desktop)**: no qemu step — Docker Desktop's `-p`
+    already forwards the container port to the macOS host (vpnkit).
+- **Linux** — `is_docker_desktop()` false → skip everything; current
+  behavior preserved (works today, no VM, native docker).
+
+Key correction over an earlier draft: the discriminator is
+`is_docker_desktop()`, **not** `is_vm_routing_active()` alone — the
+Docker-Desktop-on-Mac case is broken too. The qemu `hostfwd` is the
+*only* avocado-vm-specific piece; the LAN-IP injection + port publish are
+shared by both Mac contexts.
 
 ## 6. Scenarios
 
@@ -188,3 +206,40 @@ leave it untouched.
 - **No desktop change needed:** the CLI owns the qemu lifecycle on macOS;
   the desktop app drives `avocado` and is unaffected. A future Devices/UI
   affordance could call `vm port-forward`, but it's out of scope.
+
+## 12. Validation (2026-06-01)
+
+Validated end-to-end on the **avocado-vm path** deploying to a real
+Raspberry Pi 4 on the LAN:
+
+- The repo was served at the **macOS host LAN IP** (`AVOCADO_DEPLOY_REPO_HOST`
+  injection working), and the device successfully fetched
+  `GET /metadata/timestamp.json → 200` through the qemu `hostfwd` → VM →
+  container. The device→repo reachability that was previously impossible
+  on macOS now works.
+- `qemu hostfwd_add` is accepted by the pinned qemu (open question §9
+  resolved).
+
+Findings folded back into the implementation:
+
+- **Host-networking SDK containers:** when the SDK container runs
+  `--network=host` (e.g. projects that set it), docker discards the `-p`
+  publish with a "Published ports are discarded when using host network
+  mode" warning — and it's unnecessary there, since the container already
+  shares the VM's `:8585` that the `hostfwd` targets. The shim now skips
+  `-p` when the container args include `--network=host` or `--net=host`.
+
+Out of scope / separate concerns surfaced during testing:
+
+- **Device trust:** sideload deploy then fails at the device with
+  `Signature verification failed … got 0, need 1` unless the device's
+  installed TUF root matches the project's signing key — i.e. the device
+  must be provisioned/flashed from an image built with the same
+  `signing-keys`. This is a provisioning/trust matter, independent of the
+  port-forwarding fix.
+- **Docker Desktop path** (`--no-vm-auto-start`): the LAN-IP + `-p` half
+  applies, but it's **not yet validated**, and a project using
+  `--network=host` won't expose the port to macOS under Docker Desktop
+  (host net there is the LinuxKit VM, not the host) — needs the project
+  to use bridge networking, plus confirmation that Docker Desktop's `-p`
+  binds a LAN-reachable address.
diff --git a/src/commands/runtime/deploy.rs b/src/commands/runtime/deploy.rs
index b234c75c..f3c188fb 100644
--- a/src/commands/runtime/deploy.rs
+++ b/src/commands/runtime/deploy.rs
@@ -1,6 +1,6 @@
 use crate::utils::{
     config::{ComposedConfig, Config},
-    container::{RunConfig, SdkContainer},
+    container::{is_docker_desktop, is_vm_routing_active, RunConfig, SdkContainer},
     lockfile::LockFile,
     output::{print_info, print_success, print_warning, OutputLevel},
     output_format::{emit_json_event, is_json_output_active},
@@ -472,6 +472,36 @@ impl RuntimeDeployCommand {
             env_vars.insert("AVOCADO_DEPLOY_REPO_PORT".to_string(), repo_port);
         }
 
+        // On macOS/Windows the deploy container runs inside a Linux VM, so
+        // the TUF repo server it starts is unreachable by the device unless
+        // we hand the device this host's LAN IP, publish the repo port out
+        // of the container, and (avocado-vm only) open a qemu slirp forward.
+        // No-op on Linux (native docker), where deploy already works.
+        let repo_port: u16 = std::env::var("AVOCADO_DEPLOY_REPO_PORT")
+            .ok()
+            .and_then(|s| s.parse().ok())
+            .unwrap_or(DEFAULT_DEPLOY_REPO_PORT);
+        let mut container_args = config.merge_sdk_container_args(self.container_args.as_ref());
+        // If the SDK container runs with host networking, the published-port
+        // (`-p`) trick is both unnecessary and discarded by docker, so the
+        // shim skips it (see prepare_mac_deploy_net).
+        let host_net = container_args
+            .as_deref()
+            .map(|args| {
+                args.iter()
+                    .any(|a| a == "--network=host" || a == "--net=host")
+            })
+            .unwrap_or(false);
+        let MacDeployNet {
+            env: mac_env,
+            container_args: mac_args,
+            forward,
+        } = prepare_mac_deploy_net(&self.device, repo_port, host_net, self.verbose).await;
+        env_vars.extend(mac_env);
+        if !mac_args.is_empty() {
+            container_args.get_or_insert_with(Vec::new).extend(mac_args);
+        }
+
         let run_config = RunConfig {
             container_image: container_image.to_string(),
             target: target_arch.clone(),
@@ -480,16 +510,20 @@ impl RuntimeDeployCommand {
             source_environment: true,
             interactive: false,
             env_vars: Some(env_vars),
-            container_args: config.merge_sdk_container_args(self.container_args.as_ref()),
+            container_args,
             dnf_args: self.dnf_args.clone(),
             sdk_arch: self.sdk_arch.clone(),
             ..Default::default()
         };
-        let deploy_result = match container_helper
+        let run_outcome = container_helper
             .run_in_container(run_config)
             .await
-            .context("Failed to deploy runtime")
-        {
+            .context("Failed to deploy runtime");
+        // Tear down the VM port forward (best-effort) regardless of outcome.
+        if let Some(fwd) = forward {
+            fwd.close().await;
+        }
+        let deploy_result = match run_outcome {
             Ok(r) => r,
             Err(e) => {
                 let _ = std::fs::remove_dir_all(&staging_dir);
@@ -740,6 +774,147 @@ fi
     }
 }
 
+/// Host-side networking shim for `avocado deploy` on macOS/Windows, where
+/// the deploy container runs inside a Linux VM (avocado-vm or Docker
+/// Desktop). The TUF repo HTTP server it starts is otherwise trapped in
+/// the container netns, and the script's in-container host-IP autodetect
+/// returns a VM-internal address the device can't reach. No-op on Linux.
+/// See docs/features/macos-deploy-port-forwarding.md.
+struct MacDeployNet {
+    /// Extra env for the deploy container (AVOCADO_DEPLOY_REPO_HOST).
+    env: HashMap<String, String>,
+    /// Extra `docker run` args (publish the repo port: `-p <port>:<port>`).
+    container_args: Vec<String>,
+    /// A qemu slirp forward to tear down afterward (avocado-vm only).
+    forward: Option<OpenForward>,
+}
+
+/// A qemu slirp host-forward opened for the duration of one deploy. Teardown
+/// is async, so it's closed explicitly by the caller rather than via `Drop`.
+struct OpenForward {
+    qmp_socket: std::path::PathBuf,
+    host_port: u16,
+}
+
+impl OpenForward {
+    async fn close(self) {
+        if let Ok(mut c) = crate::utils::vm::qmp::QmpClient::connect(&self.qmp_socket).await {
+            let _ = c.hostfwd_remove("net0", "0.0.0.0", self.host_port).await;
+        }
+    }
+}
+
+/// Build the macOS/Windows deploy networking shim. Returns an empty (no-op)
+/// value on Linux so the native-docker path is left exactly as-is.
+async fn prepare_mac_deploy_net(
+    device: &str,
+    port: u16,
+    host_net: bool,
+    verbose: bool,
+) -> MacDeployNet {
+    let mut net = MacDeployNet {
+        env: HashMap::new(),
+        container_args: Vec::new(),
+        forward: None,
+    };
+
+    // Linux runs the container on native docker — no VM in between, deploy
+    // already works. Leave it untouched.
+    if !is_docker_desktop() {
+        return net;
+    }
+
+    // Publish the repo port out of the container's netns — but only when it
+    // has its own. With `--network=host` the container already shares the
+    // VM's network (so the qemu hostfwd below reaches `:{port}` directly),
+    // and docker discards `-p` with a "Published ports are discarded when
+    // using host network mode" warning, so skip it.
+    if !host_net {
+        net.container_args.push("-p".to_string());
+        net.container_args.push(format!("{port}:{port}"));
+    }
+
+    // Both Mac contexts: tell the device the host LAN IP it can reach us on,
+    // overriding the broken in-container autodetect. Respect a user override.
+    if std::env::var("AVOCADO_DEPLOY_REPO_HOST").is_err() {
+        let host = DeviceSpec::parse(device)
+            .map(|s| s.host)
+            .unwrap_or_default();
+        match crate::utils::remote::get_local_ip_for_remote(&host).await {
+            Ok(ip) => {
+                if verbose {
+                    print_info(
+                        &format!("deploy: repo host IP {ip} (reachable by device {host})"),
+                        OutputLevel::Normal,
+                    );
+                }
+                net.env
+                    .insert("AVOCADO_DEPLOY_REPO_HOST".to_string(), ip.to_string());
+            }
+            Err(e) => print_warning(
+                &format!(
+                    "deploy: could not determine a LAN IP the device can reach this host on \
+                     ({e}). Set AVOCADO_DEPLOY_REPO_HOST to the right address."
+                ),
+                OutputLevel::Normal,
+            ),
+        }
+    }
+
+    // avocado-vm only: open a slirp hostfwd so the LAN device can reach the
+    // published port through qemu. Docker Desktop forwards `-p` to the host
+    // itself (vpnkit), so it needs no qemu step.
+    if is_vm_routing_active() {
+        let sock = match crate::utils::vm::state::VmPaths::resolve() {
+            Ok(p) => p.qmp_socket(),
+            Err(e) => {
+                print_warning(
+                    &format!("deploy: can't locate the avocado-vm for port forwarding ({e})."),
+                    OutputLevel::Normal,
+                );
+                return net;
+            }
+        };
+        match crate::utils::vm::qmp::QmpClient::connect(&sock).await {
+            Ok(mut c) => {
+                // Clear any stale forward from a prior interrupted deploy, then add.
+                let _ = c.hostfwd_remove("net0", "0.0.0.0", port).await;
+                match c.hostfwd_add("net0", "0.0.0.0", port, port).await {
+                    Ok(()) => {
+                        if verbose {
+                            print_info(
+                                &format!(
+                                    "deploy: opened VM port forward 0.0.0.0:{port} → guest :{port}"
+                                ),
+                                OutputLevel::Normal,
+                            );
+                        }
+                        net.forward = Some(OpenForward {
+                            qmp_socket: sock,
+                            host_port: port,
+                        });
+                    }
+                    Err(e) => print_warning(
+                        &format!(
+                            "deploy: failed to open the VM port forward for {port} ({e}); the \
+                             device may be unable to fetch the repo. Is the avocado-vm running?"
+                        ),
+                        OutputLevel::Normal,
+                    ),
+                }
+            }
+            Err(e) => print_warning(
+                &format!(
+                    "deploy: couldn't reach the avocado-vm QMP socket for port forwarding ({e})."
+                ),
+                OutputLevel::Normal,
+            ),
+        }
+    }
+
+    net
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/src/utils/vm/qmp.rs b/src/utils/vm/qmp.rs
index 68d81dc6..60db50df 100644
--- a/src/utils/vm/qmp.rs
+++ b/src/utils/vm/qmp.rs
@@ -70,6 +70,64 @@ impl QmpClient {
         Ok(v.get("return").cloned().unwrap_or(Value::Null))
     }
 
+    /// Run an HMP (human monitor) command via `human-monitor-command`.
+    /// HMP errors come back *inside* the returned string, not the QMP
+    /// `error` field, so callers must inspect the returned text.
+    pub async fn human_monitor_command(&mut self, command_line: &str) -> Result<String> {
+        let v = self
+            .execute(
+                "human-monitor-command",
+                Some(serde_json::json!({ "command-line": command_line })),
+            )
+            .await?;
+        Ok(v.as_str().unwrap_or_default().to_string())
+    }
+
+    /// Add a slirp host-forwarding rule to a `user` netdev at runtime:
+    /// `host_addr:host_port` on the host → `:guest_port` in the guest.
+    /// Use `host_addr = "0.0.0.0"` to make the forward reachable from the
+    /// LAN (not just loopback). TCP only — that's all deploy needs.
+    pub async fn hostfwd_add(
+        &mut self,
+        netdev: &str,
+        host_addr: &str,
+        host_port: u16,
+        guest_port: u16,
+    ) -> Result<()> {
+        // qemu hostfwd_add form: `<netdev> tcp:<hostaddr>:<hostport>-:<guestport>`
+        let out = self
+            .human_monitor_command(&format!(
+                "hostfwd_add {netdev} tcp:{host_addr}:{host_port}-:{guest_port}"
+            ))
+            .await?;
+        let out = out.trim();
+        if !out.is_empty() {
+            bail!("hostfwd_add failed: {out}");
+        }
+        Ok(())
+    }
+
+    /// Remove a previously-added slirp host-forwarding rule. The remove
+    /// form omits the guest side: `<netdev> tcp:<hostaddr>:<hostport>`.
+    /// A "not found" reply is tolerated so cleanup is idempotent.
+    pub async fn hostfwd_remove(
+        &mut self,
+        netdev: &str,
+        host_addr: &str,
+        host_port: u16,
+    ) -> Result<()> {
+        let out = self
+            .human_monitor_command(&format!(
+                "hostfwd_remove {netdev} tcp:{host_addr}:{host_port}"
+            ))
+            .await?;
+        let out = out.trim();
+        if !out.is_empty() && !out.contains("not found") {
+            bail!("hostfwd_remove failed: {out}");
+        }
+        Ok(())
+    }
+
     /// Read lines until we see one that has a `return` or `error` key
     /// (i.e. a command response). Events are skipped.
     async fn read_until_response(&mut self, dur: Duration) -> Result<Value> {
@@ -188,4 +246,78 @@ mod tests {
         let err = client.execute("device_add", None).await.unwrap_err();
         assert!(format!("{err:#}").contains("nope"));
     }
+
+    #[tokio::test]
+    async fn hostfwd_add_ok_on_empty_return() {
+        let socket = spawn_mock(|mut rh, mut wh| async move {
+            wh.write_all(b"{\"QMP\":{\"version\":{}}}\n").await.unwrap();
+            let mut line = String::new();
+            rh.read_line(&mut line).await.unwrap();
+            wh.write_all(b"{\"return\":{}}\n").await.unwrap();
+            line.clear();
+            rh.read_line(&mut line).await.unwrap();
+            // The HMP command is carried inside human-monitor-command.
+            assert!(line.contains("human-monitor-command"));
+            assert!(line.contains("hostfwd_add net0 tcp:0.0.0.0:8585-:8585"));
+            // Success: hostfwd_add prints nothing.
+            wh.write_all(b"{\"return\":\"\"}\n").await.unwrap();
+        })
+        .await;
+
+        let mut client = QmpClient::connect(&socket).await.unwrap();
+        client
+            .hostfwd_add("net0", "0.0.0.0", 8585, 8585)
+            .await
+            .unwrap();
+    }
+
+    #[tokio::test]
+    async fn hostfwd_add_errors_on_nonempty_return() {
+        let socket = spawn_mock(|mut rh, mut wh| async move {
+            wh.write_all(b"{\"QMP\":{\"version\":{}}}\n").await.unwrap();
+            let mut line = String::new();
+            rh.read_line(&mut line).await.unwrap();
+            wh.write_all(b"{\"return\":{}}\n").await.unwrap();
+            line.clear();
+            rh.read_line(&mut line).await.unwrap();
+            // HMP errors come back in the return string, not the QMP error field.
+            wh.write_all(b"{\"return\":\"Could not set up host forwarding rule\\n\"}\n")
+                .await
+                .unwrap();
+        })
+        .await;
+
+        let mut client = QmpClient::connect(&socket).await.unwrap();
+        let err = client
+            .hostfwd_add("net0", "0.0.0.0", 8585, 8585)
+            .await
+            .unwrap_err();
+        assert!(format!("{err:#}").contains("hostfwd_add failed"));
+    }
+
+    #[tokio::test]
+    async fn hostfwd_remove_tolerates_not_found() {
+        let socket = spawn_mock(|mut rh, mut wh| async move {
+            wh.write_all(b"{\"QMP\":{\"version\":{}}}\n").await.unwrap();
+            let mut line = String::new();
+            rh.read_line(&mut line).await.unwrap();
+            wh.write_all(b"{\"return\":{}}\n").await.unwrap();
+            line.clear();
+            rh.read_line(&mut line).await.unwrap();
+            assert!(line.contains("hostfwd_remove net0 tcp:0.0.0.0:8585"));
+            // A stale-cleanup "not found" reply must not be treated as an error.
+            wh.write_all(
+                b"{\"return\":\"host forwarding rule for tcp:0.0.0.0:8585 not found\\n\"}\n",
+            )
+            .await
+            .unwrap();
+        })
+        .await;
+
+        let mut client = QmpClient::connect(&socket).await.unwrap();
+        client
+            .hostfwd_remove("net0", "0.0.0.0", 8585)
+            .await
+            .unwrap();
+    }
 }

From 534e57485e39fe32702073253078b2c34abe364b Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Sun, 31 May 2026 00:30:20 -0400
Subject: [PATCH 06/21] repo TLS: support custom CA + insecure mode across all
 dnf phases

Lets the SDK trust a self-signed / private-CA package endpoint (e.g. an internal Pulp behind
package-ca). Centralized so it covers EVERY dnf invocation - sdk bootstrap, sdk packages, ext,
runtime, rootfs, initramfs, and the per-module 'dnf' subcommands, host AND target repo confs:

- config: distro.repo.ca + distro.repo.tls_verify; resolvers get_repo_ca()/get_repo_insecure()
  (env AVOCADO_REPO_CA / AVOCADO_REPO_INSECURE win over config). promote_repo_tls_env() pushes
  config values to the process env at load so the container env-builders pick them up uniformly.
- container: inject_repo_tls_env() adds AVOCADO_REPO_CA_B64 (base64 of the CA file) +
  AVOCADO_REPO_INSECURE to the container env at the env-builder chokepoints. REPO_TLS_SETUP_SNIPPET
  appends the CA to the SDK trust bundle (which SSL_CERT_FILE/CURL_CA_BUNDLE and every explicit
  sslcacert point at) and, for insecure, adds --setopt=sslverify=0 to DNF_SDK_HOST (base of every
  dnf call). Emitted by both entrypoint generators.
- sdk bootstrap: snippet appended to the bootstrap command so the FIRST dnf (target pkg from
  sdk/all) is covered too.
---
 src/commands/sdk/install.rs |  8 ++++-
 src/utils/config.rs         | 59 +++++++++++++++++++++++++++++++++++
 src/utils/container.rs      | 61 +++++++++++++++++++++++++++++++++++++
 3 files changed, 127 insertions(+), 1 deletion(-)

diff --git a/src/commands/sdk/install.rs b/src/commands/sdk/install.rs
index 71687ef4..3fba6f25 100644
--- a/src/commands/sdk/install.rs
+++ b/src/commands/sdk/install.rs
@@ -1599,7 +1599,13 @@ fi
         let run_config = RunConfig {
             container_image: container_image.to_string(),
             target: target.to_string(),
-            command: env_setup_command.to_string(),
+            // Append the shared repo-TLS setup so the bootstrap's FIRST dnf (installing the
+            // target package from the sdk/all repo) trusts a custom CA / honors insecure.
+            command: format!(
+                "{}{}",
+                env_setup_command,
+                crate::utils::container::REPO_TLS_SETUP_SNIPPET
+            ),
             verbose: self.verbose,
             source_environment: true,
             interactive: false,
diff --git a/src/utils/config.rs b/src/utils/config.rs
index 0e4e3fe1..e6d8944a 100644
--- a/src/utils/config.rs
+++ b/src/utils/config.rs
@@ -860,6 +860,12 @@ pub struct DistroRepoConfig {
     pub url: Option<String>,
     /// Explicit releasever override. When not set, derived from distro.release/channel.
     pub releasever: Option<String>,
+    /// Path to a CA cert (PEM) to trust for the repo endpoint (self-signed / private CA).
+    /// Env override: AVOCADO_REPO_CA. Appended to the SDK trust bundle for all dnf phases.
+    pub ca: Option<String>,
+    /// TLS verification toggle for the repo endpoint. Set false to skip verification
+    /// (testing only). Env override: AVOCADO_REPO_INSECURE=1. Default: verify.
+    pub tls_verify: Option<bool>,
 }
 
 /// Reference to a Docker image for priming on the var partition at build time.
@@ -1659,6 +1665,10 @@ impl Config {
 
         config.validate_cli_requirement()?;
 
+        // Promote config-file repo TLS settings (distro.repo.ca / tls_verify) to the process
+        // env so the container env-builders pick them up the same as the env-var form.
+        config.promote_repo_tls_env();
+
         Ok(ComposedConfig {
             config,
             merged_value: main_config,
@@ -3290,6 +3300,10 @@ impl Config {
         config.synthesize_implicit_default_runtime();
         config.validate_runtime_refs()?;
 
+        // Promote config-file repo TLS settings (distro.repo.ca / tls_verify) to the process
+        // env so the container env-builders pick them up the same as the env-var form.
+        config.promote_repo_tls_env();
+
         Ok(config)
     }
 
@@ -3580,6 +3594,51 @@ impl Config {
         self.sdk.as_ref()?.repo_url.as_ref().cloned()
     }
 
+    /// Path to a CA cert to trust for the repo endpoint.
+    /// Priority: AVOCADO_REPO_CA (env) > distro.repo.ca (config).
+    pub fn get_repo_ca(&self) -> Option<String> {
+        if let Ok(p) = env::var("AVOCADO_REPO_CA") {
+            if !p.is_empty() {
+                return Some(p);
+            }
+        }
+        self.distro
+            .as_ref()
+            .and_then(|d| d.repo.as_ref())
+            .and_then(|r| r.ca.as_ref())
+            .cloned()
+    }
+
+    /// Whether to skip TLS verification for the repo endpoint (testing only).
+    /// Priority: AVOCADO_REPO_INSECURE (env; wins whenever set) > distro.repo.tls_verify == false.
+    pub fn get_repo_insecure(&self) -> bool {
+        if let Ok(v) = env::var("AVOCADO_REPO_INSECURE") {
+            return matches!(v.to_ascii_lowercase().as_str(), "1" | "true" | "yes");
+        }
+        self.distro
+            .as_ref()
+            .and_then(|d| d.repo.as_ref())
+            .and_then(|r| r.tls_verify)
+            .map(|verify| !verify)
+            .unwrap_or(false)
+    }
+
+    /// Promote config-file repo TLS settings to the process env so the container
+    /// env-builders (which don't see the avocado Config) pick them up uniformly with
+    /// the env-var form. Env values already set always win; we never override them.
+    pub fn promote_repo_tls_env(&self) {
+        // When the env var isn't already set, get_repo_ca()/get_repo_insecure() return the
+        // config-file value; promote it so the container env-builders see it uniformly.
+        if env::var_os("AVOCADO_REPO_CA").is_none() {
+            if let Some(ca) = self.get_repo_ca() {
+                env::set_var("AVOCADO_REPO_CA", ca);
+            }
+        }
+        if env::var_os("AVOCADO_REPO_INSECURE").is_none() && self.get_repo_insecure() {
+            env::set_var("AVOCADO_REPO_INSECURE", "1");
+        }
+    }
+
     /// Get the releasever for DNF --releasever.
     /// Priority: AVOCADO_RELEASEVER > AVOCADO_SDK_REPO_RELEASE (legacy)
     ///         > distro.repo.releasever > sdk.repo_release (legacy)
diff --git a/src/utils/container.rs b/src/utils/container.rs
index 439b963b..2d82a502 100644
--- a/src/utils/container.rs
+++ b/src/utils/container.rs
@@ -353,6 +353,55 @@ fn add_security_opts(container_cmd: &mut Vec<String>) {
     }
 }
 
+/// Shared bash that wires a custom repo CA / insecure TLS into EVERY dnf phase. Centralized:
+/// every dnf call trusts `$SSL_CERT_FILE` / `$CURL_CA_BUNDLE` (the SDK bundle) — and explicit
+/// `--setopt=sslcacert=$SSL_CERT_FILE` sites point at the same file — so appending the CA to
+/// that bundle covers sdk/ext/runtime/rootfs/initramfs, the `dnf` subcommands, and host AND
+/// target repo confs. Likewise every call starts with `$DNF_SDK_HOST`, so adding
+/// `--setopt=sslverify=0` there disables verification everywhere when insecure.
+/// Must run AFTER `AVOCADO_SDK_PREFIX`, `SSL_CERT_FILE`, and `DNF_SDK_HOST` are defined, and
+/// before the first dnf call. Driven by env: AVOCADO_REPO_CA_B64 (base64 PEM) / AVOCADO_REPO_INSECURE.
+pub const REPO_TLS_SETUP_SNIPPET: &str = r##"
+# --- custom repo CA / insecure TLS (AVOCADO_REPO_CA / AVOCADO_REPO_INSECURE) ---
+if [ -n "${AVOCADO_REPO_CA_B64:-}" ]; then
+    _avocado_ca_bundle="${AVOCADO_SDK_PREFIX}/etc/ssl/certs/ca-certificates.crt"
+    mkdir -p "$(dirname "$_avocado_ca_bundle")"
+    if ! grep -q "BEGIN AVOCADO_REPO_CA" "$_avocado_ca_bundle" 2>/dev/null; then
+        { echo "# BEGIN AVOCADO_REPO_CA"; printf '%s' "$AVOCADO_REPO_CA_B64" | base64 -d; echo; echo "# END AVOCADO_REPO_CA"; } >> "$_avocado_ca_bundle"
+        echo "[INFO] Added custom repo CA to the SDK trust bundle."
+    fi
+fi
+if [ "${AVOCADO_REPO_INSECURE:-}" = "1" ]; then
+    export DNF_SDK_HOST="${DNF_SDK_HOST} --setopt=sslverify=0"
+    echo "[WARN] AVOCADO_REPO_INSECURE=1: TLS verification DISABLED for all dnf operations."
+fi
+"##;
+
+/// Inject the repo-TLS transport env (read from the process env) into a container's env map.
+/// `AVOCADO_REPO_CA` is a host path; we read it and pass the PEM as base64 (`AVOCADO_REPO_CA_B64`)
+/// so the multi-line cert survives as a single env value. Insecure is a passthrough flag.
+pub fn inject_repo_tls_env(env_vars: &mut std::collections::HashMap<String, String>) {
+    if let Ok(path) = std::env::var("AVOCADO_REPO_CA") {
+        if !path.is_empty() {
+            match std::fs::read(&path) {
+                Ok(bytes) => {
+                    use base64::Engine;
+                    let b64 = base64::engine::general_purpose::STANDARD.encode(bytes);
+                    env_vars.insert("AVOCADO_REPO_CA_B64".to_string(), b64);
+                }
+                Err(e) => {
+                    eprintln!("[WARN] AVOCADO_REPO_CA={path} could not be read ({e}); repo CA not applied.");
+                }
+            }
+        }
+    }
+    if let Ok(v) = std::env::var("AVOCADO_REPO_INSECURE") {
+        if matches!(v.to_ascii_lowercase().as_str(), "1" | "true" | "yes") {
+            env_vars.insert("AVOCADO_REPO_INSECURE".to_string(), "1".to_string());
+        }
+    }
+}
+
 /// Configuration for running commands in containers
 #[derive(Debug, Clone)]
 pub struct RunConfig {
@@ -618,6 +667,8 @@ impl SdkContainer {
         if let Some(release) = &config.repo_release {
             env_vars.insert("AVOCADO_SDK_REPO_RELEASE".to_string(), release.clone());
         }
+        // Custom repo CA / insecure TLS, applied across all dnf phases (see REPO_TLS_SETUP_SNIPPET).
+        inject_repo_tls_env(&mut env_vars);
         if let Some(dnf_args) = &config.dnf_args {
             env_vars.insert("AVOCADO_DNF_ARGS".to_string(), dnf_args.join(" "));
         }
@@ -793,6 +844,8 @@ impl SdkContainer {
         if let Some(release) = &config.repo_release {
             env_vars.insert("AVOCADO_SDK_REPO_RELEASE".to_string(), release.clone());
         }
+        // Custom repo CA / insecure TLS, applied across all dnf phases (see REPO_TLS_SETUP_SNIPPET).
+        inject_repo_tls_env(&mut env_vars);
         if let Some(dnf_args) = &config.dnf_args {
             env_vars.insert("AVOCADO_DNF_ARGS".to_string(), dnf_args.join(" "));
         }
@@ -1172,6 +1225,8 @@ impl SdkContainer {
         if let Some(release) = &config.repo_release {
             env_vars.insert("AVOCADO_SDK_REPO_RELEASE".to_string(), release.clone());
         }
+        // Custom repo CA / insecure TLS, applied across all dnf phases (see REPO_TLS_SETUP_SNIPPET).
+        inject_repo_tls_env(&mut env_vars);
         if let Some(dnf_args) = &config.dnf_args {
             env_vars.insert("AVOCADO_DNF_ARGS".to_string(), dnf_args.join(" "));
         }
@@ -1416,6 +1471,8 @@ impl SdkContainer {
         if let Some(release) = &config.repo_release {
             env_vars.insert("AVOCADO_SDK_REPO_RELEASE".to_string(), release.clone());
         }
+        // Custom repo CA / insecure TLS, applied across all dnf phases (see REPO_TLS_SETUP_SNIPPET).
+        inject_repo_tls_env(&mut env_vars);
         if let Some(dnf_args) = &config.dnf_args {
             env_vars.insert("AVOCADO_DNF_ARGS".to_string(), dnf_args.join(" "));
         }
@@ -2187,6 +2244,8 @@ if [ -f "${AVOCADO_SDK_PREFIX}/etc/ssl/certs/ca-certificates.crt" ]; then
 fi
 "#,
             );
+            // Custom repo CA / insecure TLS, applied across all dnf phases.
+            script.push_str(REPO_TLS_SETUP_SNIPPET);
         }
 
         script
@@ -2476,6 +2535,8 @@ if [ -f "${AVOCADO_SDK_PREFIX}/etc/ssl/certs/ca-certificates.crt" ]; then
 fi
 "#,
             );
+            // Custom repo CA / insecure TLS, applied across all dnf phases.
+            script.push_str(REPO_TLS_SETUP_SNIPPET);
         }
 
         script

From 6dc760f4ea8ee2b27be80346d45f72e3519fac01 Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Sun, 31 May 2026 17:58:43 -0400
Subject: [PATCH 07/21] feat(snapshots): reproducible channel snapshot pinning
 in the lock file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pin each target to an immutable point-in-time snapshot of its feed channel
so a clean + rebuild reproduces exactly, even after the live channel head
advances or evicts the NEVRAs the lock file references.

Mechanism: every dnf baseurl is ${repo_url}/$releasever/... with releasever
= {release}/{channel}; pinning injects one segment -> {release}/{channel}/
snapshots/<id>, exposed via AVOCADO_RELEASEVER (which get_releasever() already
honors first), so all sysroots freeze together with no per-call-site plumbing.

- Lock file v7: per-target `repo-snapshot` (RepoSnapshot). Additive — v6 reads
  as v7 with no pin (= track head), fully backward-compatible. merge adopts a
  disk pin when the writer has none; unlock (clear_all) drops it.
- utils/snapshot.rs: resolve-and-apply runs once per command — reuse a matching
  pin, auto-pin to the channel's latest snapshot on first fetch, pre-flight a
  pinned snapshot and emit an actionable "run avocado update" error if it was
  GC'd, warn + track head on a stale release/channel, degrade to head if the
  feed serves no snapshots (snapshots-latest.json 404s). Honors repo CA / TLS.
- Wired into install (umbrella) + fetch + sdk/rootfs/runtime/ext/initramfs
  install; fetch stays the reproducible metadata cache.
- avocado update: Cargo-style move-forward — advance the snapshot pin to newest
  and clear package/kernel pins so the next install re-resolves + re-locks.
- Tests: v6->v7 migration, round-trip, clear-on-unlock, merge-adopts-disk-pin,
  plus pure releasever/pin-status/url transforms.
---
 src/commands/ext/install.rs       |   6 +-
 src/commands/fetch.rs             |   6 +
 src/commands/initramfs/install.rs |   2 +
 src/commands/install.rs           |   6 +
 src/commands/mod.rs               |   1 +
 src/commands/rootfs/install.rs    |   2 +
 src/commands/runtime/install.rs   |   4 +
 src/commands/sdk/install.rs       |   5 +
 src/commands/update.rs            | 122 +++++++++
 src/main.rs                       |  23 ++
 src/utils/lockfile.rs             | 186 +++++++++++++-
 src/utils/mod.rs                  |   1 +
 src/utils/snapshot.rs             | 394 ++++++++++++++++++++++++++++++
 13 files changed, 753 insertions(+), 5 deletions(-)
 create mode 100644 src/commands/update.rs
 create mode 100644 src/utils/snapshot.rs

diff --git a/src/commands/ext/install.rs b/src/commands/ext/install.rs
index e025ba31..426b7e52 100644
--- a/src/commands/ext/install.rs
+++ b/src/commands/ext/install.rs
@@ -174,10 +174,14 @@ impl ExtInstallCommand {
         // Merge container args from config and CLI (similar to SDK commands)
         let merged_container_args = config.merge_sdk_container_args(self.container_args.as_ref());
 
+        // Resolve target and apply the reproducible snapshot pin before reading
+        // repo_release, so it reflects the pinned channel snapshot.
+        let target = resolve_target_required(self.target.as_deref(), config)?;
+        crate::utils::snapshot::resolve_and_apply_for(config, &self.config_path, &target).await?;
+
         // Get repo_url and repo_release from config
         let repo_url = config.get_sdk_repo_url();
         let repo_release = config.get_sdk_repo_release();
-        let target = resolve_target_required(self.target.as_deref(), config)?;
 
         // Determine which extensions to install (with their locations)
         let extensions_to_install: Vec<(String, ExtensionLocation)> =
diff --git a/src/commands/fetch.rs b/src/commands/fetch.rs
index 1190c4ce..915a1fba 100644
--- a/src/commands/fetch.rs
+++ b/src/commands/fetch.rs
@@ -84,6 +84,12 @@ impl FetchCommand {
         // Resolve target architecture
         let target_arch = resolve_target_required(self.target.as_deref(), config)?;
 
+        // Resolve & apply the reproducible snapshot pin before refreshing
+        // metadata, so `fetch` caches the pinned snapshot's repodata (and
+        // auto-pins on first run) rather than the advancing live head.
+        crate::utils::snapshot::resolve_and_apply_for(config, &self.config_path, &target_arch)
+            .await?;
+
         // Get container configuration from interpolated config
         let container_image = config
             .get_sdk_image()
diff --git a/src/commands/initramfs/install.rs b/src/commands/initramfs/install.rs
index d34ec233..57726a1e 100644
--- a/src/commands/initramfs/install.rs
+++ b/src/commands/initramfs/install.rs
@@ -87,6 +87,8 @@ impl InitramfsInstallCommand {
 
         let config = &composed.config;
         let target = validate_and_log_target(self.target.as_deref(), config)?;
+        // Apply the reproducible snapshot pin before any repo_release is read.
+        crate::utils::snapshot::resolve_and_apply_for(config, &self.config_path, &target).await?;
         let merged_container_args = config.merge_sdk_container_args(self.container_args.as_ref());
         let container_image = config.get_sdk_image().ok_or_else(|| {
             anyhow::anyhow!("No container image specified in config under 'sdk.image'")
diff --git a/src/commands/install.rs b/src/commands/install.rs
index cf757964..42e9c1de 100644
--- a/src/commands/install.rs
+++ b/src/commands/install.rs
@@ -121,6 +121,12 @@ impl InstallCommand {
         let _parsed = &composed.merged_value;
         let _target = validate_and_log_target(self.target.as_deref(), config)?;
 
+        // Resolve & apply the reproducible snapshot pin once, before any sysroot
+        // fetch. This exposes AVOCADO_RELEASEVER so every sub-install (SDK,
+        // rootfs, initramfs, extensions, runtimes) freezes to the same immutable
+        // channel snapshot — auto-pinning on first fetch, reusing the pin after.
+        crate::utils::snapshot::resolve_and_apply_for(config, &self.config_path, &_target).await?;
+
         // Compute target runtimes early so we can show a useful start message.
         let initial_runtimes = self.find_target_relevant_runtimes(config, _parsed, &_target)?;
         if initial_runtimes.len() == 1 {
diff --git a/src/commands/mod.rs b/src/commands/mod.rs
index 94a558a4..b9f4d3d2 100644
--- a/src/commands/mod.rs
+++ b/src/commands/mod.rs
@@ -20,5 +20,6 @@ pub mod sdk;
 pub mod sign;
 pub mod signing_keys;
 pub mod unlock;
+pub mod update;
 pub mod upgrade;
 pub mod vm;
diff --git a/src/commands/rootfs/install.rs b/src/commands/rootfs/install.rs
index 5095a5dc..cb21bdbf 100644
--- a/src/commands/rootfs/install.rs
+++ b/src/commands/rootfs/install.rs
@@ -946,6 +946,8 @@ impl RootfsInstallCommand {
 
         let config = &composed.config;
         let target = validate_and_log_target(self.target.as_deref(), config)?;
+        // Apply the reproducible snapshot pin before any repo_release is read.
+        crate::utils::snapshot::resolve_and_apply_for(config, &self.config_path, &target).await?;
         let merged_container_args = config.merge_sdk_container_args(self.container_args.as_ref());
         let container_image = config.get_sdk_image().ok_or_else(|| {
             anyhow::anyhow!("No container image specified in config under 'sdk.image'")
diff --git a/src/commands/runtime/install.rs b/src/commands/runtime/install.rs
index 38ca4cf9..d57c36c0 100644
--- a/src/commands/runtime/install.rs
+++ b/src/commands/runtime/install.rs
@@ -122,6 +122,10 @@ impl RuntimeInstallCommand {
         // Merge container args from config and CLI (similar to SDK commands)
         let merged_container_args = config.merge_sdk_container_args(self.container_args.as_ref());
 
+        // Apply the reproducible snapshot pin before reading repo_release.
+        let target = resolve_target_required(self.target.as_deref(), config)?;
+        crate::utils::snapshot::resolve_and_apply_for(config, &self.config_path, &target).await?;
+
         // Get repo_url and repo_release from config
         let repo_url = config.get_sdk_repo_url();
         let repo_release = config.get_sdk_repo_release();
diff --git a/src/commands/sdk/install.rs b/src/commands/sdk/install.rs
index 3fba6f25..1b16e7f5 100644
--- a/src/commands/sdk/install.rs
+++ b/src/commands/sdk/install.rs
@@ -153,6 +153,11 @@ impl SdkInstallCommand {
         let config = &composed.config;
         let target = validate_and_log_target(self.target.as_deref(), config)?;
 
+        // Apply the reproducible snapshot pin (auto-pin on first fetch) before
+        // any repo_release is read, so the SDK + target sysroots fetch against
+        // the frozen channel snapshot.
+        crate::utils::snapshot::resolve_and_apply_for(config, &self.config_path, &target).await?;
+
         // Merge container args from config with CLI args
         let merged_container_args = config.merge_sdk_container_args(self.container_args.as_ref());
 
diff --git a/src/commands/update.rs b/src/commands/update.rs
new file mode 100644
index 00000000..f3ee096f
--- /dev/null
+++ b/src/commands/update.rs
@@ -0,0 +1,122 @@
+use anyhow::{Context, Result};
+use std::path::Path;
+
+use crate::utils::{
+    config::Config,
+    lockfile::LockFile,
+    output::{print_info, print_success, OutputLevel},
+    snapshot,
+    target::resolve_target_required,
+};
+
+/// `avocado update` — move a target forward to the latest feed state.
+///
+/// Cargo-style: re-resolves the lock against the newest published snapshot.
+/// Concretely it (1) advances the target's snapshot pin to the channel's
+/// current `latest` snapshot, and (2) clears the package + kernel version pins
+/// so the next `avocado install`/`fetch` re-selects the latest versions within
+/// that new snapshot and re-locks them.
+///
+/// Everyday `install`/`fetch` stay reproducible (they reuse the pins); this is
+/// the deliberate, explicit "move forward" action.
+pub struct UpdateCommand {
+    config_path: String,
+    target: Option<String>,
+    verbose: bool,
+}
+
+impl UpdateCommand {
+    pub fn new(config_path: String, target: Option<String>, verbose: bool) -> Self {
+        Self {
+            config_path,
+            target,
+            verbose,
+        }
+    }
+
+    pub async fn execute(&self) -> Result<()> {
+        let config = Config::load(&self.config_path)
+            .with_context(|| format!("Failed to load config from {}", self.config_path))?;
+        let target = resolve_target_required(self.target.as_deref(), &config)?;
+
+        let src_dir = config
+            .get_resolved_src_dir(&self.config_path)
+            .unwrap_or_else(|| {
+                Path::new(&self.config_path)
+                    .parent()
+                    .unwrap_or(Path::new("."))
+                    .to_path_buf()
+            });
+
+        let mut lock_file = LockFile::load(&src_dir)
+            .with_context(|| format!("Failed to load lock file from {}", src_dir.display()))?;
+        let old_snapshot = lock_file
+            .get_repo_snapshot(&target)
+            .map(|s| s.snapshot.clone());
+
+        // Resolve the channel's current latest snapshot (no env/lock side effects).
+        let latest = snapshot::resolve_latest(&config, &target).await?;
+
+        // Re-resolve packages to latest by dropping the existing package +
+        // kernel pins (and the old snapshot pin); the next build re-selects and
+        // re-locks within the new snapshot.
+        lock_file.clear_all(&target);
+
+        match latest {
+            Some(new_pin) => {
+                let new_id = new_pin.snapshot.clone();
+                let feed = format!("{}/{}", new_pin.release, new_pin.channel);
+                lock_file.set_repo_snapshot(&target, new_pin);
+                lock_file
+                    .save_replacing(&src_dir)
+                    .with_context(|| "Failed to save lock file")?;
+
+                match old_snapshot {
+                    Some(old) if old == new_id => print_info(
+                        &format!("Already on the latest {feed} snapshot '{new_id}'."),
+                        OutputLevel::Normal,
+                    ),
+                    Some(old) => print_info(
+                        &format!("Advanced {feed} snapshot '{old}' -> '{new_id}' for '{target}'."),
+                        OutputLevel::Normal,
+                    ),
+                    None => print_info(
+                        &format!("Pinned {feed} to latest snapshot '{new_id}' for '{target}'."),
+                        OutputLevel::Normal,
+                    ),
+                }
+                print_success(
+                    &format!(
+                        "Updated '{target}'. Run 'avocado install' to resolve and lock the latest \
+                         package versions within snapshot '{new_id}'."
+                    ),
+                    OutputLevel::Normal,
+                );
+            }
+            None => {
+                // No snapshot to advance to (feed serves no snapshots, or
+                // releasever is manually overridden). `clear_all` above already
+                // dropped the package/kernel pins and any old snapshot pin, so
+                // saving here makes the next build resolve the latest head.
+                lock_file
+                    .save_replacing(&src_dir)
+                    .with_context(|| "Failed to save lock file")?;
+                if self.verbose {
+                    print_info(
+                        "Feed serves no snapshots (or releasever is overridden); no snapshot pin to advance.",
+                        OutputLevel::Normal,
+                    );
+                }
+                print_success(
+                    &format!(
+                        "Cleared package pins for '{target}'. Run 'avocado install' to resolve and \
+                         lock the latest available versions."
+                    ),
+                    OutputLevel::Normal,
+                );
+            }
+        }
+
+        Ok(())
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 745bb5f8..da892118 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -73,6 +73,7 @@ use commands::signing_keys::{
     SigningKeysCreateCommand, SigningKeysListCommand, SigningKeysRemoveCommand,
 };
 use commands::unlock::UnlockCommand;
+use commands::update::UpdateCommand;
 use commands::upgrade::UpgradeCommand;
 
 #[derive(Parser)]
@@ -550,6 +551,19 @@ enum Commands {
         #[arg(long)]
         initramfs: bool,
     },
+    /// Move a target forward: advance to the latest feed snapshot and re-resolve
+    /// packages to their latest versions on the next install (rewrites the lock).
+    Update {
+        /// Path to avocado.yaml configuration file
+        #[arg(short = 'C', long, default_value = "avocado.yaml")]
+        config: String,
+        /// Enable verbose output
+        #[arg(short, long)]
+        verbose: bool,
+        /// Target architecture
+        #[arg(short, long)]
+        target: Option<String>,
+    },
     /// Avocado Connect platform commands (auth, upload)
     Connect {
         #[command(subcommand)]
@@ -2336,6 +2350,15 @@ async fn main() -> Result<()> {
             unlock_cmd.execute()?;
             Ok(())
         }
+        Commands::Update {
+            config,
+            verbose,
+            target,
+        } => {
+            let update_cmd = UpdateCommand::new(config, target.or(cli.target), verbose);
+            update_cmd.execute().await?;
+            Ok(())
+        }
         Commands::Runtime { command } => match command {
             RuntimeCommands::Install {
                 name,
diff --git a/src/utils/lockfile.rs b/src/utils/lockfile.rs
index 3b312b80..992550b0 100644
--- a/src/utils/lockfile.rs
+++ b/src/utils/lockfile.rs
@@ -36,7 +36,11 @@ static LOCKFILE_SAVE_GATE: Mutex<()> = Mutex::new(());
 ///            `RuntimeLock` carrying both `packages` and per-runtime
 ///            `extensions`. Migration from v5 wraps the old flat map as
 ///            `{ packages: <old>, extensions: {} }`.
-const LOCKFILE_VERSION: u32 = 6;
+/// Version 7: Adds per-target `repo-snapshot`, the immutable channel snapshot
+///            the target's packages were resolved against. Additive — v6
+///            lockfiles read as v7 with `repo_snapshot: None`, i.e. no pin is
+///            recorded until a later fetch or `avocado update` sets one.
+const LOCKFILE_VERSION: u32 = 7;
 
 /// Lock file name
 const LOCKFILE_NAME: &str = "lock.json";
@@ -235,6 +239,29 @@ pub type PackageVersions = HashMap<String, String>;
 /// Used for SDK (keyed by host arch) and runtimes (keyed by name)
 pub type NestedPackageVersions = HashMap<String, PackageVersions>;
 
+/// The immutable channel snapshot a target's packages were resolved against.
+///
+/// Recorded on the first fetch that resolves a snapshot (auto-pin). Subsequent
+/// fetches — including after `avocado clean` — re-resolve against this exact
+/// snapshot subtree (`{release}/{channel}/snapshots/{snapshot}`) so the build
+/// reproduces even after the live channel head advances or evicts the NEVRAs
+/// this lockfile pins. `avocado update` advances it; `avocado unlock` clears it.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct RepoSnapshot {
+    /// Distro release (feed year) the snapshot belongs to, e.g. "2026".
+    pub release: String,
+    /// Channel the snapshot belongs to, e.g. "edge".
+    pub channel: String,
+    /// Snapshot id — the immutable `snapshots/<id>` path segment.
+    pub snapshot: String,
+    /// Provenance: the base repo URL resolved against at pin time.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub repo_url: Option<String>,
+    /// Provenance: snapshot mint time (from `snapshots-latest.json`).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub created: Option<String>,
+}
+
 /// Source metadata for a fetched extension in the lock file
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ExtensionSourceLock {
@@ -404,6 +431,18 @@ pub struct TargetLocks {
     #[serde(default, skip_serializing_if = "boot_record_is_empty")]
     pub boot: BootRecord,
 
+    /// The immutable channel snapshot this target's packages were resolved
+    /// against (lockfile v7+). Auto-pinned on first fetch; reused on every
+    /// later fetch so a clean+rebuild reproduces exactly. `None` means no pin
+    /// is recorded: a migrated pre-v7 lockfile, an unlocked target, or a feed
+    /// that doesn't serve snapshots (which keeps tracking the live head).
+    #[serde(
+        default,
+        skip_serializing_if = "Option::is_none",
+        rename = "repo-snapshot"
+    )]
+    pub repo_snapshot: Option<RepoSnapshot>,
+
     /// In-memory only: sysroot sections explicitly cleared during this process
     /// run (e.g., "rootfs" / "initramfs" after a kernel-pin-change clean).
     /// Not persisted. `merge_with` skips re-inserting disk packages for any
@@ -485,9 +524,13 @@ impl LockFile {
             if lock_file.version == LOCKFILE_VERSION {
                 return Ok(lock_file);
             }
-            if lock_file.version == 3 || lock_file.version == 4 {
-                // v3 → v4 → v5 → v6: empty runtimes maps parse equivalently;
-                // new fields default-empty via serde.
+            if lock_file.version == 3 || lock_file.version == 4 || lock_file.version == 6 {
+                // v3, v4 and v6 → v7: each of these parses cleanly as the
+                // current shape — the intervening additions (kernel-versions,
+                // kernels/boot, repo-snapshot) are all `#[serde(default)]`, and
+                // v6's runtime shape is already the current one. Only the
+                // `version` field differs, bumped here. (v5's runtime map shape
+                // is incompatible, so it still falls through to migration.)
                 lock_file.version = LOCKFILE_VERSION;
                 return Ok(lock_file);
             }
@@ -908,6 +951,13 @@ impl LockFile {
             {
                 self_target.boot = other_target.boot;
             }
+            // Repo snapshot — self's pin wins; disk's pin is adopted only when
+            // self has none, so a pin recorded by another writer isn't dropped.
+            // An explicit clear (unlock, update) goes through `save_replacing`,
+            // which never merges, so a cleared pin stays cleared.
+            if self_target.repo_snapshot.is_none() {
+                self_target.repo_snapshot = other_target.repo_snapshot;
+            }
         }
 
         self
@@ -929,6 +979,20 @@ impl LockFile {
         }
     }
 
+    /// Get the target's repo snapshot pin; `None` if the target is unknown or unpinned.
+    pub fn get_repo_snapshot(&self, target: &str) -> Option<&RepoSnapshot> {
+        self.targets.get(target)?.repo_snapshot.as_ref()
+    }
+
+    /// Record (or replace) the repo snapshot pin for a target, creating the
+    /// target entry if absent. Used by auto-pin-on-first-fetch and `avocado update`.
+    pub fn set_repo_snapshot(&mut self, target: &str, snapshot: RepoSnapshot) {
+        self.targets
+            .entry(target.to_string())
+            .or_default()
+            .repo_snapshot = Some(snapshot);
+    }
+
     /// Get the locked version for a package in a specific target and sysroot
     pub fn get_locked_version(
         &self,
@@ -1417,6 +1481,9 @@ impl LockFile {
             target_locks.runtimes.clear();
             target_locks.kernel_versions.clear();
             target_locks.kernels.clear();
+            // Drop the snapshot pin too: unlock means "re-pick latest snapshot
+            // on the next fetch", mirroring the kernel-pin reset above.
+            target_locks.repo_snapshot = None;
         }
     }
 
@@ -3002,4 +3069,115 @@ avocado-sdk-toolchain 0.1.0-r0.x86_64_avocadosdk
         assert!(target.kernels.is_empty());
         assert!(target.boot.is_empty());
     }
+
+    #[test]
+    fn test_migrate_v6_to_v7_additive() {
+        use tempfile::TempDir;
+
+        // A v6 lockfile parses directly as v7 (repo-snapshot is #[serde(default)]),
+        // with only the version bumped and no snapshot pin recorded.
+        let v6_json = r#"{"version":6,"distro_release":"2026","targets":{"qemux86-64":{"rootfs":{"avocado-pkg-rootfs":"1.0.0-r0"},"runtimes":{"dev":{"packages":{"base":"2.0.0-r0"}}}}}}
+"#;
+        let temp_dir = TempDir::new().unwrap();
+        let lock_dir = temp_dir.path().join(LOCKFILE_DIR);
+        fs::create_dir_all(&lock_dir).unwrap();
+        fs::write(lock_dir.join(LOCKFILE_NAME), v6_json).unwrap();
+
+        let loaded = LockFile::load(temp_dir.path()).unwrap();
+        // Version bumped to v7.
+        assert_eq!(loaded.version, LOCKFILE_VERSION);
+        // Existing v6 state preserved.
+        assert_eq!(
+            loaded.get_locked_version("qemux86-64", &SysrootType::Rootfs, "avocado-pkg-rootfs"),
+            Some(&"1.0.0-r0".to_string())
+        );
+        // New v7 field defaults to None — behaves as "track head".
+        assert!(loaded.get_repo_snapshot("qemux86-64").is_none());
+    }
+
+    #[test]
+    fn test_repo_snapshot_round_trip() {
+        use tempfile::TempDir;
+
+        let temp_dir = TempDir::new().unwrap();
+        let mut lock = LockFile::new();
+        lock.set_repo_snapshot(
+            "qemux86-64",
+            RepoSnapshot {
+                release: "2026".to_string(),
+                channel: "edge".to_string(),
+                snapshot: "20260531T120000Z-qemux86-64".to_string(),
+                repo_url: Some("https://repo.example.com".to_string()),
+                created: Some("2026-05-31T12:00:00Z".to_string()),
+            },
+        );
+        lock.save(temp_dir.path()).unwrap();
+
+        // Persisted under the kebab-case key and reloads intact.
+        let raw = fs::read_to_string(LockFile::get_path(temp_dir.path())).unwrap();
+        assert!(raw.contains("\"repo-snapshot\""));
+
+        let loaded = LockFile::load(temp_dir.path()).unwrap();
+        let snap = loaded.get_repo_snapshot("qemux86-64").unwrap();
+        assert_eq!(snap.release, "2026");
+        assert_eq!(snap.channel, "edge");
+        assert_eq!(snap.snapshot, "20260531T120000Z-qemux86-64");
+    }
+
+    #[test]
+    fn test_clear_all_clears_repo_snapshot() {
+        let mut lock = LockFile::new();
+        lock.set_repo_snapshot(
+            "qemux86-64",
+            RepoSnapshot {
+                release: "2026".to_string(),
+                channel: "edge".to_string(),
+                snapshot: "SNAP".to_string(),
+                repo_url: None,
+                created: None,
+            },
+        );
+        assert!(lock.get_repo_snapshot("qemux86-64").is_some());
+        // Unlock semantics: clearing a target drops the snapshot pin too.
+        lock.clear_all("qemux86-64");
+        assert!(lock.get_repo_snapshot("qemux86-64").is_none());
+    }
+
+    #[test]
+    fn test_merge_adopts_disk_snapshot_when_self_unset() {
+        use tempfile::TempDir;
+
+        // Disk has a pin; an in-memory writer that didn't touch the pin must
+        // not drop it on save() (merge adopts disk's value).
+        let temp_dir = TempDir::new().unwrap();
+        let mut on_disk = LockFile::new();
+        on_disk.set_repo_snapshot(
+            "qemux86-64",
+            RepoSnapshot {
+                release: "2026".to_string(),
+                channel: "edge".to_string(),
+                snapshot: "DISK".to_string(),
+                repo_url: None,
+                created: None,
+            },
+        );
+        on_disk.save(temp_dir.path()).unwrap();
+
+        // A fresh writer recording an unrelated package, no snapshot in hand.
+        let mut writer = LockFile::new();
+        writer.set_locked_version("qemux86-64", &SysrootType::Rootfs, "curl", "8.0.0-r0");
+        writer.save(temp_dir.path()).unwrap();
+
+        let loaded = LockFile::load(temp_dir.path()).unwrap();
+        assert_eq!(
+            loaded
+                .get_repo_snapshot("qemux86-64")
+                .map(|s| s.snapshot.as_str()),
+            Some("DISK")
+        );
+        assert_eq!(
+            loaded.get_locked_version("qemux86-64", &SysrootType::Rootfs, "curl"),
+            Some(&"8.0.0-r0".to_string())
+        );
+    }
 }
diff --git a/src/utils/mod.rs b/src/utils/mod.rs
index bd54339b..13e41cbb 100644
--- a/src/utils/mod.rs
+++ b/src/utils/mod.rs
@@ -28,6 +28,7 @@ pub mod scheduler;
 pub mod signing_keys;
 #[cfg(unix)]
 pub mod signing_service;
+pub mod snapshot;
 pub mod stamps;
 pub mod target;
 pub mod tui;
diff --git a/src/utils/snapshot.rs b/src/utils/snapshot.rs
new file mode 100644
index 00000000..95c6a216
--- /dev/null
+++ b/src/utils/snapshot.rs
@@ -0,0 +1,394 @@
+//! Reproducible channel snapshots — auto-pinning the lock file to an immutable
+//! point-in-time view of a feed channel.
+//!
+//! Background: every dnf baseurl is composed as `${repo_url}/$releasever/...`
+//! where `$releasever` is `{release}/{channel}` (e.g. `2026/edge`). The serving
+//! side publishes an immutable copy of each channel's metadata under
+//! `{release}/{channel}/snapshots/<id>/...` (sharing the content-addressed
+//! `_pkgs` pool) plus a small mutable pointer
+//! `{release}/{channel}/target/<machine>/snapshots-latest.json` naming the
+//! newest snapshot.
+//!
+//! Snapshot pinning therefore reduces to injecting one path segment into
+//! `releasever`: `2026/edge` -> `2026/edge/snapshots/<id>`. We resolve the pin
+//! once per command and expose it via the `AVOCADO_RELEASEVER` env var, which
+//! [`Config::get_releasever`] already honors ahead of the derived
+//! `{release}/{channel}` — so every downstream sysroot fetch freezes together
+//! with no per-call-site plumbing (mirrors [`Config::promote_repo_tls_env`]).
+//!
+//! Behavior (confirmed in the feature plan):
+//! - **Auto-pin on first fetch**: with no pin recorded, resolve the channel's
+//!   current `latest` snapshot, record it in the lock file, and fetch against
+//!   it. A later `avocado clean` + rebuild reproduces it exactly.
+//! - **Reuse on later fetches**: a recorded pin is reused verbatim.
+//! - **Feed without snapshots**: if the pointer 404s, fall back to tracking the
+//!   live head (pre-snapshot behavior) and record nothing.
+//! - **Manual releasever override**: if the user pins `releasever` explicitly
+//!   (config or env), we never auto-pin — they own resolution.
+//! - **Changed release/channel**: a stale pin (config moved to a different
+//!   feed) is ignored with a warning telling the user to run `avocado update`.
+
+use anyhow::{Context, Result};
+use std::env;
+use std::path::Path;
+
+use crate::utils::config::Config;
+use crate::utils::lockfile::{LockFile, RepoSnapshot};
+use crate::utils::output::{print_info, OutputLevel};
+
+/// The mutable pointer published per (channel, target) naming the newest
+/// immutable snapshot. Served at
+/// `{release}/{channel}/target/<machine>/snapshots-latest.json`.
+#[derive(Debug, Clone, serde::Deserialize)]
+pub struct SnapshotPointer {
+    /// Snapshot id — the immutable `snapshots/<id>` path segment.
+    pub id: String,
+    /// Snapshot mint time (provenance only).
+    #[serde(default)]
+    pub created: Option<String>,
+}
+
+/// Whether a recorded pin still applies to the configured feed.
+#[derive(Debug, PartialEq, Eq)]
+pub enum PinStatus {
+    /// No pin recorded for this target.
+    None,
+    /// Pin matches the configured release+channel — reuse it.
+    Matches,
+    /// Pin is for a different release/channel than the config now names.
+    Mismatch,
+}
+
+/// Classify a recorded pin by release + channel only (repo URL not compared). Pure — unit-tested.
+pub fn pin_status(pin: Option<&RepoSnapshot>, release: &str, channel: &str) -> PinStatus {
+    match pin {
+        None => PinStatus::None,
+        Some(p) if p.release == release && p.channel == channel => PinStatus::Matches,
+        Some(_) => PinStatus::Mismatch,
+    }
+}
+
+/// The releasever path segment for a pinned snapshot. Pure — unit-tested.
+pub fn effective_releasever(release: &str, channel: &str, snapshot: &str) -> String {
+    format!("{release}/{channel}/snapshots/{snapshot}")
+}
+
+/// Machine short name as it appears in feed paths (`target/<machine>/...`).
+/// Mirrors `avocado-arch-utils.bbclass`: strip a leading `avocado-`.
+fn machine_short(target: &str) -> &str {
+    target.strip_prefix("avocado-").unwrap_or(target)
+}
+
+/// URL of the per-(channel, target) latest-snapshot pointer.
+pub fn pointer_url(repo_url: &str, release: &str, channel: &str, target: &str) -> String {
+    let base = repo_url.trim_end_matches('/');
+    let machine = machine_short(target);
+    format!("{base}/{release}/{channel}/target/{machine}/snapshots-latest.json")
+}
+
+/// URL of a snapshot's target repomd — used to pre-flight a recorded pin so a
+/// GC'd snapshot produces an actionable error rather than a raw dnf failure.
+pub fn repomd_url(
+    repo_url: &str,
+    release: &str,
+    channel: &str,
+    target: &str,
+    snapshot: &str,
+) -> String {
+    let base = repo_url.trim_end_matches('/');
+    let machine = machine_short(target);
+    format!("{base}/{release}/{channel}/snapshots/{snapshot}/target/{machine}/repodata/repomd.xml")
+}
+
+/// True when the user has taken explicit control of `releasever` (config or
+/// env), in which case we must not auto-pin. This also covers the
+/// already-applied case: a parent command that set `AVOCADO_RELEASEVER` to a
+/// snapshot path makes children no-op.
+fn releasever_is_overridden(config: &Config) -> bool {
+    if env::var_os("AVOCADO_RELEASEVER").is_some()
+        || env::var_os("AVOCADO_SDK_REPO_RELEASE").is_some()
+    {
+        return true;
+    }
+    let distro_override = config
+        .distro
+        .as_ref()
+        .and_then(|d| d.repo.as_ref())
+        .and_then(|r| r.releasever.as_ref())
+        .is_some();
+    let sdk_override = config
+        .sdk
+        .as_ref()
+        .and_then(|s| s.repo_release.as_ref())
+        .is_some();
+    distro_override || sdk_override
+}
+
+/// Build an HTTP client honoring the repo's CA bundle / insecure setting,
+/// matching the TLS posture dnf uses for the same endpoint.
+fn build_client(config: &Config) -> Result<reqwest::Client> {
+    let mut builder = reqwest::ClientBuilder::new()
+        .timeout(std::time::Duration::from_secs(20))
+        .user_agent(concat!("avocado-cli/", env!("CARGO_PKG_VERSION")));
+    if config.get_repo_insecure() {
+        builder = builder.danger_accept_invalid_certs(true);
+    }
+    if let Some(ca_path) = config.get_repo_ca() {
+        let pem = std::fs::read(&ca_path)
+            .with_context(|| format!("Failed to read repo CA bundle: {ca_path}"))?;
+        // A bundle may carry multiple certs; add each.
+        for cert in reqwest::Certificate::from_pem_bundle(&pem)
+            .with_context(|| format!("Failed to parse repo CA bundle: {ca_path}"))?
+        {
+            builder = builder.add_root_certificate(cert);
+        }
+    }
+    builder.build().context("Failed to build HTTP client")
+}
+
+/// Outcome of resolving the channel's latest snapshot.
+enum LatestResult {
+    /// Pointer present — the named snapshot id (+ provenance).
+    Found(SnapshotPointer),
+    /// Pointer 404 — the feed does not serve snapshots.
+    Unsupported,
+}
+
+/// GET the latest-snapshot pointer. 404 -> `Unsupported` (degrade to head);
+/// transport/other errors propagate (don't silently lose reproducibility).
+async fn fetch_latest(
+    client: &reqwest::Client,
+    repo_url: &str,
+    release: &str,
+    channel: &str,
+    target: &str,
+) -> Result<LatestResult> {
+    let url = pointer_url(repo_url, release, channel, target);
+    let resp = client
+        .get(&url)
+        .send()
+        .await
+        .with_context(|| format!("Failed to fetch snapshot pointer: {url}"))?;
+    if resp.status() == reqwest::StatusCode::NOT_FOUND {
+        return Ok(LatestResult::Unsupported);
+    }
+    let resp = resp
+        .error_for_status()
+        .with_context(|| format!("Snapshot pointer request failed: {url}"))?;
+    let pointer: SnapshotPointer = resp
+        .json()
+        .await
+        .with_context(|| format!("Failed to parse snapshot pointer: {url}"))?;
+    Ok(LatestResult::Found(pointer))
+}
+
+/// Build a [`RepoSnapshot`] pin from a resolved pointer.
+fn build_pin(
+    release: &str,
+    channel: &str,
+    repo_url: &str,
+    pointer: &SnapshotPointer,
+) -> RepoSnapshot {
+    RepoSnapshot {
+        release: release.to_string(),
+        channel: channel.to_string(),
+        snapshot: pointer.id.clone(),
+        repo_url: Some(repo_url.to_string()),
+        created: pointer.created.clone(),
+    }
+}
+
+/// Resolve the channel's current latest snapshot into a pin, without touching
+/// the lock file or the process env. Used by `avocado update` to advance the
+/// pin. Returns `None` when releasever is manually overridden, the config names
+/// no feed to resolve against, or the feed serves no snapshots (pointer 404s).
+pub async fn resolve_latest(config: &Config, target: &str) -> Result<Option<RepoSnapshot>> {
+    if releasever_is_overridden(config) {
+        return Ok(None);
+    }
+    let (Some(release), Some(channel)) = (config.get_distro_release(), config.get_distro_channel())
+    else {
+        return Ok(None);
+    };
+    let Some(repo_url) = config.get_repo_url() else {
+        return Ok(None);
+    };
+    let client = build_client(config)?;
+    match fetch_latest(&client, &repo_url, &release, &channel, target).await? {
+        LatestResult::Unsupported => Ok(None),
+        LatestResult::Found(pointer) => {
+            Ok(Some(build_pin(&release, &channel, &repo_url, &pointer)))
+        }
+    }
+}
+
+/// Pre-flight a recorded pin: confirm the snapshot's repomd is still served.
+/// A definitive 404 means the snapshot aged out of retention -> actionable
+/// error. Transport errors (offline) are tolerated so a cached/offline rebuild
+/// against a still-valid pin isn't blocked.
+async fn verify_pin_available(
+    client: &reqwest::Client,
+    repo_url: &str,
+    snap: &RepoSnapshot,
+    target: &str,
+) -> Result<()> {
+    let url = repomd_url(
+        repo_url,
+        &snap.release,
+        &snap.channel,
+        target,
+        &snap.snapshot,
+    );
+    match client.head(&url).send().await {
+        Ok(resp) if resp.status() == reqwest::StatusCode::NOT_FOUND => anyhow::bail!(
+            "Snapshot '{}' for {}/{} is no longer available (retention horizon). \
+             Run 'avocado update' to re-pin to the latest snapshot.",
+            snap.snapshot,
+            snap.release,
+            snap.channel
+        ),
+        // Reachable (2xx/redirect) or non-404 status: proceed.
+        Ok(_) => Ok(()),
+        // Transport error (offline, DNS, timeout): don't block a pinned rebuild.
+        Err(_) => Ok(()),
+    }
+}
+
+/// Resolve the snapshot pin for `target` and, when one applies, expose it via
+/// `AVOCADO_RELEASEVER` so every downstream `get_releasever()` fetches against
+/// the frozen snapshot subtree. Auto-pins (and persists) on first fetch.
+///
+/// Call once at the entry of feed-touching commands; idempotent across the
+/// in-process install task graph (children see the parent's env and no-op).
+pub async fn resolve_and_apply(config: &Config, src_dir: &Path, target: &str) -> Result<()> {
+    if releasever_is_overridden(config) {
+        return Ok(());
+    }
+    let (Some(release), Some(channel)) = (config.get_distro_release(), config.get_distro_channel())
+    else {
+        // No release/channel to derive a feed from — nothing to pin.
+        return Ok(());
+    };
+    let Some(repo_url) = config.get_repo_url() else {
+        return Ok(());
+    };
+
+    let mut lock = LockFile::load(src_dir)
+        .with_context(|| format!("Failed to load lock file from {}", src_dir.display()))?;
+    let client = build_client(config)?;
+
+    let effective = match pin_status(lock.get_repo_snapshot(target), &release, &channel) {
+        PinStatus::Matches => {
+            let snap = lock.get_repo_snapshot(target).expect("matched");
+            verify_pin_available(&client, &repo_url, snap, target).await?;
+            effective_releasever(&snap.release, &snap.channel, &snap.snapshot)
+        }
+        PinStatus::Mismatch => {
+            let snap = lock.get_repo_snapshot(target).expect("mismatch");
+            print_info(
+                &format!(
+                    "[WARNING] Lock file is pinned to snapshot for {}/{} but config now names {}/{}. \
+                     Tracking the live channel head; run 'avocado update' to re-pin.",
+                    snap.release, snap.channel, release, channel
+                ),
+                OutputLevel::Normal,
+            );
+            return Ok(());
+        }
+        PinStatus::None => {
+            match fetch_latest(&client, &repo_url, &release, &channel, target).await? {
+                LatestResult::Unsupported => return Ok(()),
+                LatestResult::Found(pointer) => {
+                    let snap = build_pin(&release, &channel, &repo_url, &pointer);
+                    let eff = effective_releasever(&snap.release, &snap.channel, &snap.snapshot);
+                    lock.set_repo_snapshot(target, snap);
+                    lock.save(src_dir)
+                        .with_context(|| "Failed to record snapshot pin in lock file")?;
+                    print_info(
+                        &format!("Pinned {release}/{channel} to snapshot '{}'.", pointer.id),
+                        OutputLevel::Normal,
+                    );
+                    eff
+                }
+            }
+        }
+    };
+
+    env::set_var("AVOCADO_RELEASEVER", effective);
+    Ok(())
+}
+
+/// Convenience entry for commands that hold a `config_path` string: resolves
+/// `src_dir` the same way the install/clean commands do, then delegates to
+/// [`resolve_and_apply`]. One line per command call site.
+pub async fn resolve_and_apply_for(config: &Config, config_path: &str, target: &str) -> Result<()> {
+    let src_dir = config.get_resolved_src_dir(config_path).unwrap_or_else(|| {
+        Path::new(config_path)
+            .parent()
+            .unwrap_or(Path::new("."))
+            .to_path_buf()
+    });
+    resolve_and_apply(config, &src_dir, target).await
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn snap(release: &str, channel: &str, id: &str) -> RepoSnapshot {
+        RepoSnapshot {
+            release: release.to_string(),
+            channel: channel.to_string(),
+            snapshot: id.to_string(),
+            repo_url: None,
+            created: None,
+        }
+    }
+
+    #[test]
+    fn effective_releasever_injects_snapshot_segment() {
+        assert_eq!(
+            effective_releasever("2026", "edge", "20260531T120000Z-qemux86-64"),
+            "2026/edge/snapshots/20260531T120000Z-qemux86-64"
+        );
+    }
+
+    #[test]
+    fn pin_status_none_when_unpinned() {
+        assert_eq!(pin_status(None, "2026", "edge"), PinStatus::None);
+    }
+
+    #[test]
+    fn pin_status_matches_same_feed() {
+        let p = snap("2026", "edge", "X");
+        assert_eq!(pin_status(Some(&p), "2026", "edge"), PinStatus::Matches);
+    }
+
+    #[test]
+    fn pin_status_mismatch_on_channel_change() {
+        let p = snap("2026", "edge", "X");
+        assert_eq!(pin_status(Some(&p), "2026", "stable"), PinStatus::Mismatch);
+        assert_eq!(pin_status(Some(&p), "2027", "edge"), PinStatus::Mismatch);
+    }
+
+    #[test]
+    fn pointer_url_strips_avocado_prefix_and_trailing_slash() {
+        assert_eq!(
+            pointer_url(
+                "https://repo.example.com/",
+                "2026",
+                "edge",
+                "avocado-qemux86-64"
+            ),
+            "https://repo.example.com/2026/edge/target/qemux86-64/snapshots-latest.json"
+        );
+    }
+
+    #[test]
+    fn repomd_url_points_into_snapshot_subtree() {
+        assert_eq!(
+            repomd_url("https://r.io", "2026", "edge", "qemux86-64", "SNAP"),
+            "https://r.io/2026/edge/snapshots/SNAP/target/qemux86-64/repodata/repomd.xml"
+        );
+    }
+}

From 4ba843aa557eacbfa1332430d3c37a98be9c31e4 Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Sun, 31 May 2026 19:54:49 -0400
Subject: [PATCH 08/21] fix(snapshots): auto-pin against the default feed;
 single-source the repo URL

The snapshot resolver early-returned when distro.repo.url was unset, so projects
relying on the baked default feed (no explicit repo.url) never recorded a
repo-snapshot pin even though their dnf fetch hit that default. Fix by deriving
the same default the container uses.

Single source of truth: add Config::DEFAULT_REPO_URL + Config::effective_repo_url()
in config.rs. The snapshot resolver uses effective_repo_url(); the container
env-builder always sets AVOCADO_SDK_REPO_URL from the same const, so the shell's
duplicated literal default is removed (it just consumes the env now).
---
 src/utils/config.rs    | 16 ++++++++++
 src/utils/container.rs | 70 +++++++++++++++++++++++++++---------------
 src/utils/snapshot.rs  |  8 ++---
 3 files changed, 64 insertions(+), 30 deletions(-)

diff --git a/src/utils/config.rs b/src/utils/config.rs
index e6d8944a..401d852e 100644
--- a/src/utils/config.rs
+++ b/src/utils/config.rs
@@ -1373,6 +1373,13 @@ pub struct Config {
 }
 
 impl Config {
+    /// Default prod package-feed base URL, used when neither config
+    /// (`distro.repo.url`/`sdk.repo_url`) nor env sets one. Single source of
+    /// truth: the container env-builder and the snapshot resolver both derive
+    /// the default from here, so the resolver pins against the same feed the
+    /// container's dnf fetches from.
+    pub const DEFAULT_REPO_URL: &'static str = "https://repo.avocadolinux.org";
+
     /// Validate that the running CLI version satisfies the `cli_requirement` if set.
     pub fn validate_cli_requirement(&self) -> Result<()> {
         if let Some(ref requirement) = self.cli_requirement {
@@ -3594,6 +3601,15 @@ impl Config {
         self.sdk.as_ref()?.repo_url.as_ref().cloned()
     }
 
+    /// Effective repo base URL: the configured value, or [`Self::DEFAULT_REPO_URL`]
+    /// when none is set. The snapshot resolver calls this; the container env-builder
+    /// falls back to the same constant directly, so the resolver pins against the
+    /// same feed the container's dnf actually fetches from.
+    pub fn effective_repo_url(&self) -> String {
+        self.get_repo_url()
+            .unwrap_or_else(|| Self::DEFAULT_REPO_URL.to_string())
+    }
+
     /// Path to a CA cert to trust for the repo endpoint.
     /// Priority: AVOCADO_REPO_CA (env) > distro.repo.ca (config).
     pub fn get_repo_ca(&self) -> Option<String> {
diff --git a/src/utils/container.rs b/src/utils/container.rs
index 2d82a502..a701be3c 100644
--- a/src/utils/container.rs
+++ b/src/utils/container.rs
@@ -661,9 +661,16 @@ impl SdkContainer {
             host_platform.to_string(),
         );
 
-        if let Some(url) = &config.repo_url {
-            env_vars.insert("AVOCADO_SDK_REPO_URL".to_string(), url.clone());
-        }
+        // Always provide the repo URL (configured value or the prod default) so
+        // the container shell never needs its own literal default — single source
+        // of truth is Config::DEFAULT_REPO_URL.
+        env_vars.insert(
+            "AVOCADO_SDK_REPO_URL".to_string(),
+            config
+                .repo_url
+                .clone()
+                .unwrap_or_else(|| crate::utils::config::Config::DEFAULT_REPO_URL.to_string()),
+        );
         if let Some(release) = &config.repo_release {
             env_vars.insert("AVOCADO_SDK_REPO_RELEASE".to_string(), release.clone());
         }
@@ -838,9 +845,16 @@ impl SdkContainer {
         // Set host platform - the remote is running the container
         env_vars.insert("AVOCADO_HOST_PLATFORM".to_string(), "linux".to_string());
 
-        if let Some(url) = &config.repo_url {
-            env_vars.insert("AVOCADO_SDK_REPO_URL".to_string(), url.clone());
-        }
+        // Always provide the repo URL (configured value or the prod default) so
+        // the container shell never needs its own literal default — single source
+        // of truth is Config::DEFAULT_REPO_URL.
+        env_vars.insert(
+            "AVOCADO_SDK_REPO_URL".to_string(),
+            config
+                .repo_url
+                .clone()
+                .unwrap_or_else(|| crate::utils::config::Config::DEFAULT_REPO_URL.to_string()),
+        );
         if let Some(release) = &config.repo_release {
             env_vars.insert("AVOCADO_SDK_REPO_RELEASE".to_string(), release.clone());
         }
@@ -1219,9 +1233,16 @@ impl SdkContainer {
             host_platform.to_string(),
         );
 
-        if let Some(url) = &config.repo_url {
-            env_vars.insert("AVOCADO_SDK_REPO_URL".to_string(), url.clone());
-        }
+        // Always provide the repo URL (configured value or the prod default) so
+        // the container shell never needs its own literal default — single source
+        // of truth is Config::DEFAULT_REPO_URL.
+        env_vars.insert(
+            "AVOCADO_SDK_REPO_URL".to_string(),
+            config
+                .repo_url
+                .clone()
+                .unwrap_or_else(|| crate::utils::config::Config::DEFAULT_REPO_URL.to_string()),
+        );
         if let Some(release) = &config.repo_release {
             env_vars.insert("AVOCADO_SDK_REPO_RELEASE".to_string(), release.clone());
         }
@@ -1465,9 +1486,16 @@ impl SdkContainer {
         // Set host platform - the remote is running the container
         env_vars.insert("AVOCADO_HOST_PLATFORM".to_string(), "linux".to_string());
 
-        if let Some(url) = &config.repo_url {
-            env_vars.insert("AVOCADO_SDK_REPO_URL".to_string(), url.clone());
-        }
+        // Always provide the repo URL (configured value or the prod default) so
+        // the container shell never needs its own literal default — single source
+        // of truth is Config::DEFAULT_REPO_URL.
+        env_vars.insert(
+            "AVOCADO_SDK_REPO_URL".to_string(),
+            config
+                .repo_url
+                .clone()
+                .unwrap_or_else(|| crate::utils::config::Config::DEFAULT_REPO_URL.to_string()),
+        );
         if let Some(release) = &config.repo_release {
             env_vars.insert("AVOCADO_SDK_REPO_RELEASE".to_string(), release.clone());
         }
@@ -2058,12 +2086,9 @@ if [ -n "$AVOCADO_EXT_PATH_MOUNTS" ]; then
     done
 fi
 
-# Get repo url from environment or default to prod
-if [ -n "$AVOCADO_SDK_REPO_URL" ]; then
-    REPO_URL="$AVOCADO_SDK_REPO_URL"
-else
-    REPO_URL="https://repo.avocadolinux.org"
-fi
+# Repo URL is always supplied by the CLI env-builder (Config::DEFAULT_REPO_URL
+# when unset), so there is no literal default to drift here.
+REPO_URL="$AVOCADO_SDK_REPO_URL"
 
 if [ -n "$AVOCADO_VERBOSE" ]; then echo "[INFO] Using repo URL: '$REPO_URL'"; fi
 
@@ -2349,12 +2374,9 @@ if [ -n "$AVOCADO_EXT_PATH_MOUNTS" ]; then
     done
 fi
 
-# Get repo url from environment or default to prod
-if [ -n "$AVOCADO_SDK_REPO_URL" ]; then
-    REPO_URL="$AVOCADO_SDK_REPO_URL"
-else
-    REPO_URL="https://repo.avocadolinux.org"
-fi
+# Repo URL is always supplied by the CLI env-builder (Config::DEFAULT_REPO_URL
+# when unset), so there is no literal default to drift here.
+REPO_URL="$AVOCADO_SDK_REPO_URL"
 
 if [ -n "$AVOCADO_VERBOSE" ]; then echo "[INFO] Using repo URL: '$REPO_URL'"; fi
 
diff --git a/src/utils/snapshot.rs b/src/utils/snapshot.rs
index 95c6a216..213eb7df 100644
--- a/src/utils/snapshot.rs
+++ b/src/utils/snapshot.rs
@@ -210,9 +210,7 @@ pub async fn resolve_latest(config: &Config, target: &str) -> Result<Option<Repo
     else {
         return Ok(None);
     };
-    let Some(repo_url) = config.get_repo_url() else {
-        return Ok(None);
-    };
+    let repo_url = config.effective_repo_url();
     let client = build_client(config)?;
     match fetch_latest(&client, &repo_url, &release, &channel, target).await? {
         LatestResult::Unsupported => Ok(None),
@@ -269,9 +267,7 @@ pub async fn resolve_and_apply(config: &Config, src_dir: &Path, target: &str) ->
         // No release/channel to derive a feed from — nothing to pin.
         return Ok(());
     };
-    let Some(repo_url) = config.get_repo_url() else {
-        return Ok(());
-    };
+    let repo_url = config.effective_repo_url();
 
     let mut lock = LockFile::load(src_dir)
         .with_context(|| format!("Failed to load lock file from {}", src_dir.display()))?;

From a050fb669cff905b683fa957a70f42216d6cc994 Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 2 Jun 2026 10:44:24 -0400
Subject: [PATCH 09/21] feat(ext): nested extension layout for a shared
 includes installroot

Packaged extensions now nest their content under /<ext_name>/ and self-describe
the layout via `Provides: avocado-ext-layout(nested)`. ext_fetch repoqueries that
provide (repo metadata, no download) and installs nested packages into the SHARED
$AVOCADO_PREFIX/includes installroot, so one rpmdb tracks every installed
extension with no cross-extension file collisions. Legacy packages lacking the
provide keep the per-extension installroot. Either way the final content lands at
includes/<ext_name>/, so consumers are unchanged.
---
 src/commands/ext/package.rs | 19 +++++++++-----
 src/utils/ext_fetch.rs      | 51 +++++++++++++++++++++++++------------
 2 files changed, 48 insertions(+), 22 deletions(-)

diff --git a/src/commands/ext/package.rs b/src/commands/ext/package.rs
index c89b20e6..fa0bed65 100644
--- a/src/commands/ext/package.rs
+++ b/src/commands/ext/package.rs
@@ -625,7 +625,10 @@ if [ "$FILE_COUNT" -eq 0 ]; then
 fi
 
 # Create spec file
-# Package root (/) maps to the extension's src_dir
+# The extension's src_dir maps to a top-level /<ext_name>/ directory in the package, so
+# that installing into a SHARED includes installroot lands its content at
+# includes/<ext_name>/ without colliding with other extensions' files (and one rpmdb
+# tracks all installed extensions).
 cat > SPECS/package.spec << SPEC_EOF
 %define _buildhost reproducible
 AutoReqProv: no
@@ -637,12 +640,16 @@ Summary: {summary}
 License: {license}
 Vendor: {vendor}
 Group: {group}{url_line}
+# Self-describe the on-disk layout so the CLI knows how to install this package: content
+# is nested under /<ext_name>/, so it installs into the SHARED includes installroot.
+# Legacy packages (content at /) lack this provide and use the per-ext installroot.
+Provides: avocado-ext-layout(nested)
 
 %description
 {description}
 
 %files
-/*
+/{name}
 
 %prep
 # No prep needed
@@ -651,10 +658,10 @@ Group: {group}{url_line}
 # No build needed
 
 %install
-mkdir -p %{{buildroot}}
-# Copy staged files to buildroot root
-# This allows installation to \$AVOCADO_PREFIX/includes/<ext_name>/
-cp -r "$STAGING_DIR"/* %{{buildroot}}/
+# Nest the staged files under /<ext_name>/ so a shared includes installroot yields
+# includes/<ext_name>/... (collision-free, one rpmdb per includes root).
+mkdir -p %{{buildroot}}/{name}
+cp -r "$STAGING_DIR"/* %{{buildroot}}/{name}/
 
 %clean
 # Skip clean section - not needed for our use case
diff --git a/src/utils/ext_fetch.rs b/src/utils/ext_fetch.rs
index a28620f3..df3bdda5 100644
--- a/src/utils/ext_fetch.rs
+++ b/src/utils/ext_fetch.rs
@@ -218,9 +218,11 @@ impl ExtensionFetcher {
 
     /// Fetch an extension from the avocado package repository
     ///
-    /// Installs the extension package into a per-extension installroot at
-    /// `$AVOCADO_PREFIX/includes/<ext_name>` using DNF with `--installroot`.
-    /// This gives proper RPM tracking, clean upgrades, and version management.
+    /// Installs the extension package with DNF `--installroot`. Nested packages (content
+    /// under a top-level `/<ext_name>/` dir) use the SHARED `$AVOCADO_PREFIX/includes`
+    /// installroot, so a single rpmdb tracks every installed extension (proper tracking,
+    /// clean upgrades, version management, no cross-extension file collisions). Legacy
+    /// packages keep the per-extension installroot; both land at `includes/<ext_name>/`.
     async fn fetch_from_repo(
         &self,
         ext_name: &str,
@@ -251,15 +253,14 @@ impl ExtensionFetcher {
 
         let repo_arg = repo_name.map(|r| format!("--repo={r}")).unwrap_or_default();
 
-        // Use container path $AVOCADO_PREFIX/includes/<ext_name> as the installroot
-        let installroot = format!("$AVOCADO_PREFIX/includes/{ext_name}");
-
-        // Force mode: clean the installroot for a fresh install
-        let force_clean = if force {
-            format!(r#"rm -rf "{installroot}""#)
-        } else {
-            String::new()
-        };
+        // The package self-describes its layout via `Provides: avocado-ext-layout(nested)`.
+        // - NESTED (new): content under /<ext_name>/ -> install into the SHARED includes
+        //   installroot, so it lands at includes/<ext_name>/ with one rpmdb tracking all exts.
+        // - LEGACY (no such provide): content at / -> per-extension installroot includes/<ext_name>.
+        // Either way the final content is includes/<ext_name>/, so consumers are unchanged. The
+        // installroot is chosen at run time by repoquerying the package's provides.
+        let ext_dir = format!("$AVOCADO_PREFIX/includes/{ext_name}");
+        let force_str = if force { "true" } else { "false" };
 
         // Install the extension package using DNF with --installroot
         // Uses $DNF_SDK_COMBINED_REPO_CONF to access both SDK and target-specific repos
@@ -267,21 +268,39 @@ impl ExtensionFetcher {
             r#"
 set -e
 
-{force_clean}
+# Detect the package's on-disk layout from its provides (repo metadata, no download).
+if RPM_CONFIGDIR=$AVOCADO_SDK_PREFIX/usr/lib/rpm RPM_ETCCONFIGDIR=$AVOCADO_SDK_PREFIX \
+   $DNF_SDK_HOST $DNF_SDK_HOST_OPTS $DNF_SDK_COMBINED_REPO_CONF {repo_arg} \
+   repoquery --provides {package_spec} 2>/dev/null | grep -q 'avocado-ext-layout(nested)'; then
+    INSTALLROOT="$AVOCADO_PREFIX/includes"
+    echo "Extension '{ext_name}': nested layout -> shared includes installroot"
+else
+    INSTALLROOT="{ext_dir}"
+    echo "Extension '{ext_name}': legacy layout -> per-extension installroot"
+fi
+
+# Force: remove just this extension (rpmdb entry + content dir) for a clean reinstall,
+# without disturbing other extensions sharing the installroot.
+if [ "{force_str}" = "true" ]; then
+    RPM_CONFIGDIR=$AVOCADO_SDK_PREFIX/usr/lib/rpm RPM_ETCCONFIGDIR=$AVOCADO_SDK_PREFIX \
+        $DNF_SDK_HOST $DNF_SDK_HOST_OPTS --installroot="$INSTALLROOT" -y remove {package_name} 2>/dev/null || true
+    rm -rf "{ext_dir}"
+fi
+
+mkdir -p "$INSTALLROOT"
 
-# Install the extension package into the per-extension installroot
 RPM_CONFIGDIR=$AVOCADO_SDK_PREFIX/usr/lib/rpm \
 RPM_ETCCONFIGDIR=$AVOCADO_SDK_PREFIX \
 $DNF_SDK_HOST \
     $DNF_SDK_HOST_OPTS \
     $DNF_SDK_COMBINED_REPO_CONF \
     {repo_arg} \
-    --installroot={installroot} \
+    --installroot="$INSTALLROOT" \
     -y \
     install \
     {package_spec}
 
-echo "Successfully fetched extension '{ext_name}' (package: {package_spec}) to {installroot}"
+echo "Successfully installed extension '{ext_name}' (package: {package_spec}) to {ext_dir}"
 "#
         );
 

From b4bdc4712e8efa8cdb11424b55cc86a632730a9e Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 2 Jun 2026 10:44:30 -0400
Subject: [PATCH 10/21] feat(connect): add `connect ext` publish/status/list
 (super-admin)

Build-once publish of a packaged extension RPM to the feed, plus status and
list of published versions. Adds the commands::connect::ext module and wires
the ConnectExtCommands subcommands and dispatch in main.rs.
---
 src/commands/connect/ext.rs | 302 ++++++++++++++++++++++++++++++++++++
 src/commands/connect/mod.rs |   1 +
 src/main.rs                 | 134 ++++++++++++++++
 3 files changed, 437 insertions(+)
 create mode 100644 src/commands/connect/ext.rs

diff --git a/src/commands/connect/ext.rs b/src/commands/connect/ext.rs
new file mode 100644
index 00000000..0bbdd731
--- /dev/null
+++ b/src/commands/connect/ext.rs
@@ -0,0 +1,302 @@
+//! `avocado connect ext` — publish a packaged extension to the feed via avocado-connect.
+//!
+//! Flow (see avocado-connect docs/ext-publish.md): reserve a version + get a presigned
+//! staging URL, PUT the RPM straight to storage, confirm (connect verifies + enqueues the
+//! cluster ingest). Additive + safe: a taken version is rejected, never overwritten.
+
+use anyhow::{Context, Result};
+use sha2::{Digest, Sha256};
+use std::path::Path;
+
+use crate::commands::connect::client;
+use crate::utils::output::{print_info, print_success, OutputLevel};
+
+fn http_client() -> Result<reqwest::Client> {
+    reqwest::Client::builder()
+        .use_rustls_tls()
+        .build()
+        .context("Failed to build HTTP client")
+}
+
+/// Map connect API errors to plain, actionable messages.
+fn api_error(status: u16, body: &str) -> anyhow::Error {
+    let msg = match status {
+        409 => {
+            "that extension version is already taken — bump the version and republish".to_string()
+        }
+        422 => format!("the request was rejected as invalid: {body}"),
+        401 => "not authenticated — run 'avocado connect auth login'".to_string(),
+        403 => "not authorized — extension publish is super-admin only for now".to_string(),
+        404 => "not found".to_string(),
+        _ => format!("HTTP {status}: {body}"),
+    };
+    anyhow::anyhow!(msg)
+}
+
+fn sha256_hex(bytes: &[u8]) -> String {
+    // One-shot SHA-256 of the buffer, rendered as lowercase hex (two digits per byte).
+    let digest = Sha256::digest(bytes);
+    digest.iter().map(|byte| format!("{byte:02x}")).collect()
+}
+
+/// Split an RPM filename shaped like `<name>-<version>-<release>.<arch>.rpm` into
+/// `(name, version, release, arch)`. Version and release must not contain dashes.
+fn parse_nevra(path: &Path) -> Result<(String, String, String, String)> {
+    let file_name = path.file_name().and_then(|os| os.to_str());
+    let stem = file_name
+        .context("invalid RPM path")?
+        .strip_suffix(".rpm")
+        .context("not an .rpm file")?;
+
+    // Peel fields off the right-hand end: arch first, then release, then version.
+    let (rest, arch) = stem
+        .rsplit_once('.')
+        .context("RPM filename missing .<arch>")?;
+    let (rest, release) = rest
+        .rsplit_once('-')
+        .context("RPM filename missing -<release>")?;
+    let (name, version) = rest
+        .rsplit_once('-')
+        .context("RPM filename missing -<version>")?;
+
+    let owned = [name, version, release, arch].map(str::to_owned);
+    let [name, version, release, arch] = owned;
+    Ok((name, version, release, arch))
+}
+
+pub struct ExtPublishCommand {
+    pub config: String, // path to avocado.yaml; read only to pick up `connect.org`
+    pub org: Option<String>, // target org; `None` falls back to `connect.org`, else platform
+    pub profile: Option<String>, // auth profile name handed to `resolve_profile`
+    pub rpm: String, // path of the RPM file that is hashed and uploaded
+    pub name: Option<String>, // overrides the name parsed from the RPM filename
+    pub version: Option<String>, // overrides the version parsed from the RPM filename
+    pub release: Option<String>, // overrides the parsed release; "r0" if neither is available
+    pub arch: Option<String>, // overrides the parsed arch; "noarch" if neither is available
+    pub target_release: String, // sent as `target_release` in the publish request
+    pub target_channel: String, // sent as `target_channel` in the publish request
+    pub targets: String, // comma-separated machines, sent as `target_machines`
+}
+
+impl ExtPublishCommand {
+    pub async fn execute(&self) -> Result<()> {
+        // --org is optional for publish: omit it (and connect.org) to target the
+        // platform (Peridio) org, which connect fills in server-side for super-admins.
+        // When given (flag or connect.org), it publishes into that tenant org and
+        // selects a matching auth profile.
+        let org = self.org.clone().or_else(|| {
+            std::path::Path::new(&self.config)
+                .exists()
+                .then(|| crate::utils::config::load_config(&self.config).ok())
+                .flatten()
+                .and_then(|c| c.connect)
+                .and_then(|c| c.org)
+        });
+        let cfg = client::load_config()?
+            .context("Not logged in. Run 'avocado connect auth login' first.")?;
+        let (_name, profile) = cfg.resolve_profile(self.profile.as_deref(), org.as_deref())?;
+        let api = profile.api_url.trim_end_matches('/').to_string();
+        let token = profile.token.clone();
+
+        let path = Path::new(&self.rpm);
+        let bytes = std::fs::read(path).with_context(|| format!("Failed to read {}", self.rpm))?;
+        let size = bytes.len() as u64;
+        let sha = sha256_hex(&bytes);
+
+        let (pn, pv, pr, pa) = parse_nevra(path).unwrap_or_default();
+        let name = self.name.clone().unwrap_or(pn);
+        let version = self.version.clone().unwrap_or(pv);
+        let release = self
+            .release
+            .clone()
+            .unwrap_or(if pr.is_empty() { "r0".into() } else { pr });
+        let arch = self
+            .arch
+            .clone()
+            .unwrap_or(if pa.is_empty() { "noarch".into() } else { pa });
+        if name.is_empty() || version.is_empty() {
+            anyhow::bail!("could not determine extension name/version — pass --name and --version");
+        }
+        let machines: Vec<String> = self
+            .targets
+            .split(',')
+            .map(|s| s.trim().to_string())
+            .filter(|s| !s.is_empty())
+            .collect();
+
+        let client = http_client()?;
+
+        // 1. reserve version + get presigned staging URL (409 if taken)
+        let org_label = org.as_deref().unwrap_or("platform");
+        print_info(
+            &format!("Publishing {name}-{version}-{release}.{arch} to {api} (org {org_label})..."),
+            OutputLevel::Normal,
+        );
+        let res = client
+            .post(format!("{api}/api/admin/extensions/publish"))
+            .bearer_auth(&token)
+            .json(&serde_json::json!({
+                "organization_id": org,
+                "name": name,
+                "version": version,
+                "release": release,
+                "arch": arch,
+                "sha256": sha,
+                "size_bytes": size,
+                "target_release": self.target_release,
+                "target_channel": self.target_channel,
+                "target_machines": machines,
+            }))
+            .send()
+            .await
+            .context("publish request failed")?;
+        let status = res.status().as_u16();
+        let body = res.text().await.unwrap_or_default();
+        if !(200..300).contains(&status) {
+            return Err(api_error(status, &body));
+        }
+        let data: serde_json::Value =
+            serde_json::from_str(&body).context("failed to parse publish response")?;
+        let id = data["data"]["id"]
+            .as_str()
+            .context("publish response missing version id")?
+            .to_string();
+        let upload_url = data["data"]["upload_url"]
+            .as_str()
+            .context("publish response missing upload_url")?
+            .to_string();
+
+        // 2. PUT the RPM straight to staging (bytes never pass through connect)
+        print_info("Uploading package to staging...", OutputLevel::Normal);
+        let put = client
+            .put(&upload_url)
+            .body(bytes)
+            .send()
+            .await
+            .context("staging upload failed")?;
+        if !put.status().is_success() {
+            let s = put.status().as_u16();
+            let b = put.text().await.unwrap_or_default();
+            anyhow::bail!("staging upload failed (HTTP {s}): {b}");
+        }
+
+        // 3. confirm -> connect verifies bytes and enqueues the cluster ingest
+        let conf = client
+            .post(format!("{api}/api/admin/extensions/{id}/confirm"))
+            .bearer_auth(&token)
+            .send()
+            .await
+            .context("confirm request failed")?;
+        let cs = conf.status().as_u16();
+        let cb = conf.text().await.unwrap_or_default();
+        if !(200..300).contains(&cs) {
+            return Err(api_error(cs, &cb));
+        }
+
+        print_success(
+            &format!("Published {name}-{version}; ingest queued."),
+            OutputLevel::Normal,
+        );
+        print_info(
+            &format!("Track it:  avocado connect ext status {id}"),
+            OutputLevel::Normal,
+        );
+        Ok(())
+    }
+}
+
+pub struct ExtStatusCommand {
+    pub config: String, // path to avocado.yaml; read only to pick up `connect.org`
+    pub org: Option<String>, // org used for profile selection; `None` falls back to `connect.org`
+    pub profile: Option<String>, // auth profile name handed to `resolve_profile`
+    pub id: String, // id placed in the `/api/admin/extensions/{id}` request path
+}
+
+impl ExtStatusCommand {
+    /// Fetch one published extension version by id and pretty-print its `data` payload.
+    pub async fn execute(&self) -> Result<()> {
+        let (api, token) = api_and_token(self.org.clone(), &self.config, self.profile.as_deref())?;
+        let client = http_client()?;
+        let res = client
+            .get(format!("{api}/api/admin/extensions/{}", self.id))
+            .bearer_auth(&token)
+            .send()
+            .await
+            .context("status request failed")?;
+        let status = res.status().as_u16();
+        let body = res.text().await.unwrap_or_default();
+        if !(200..300).contains(&status) {
+            return Err(api_error(status, &body));
+        }
+        // Echo a non-JSON 2xx body verbatim (the old `unwrap_or_default()` printed "null").
+        let parsed = serde_json::from_str::<serde_json::Value>(&body).ok();
+        let pretty = parsed.and_then(|v| serde_json::to_string_pretty(&v["data"]).ok());
+        println!("{}", pretty.unwrap_or(body));
+        Ok(())
+    }
+}
+
+pub struct ExtListCommand {
+    pub config: String, // path to avocado.yaml; read only to pick up `connect.org`
+    pub org: Option<String>, // org used for profile selection; `None` falls back to `connect.org`
+    pub profile: Option<String>, // auth profile name handed to `resolve_profile`
+    pub name: Option<String>, // optional filter, sent as the `name` query parameter
+}
+
+impl ExtListCommand {
+    /// List published extension versions, optionally filtered by package name.
+    pub async fn execute(&self) -> Result<()> {
+        let (api, token) = api_and_token(self.org.clone(), &self.config, self.profile.as_deref())?;
+        let mut url = reqwest::Url::parse(&format!("{api}/api/admin/extensions"))
+            .context("invalid connect API URL")?;
+        if let Some(n) = &self.name {
+            // Encode the filter value; raw `?name={n}` broke on `&`, `#`, `+` or spaces.
+            url.query_pairs_mut().append_pair("name", n);
+        }
+        let req = http_client()?.get(url).bearer_auth(&token);
+        let res = req.send().await.context("list request failed")?;
+        let status = res.status().as_u16();
+        let body = res.text().await.unwrap_or_default();
+        if !(200..300).contains(&status) {
+            return Err(api_error(status, &body));
+        }
+        let data = serde_json::from_str::<serde_json::Value>(&body).ok();
+        if let Some(items) = data.as_ref().and_then(|d| d["data"].as_array()) {
+            for v in items {
+                println!(
+                    "{:<28} {:<12} {:<10} {}",
+                    v["package"].as_str().unwrap_or("?"),
+                    v["version"].as_str().unwrap_or("?"),
+                    v["status"].as_str().unwrap_or("?"),
+                    v["nevra"].as_str().unwrap_or("")
+                );
+            }
+        } else {
+            // No list: pretty-print the JSON, or echo a non-JSON body verbatim (was "null").
+            let pretty = data.as_ref().and_then(|d| serde_json::to_string_pretty(d).ok());
+            println!("{}", pretty.unwrap_or(body));
+        }
+        Ok(())
+    }
+}
+
+fn api_and_token(
+    org: Option<String>,
+    config_path: &str,
+    profile: Option<&str>,
+) -> Result<(String, String)> {
+    // --org is optional here too: fall back to connect.org, then to the
+    // default/--profile auth, rather than hard-requiring an org.
+    let org = org.or_else(|| {
+        std::path::Path::new(config_path)
+            .exists()
+            .then(|| crate::utils::config::load_config(config_path).ok())
+            .flatten()
+            .and_then(|c| c.connect)
+            .and_then(|c| c.org)
+    });
+    let cfg =
+        client::load_config()?.context("Not logged in. Run 'avocado connect auth login' first.")?;
+    let (_name, p) = cfg.resolve_profile(profile, org.as_deref())?;
+    Ok((p.api_url.trim_end_matches('/').to_string(), p.token.clone()))
+}
diff --git a/src/commands/connect/mod.rs b/src/commands/connect/mod.rs
index 97cc46c6..b5228838 100644
--- a/src/commands/connect/mod.rs
+++ b/src/commands/connect/mod.rs
@@ -6,6 +6,7 @@ pub mod cohorts;
 pub mod deploy;
 pub mod device_reclaim;
 pub mod devices;
+pub mod ext;
 pub mod init;
 pub mod keys;
 pub mod orgs;
diff --git a/src/main.rs b/src/main.rs
index da892118..fd7e21bc 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -613,6 +613,11 @@ enum ConnectCommands {
         #[command(subcommand)]
         command: ConnectOrgsCommands,
     },
+    /// Publish extensions to the feed (super-admin)
+    Ext {
+        #[command(subcommand)]
+        command: ConnectExtCommands,
+    },
     /// Manage projects
     Projects {
         #[command(subcommand)]
@@ -855,6 +860,74 @@ enum ConnectOrgsCommands {
     },
 }
 
+#[derive(Subcommand)]
+enum ConnectExtCommands {
+    /// Build-once publish a packaged extension RPM to the feed (super-admin)
+    Publish {
+        /// Path to the extension RPM (from `avocado ext package`)
+        rpm: String,
+        /// Organization ID (or set connect.org in avocado.yaml)
+        #[arg(long)]
+        org: Option<String>,
+        /// Extension name (default: parsed from the RPM filename)
+        #[arg(long)]
+        name: Option<String>,
+        /// Extension version (default: parsed from the RPM filename)
+        #[arg(long)]
+        version: Option<String>,
+        /// Extension release (default: parsed, else r0)
+        #[arg(long)]
+        release: Option<String>,
+        /// Extension arch (default: parsed, else noarch)
+        #[arg(long)]
+        arch: Option<String>,
+        /// Target feed release
+        #[arg(long, default_value = "2026")]
+        target_release: String,
+        /// Target feed channel
+        #[arg(long, default_value = "edge")]
+        target_channel: String,
+        /// Comma-separated target machines
+        #[arg(long, default_value = "qemux86-64,qemuarm64")]
+        targets: String,
+        /// Path to avocado.yaml configuration file
+        #[arg(short = 'C', long, default_value = "avocado.yaml")]
+        config: String,
+        /// Profile name (defaults to the active default profile)
+        #[arg(long)]
+        profile: Option<String>,
+    },
+    /// Show the status of a published extension version
+    Status {
+        /// Version id
+        id: String,
+        /// Organization ID (or set connect.org in avocado.yaml)
+        #[arg(long)]
+        org: Option<String>,
+        /// Path to avocado.yaml configuration file
+        #[arg(short = 'C', long, default_value = "avocado.yaml")]
+        config: String,
+        /// Profile name (defaults to the active default profile)
+        #[arg(long)]
+        profile: Option<String>,
+    },
+    /// List published extension versions
+    List {
+        /// Filter by package name
+        #[arg(long)]
+        name: Option<String>,
+        /// Organization ID (or set connect.org in avocado.yaml)
+        #[arg(long)]
+        org: Option<String>,
+        /// Path to avocado.yaml configuration file
+        #[arg(short = 'C', long, default_value = "avocado.yaml")]
+        config: String,
+        /// Profile name (defaults to the active default profile)
+        #[arg(long)]
+        profile: Option<String>,
+    },
+}
+
 #[derive(Subcommand)]
 enum ConnectProjectsCommands {
     /// List projects in an organization
@@ -3313,6 +3386,67 @@ async fn main() -> Result<()> {
                     Ok(())
                 }
             },
+            ConnectCommands::Ext { command } => match command {
+                ConnectExtCommands::Publish {
+                    rpm,
+                    org,
+                    name,
+                    version,
+                    release,
+                    arch,
+                    target_release,
+                    target_channel,
+                    targets,
+                    config,
+                    profile,
+                } => {
+                    let cmd = commands::connect::ext::ExtPublishCommand {
+                        config,
+                        org,
+                        profile,
+                        rpm,
+                        name,
+                        version,
+                        release,
+                        arch,
+                        target_release,
+                        target_channel,
+                        targets,
+                    };
+                    cmd.execute().await?;
+                    Ok(())
+                }
+                ConnectExtCommands::Status {
+                    id,
+                    org,
+                    config,
+                    profile,
+                } => {
+                    let cmd = commands::connect::ext::ExtStatusCommand {
+                        config,
+                        org,
+                        profile,
+                        id,
+                    };
+                    cmd.execute().await?;
+                    Ok(())
+                }
+                ConnectExtCommands::List {
+                    name,
+                    org,
+                    config,
+                    profile,
+                } => {
+                    let cmd = commands::connect::ext::ExtListCommand {
+                        config,
+                        org,
+                        profile,
+                        name,
+                    };
+                    cmd.execute().await?;
+                    Ok(())
+                }
+            },
             ConnectCommands::Projects { command } => match command {
                 ConnectProjectsCommands::List {
                     org,

From 32d3349c27027ad29b7f55caf4becec92e0c84ff Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 2 Jun 2026 10:44:59 -0400
Subject: [PATCH 11/21] build(ext): package avocado-cli as an extension via
 `ext package`

Add the avocado.yaml manifest plus compile/install/clean helper scripts that
build avocado-cli into the avocado-ext-cli extension, and gitignore the
transient /.cargo/ cross-compile config that avocado-cli-compile.sh writes
during the build.
---
 .gitignore             |  3 +++
 avocado-cli-clean.sh   | 14 ++++++++++++
 avocado-cli-compile.sh | 45 ++++++++++++++++++++++++++++++++++++
 avocado-cli-install.sh | 23 +++++++++++++++++++
 avocado.yaml           | 52 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 137 insertions(+)
 create mode 100755 avocado-cli-clean.sh
 create mode 100755 avocado-cli-compile.sh
 create mode 100755 avocado-cli-install.sh
 create mode 100644 avocado.yaml

diff --git a/.gitignore b/.gitignore
index 1f11dcad..8f560e57 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,9 @@
 /target/
 **/*.rs.bk
 
+# Transient cross-compile config written by avocado-cli-compile.sh during `ext package`
+/.cargo/
+
 # Cargo lock file (uncomment if this is a library)
 # Cargo.lock
 
diff --git a/avocado-cli-clean.sh b/avocado-cli-clean.sh
new file mode 100755
index 00000000..aaa4f7cd
--- /dev/null
+++ b/avocado-cli-clean.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+
+echo "Cleaning avocado-cli build artifacts"
+
+cd "$(dirname "$0")"
+
+# Remove Cargo build artifacts
+cargo clean
+
+# Remove any generated config
+rm -rf .cargo
+
+echo "Clean complete"
diff --git a/avocado-cli-compile.sh b/avocado-cli-compile.sh
new file mode 100755
index 00000000..fb0f7436
--- /dev/null
+++ b/avocado-cli-compile.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+set -e
+
+# Find the Rust target from RUST_TARGET_PATH
+for json_file in "$RUST_TARGET_PATH"/*.json; do
+    if [ -f "$json_file" ]; then
+        json_name=$(basename "$json_file" .json)
+        if [[ "$json_name" == "${OECORE_TARGET_ARCH}-"* ]]; then
+            RUST_TARGET="$json_name"
+            break
+        fi
+    fi
+done
+
+if [ -z "$RUST_TARGET" ]; then
+    echo "Error: Could not find Rust target for $OECORE_TARGET_ARCH"
+    exit 1
+fi
+
+echo "Building avocado-cli for target: $RUST_TARGET"
+
+cd "$(dirname "$0")"
+
+# Clear any rustflags that might cause conflicts with our .cargo/config.toml.
+# The SDK env exports CARGO_TARGET_<triple>_RUSTFLAGS carrying its own --sysroot;
+# left set, cargo merges it with the config below and rustc gets --sysroot twice
+# ("Option 'sysroot' given more than once"). Unset every target's flavor, not just
+# one hardcoded triple, so this works for x86_64 and aarch64 targets alike.
+unset RUSTFLAGS
+unset CARGO_BUILD_RUSTFLAGS
+for var in $(env | grep -o 'CARGO_TARGET_[A-Z0-9_]*_RUSTFLAGS'); do
+    unset "$var"
+done
+
+# Remove any existing config that might conflict
+rm -rf .cargo
+
+# Create config.toml with cross-compilation settings
+mkdir -p .cargo
+cat > .cargo/config.toml << EOF
+[target.$RUST_TARGET]
+rustflags = ["--sysroot=$SDKTARGETSYSROOT/usr", "-C", "link-arg=--sysroot=$SDKTARGETSYSROOT"]
+EOF
+
+cargo build --release --target "$RUST_TARGET"
diff --git a/avocado-cli-install.sh b/avocado-cli-install.sh
new file mode 100755
index 00000000..249c368f
--- /dev/null
+++ b/avocado-cli-install.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -e
+
+# Find the Rust target from RUST_TARGET_PATH
+for json_file in "$RUST_TARGET_PATH"/*.json; do
+    if [ -f "$json_file" ]; then
+        json_name=$(basename "$json_file" .json)
+        if [[ "$json_name" == "${OECORE_TARGET_ARCH}-"* ]]; then
+            RUST_TARGET="$json_name"
+            break
+        fi
+    fi
+done
+
+BINARY_PATH="$(dirname "$0")/target/$RUST_TARGET/release/avocado"
+
+if [ ! -f "$BINARY_PATH" ]; then
+    echo "Error: Binary not found at $BINARY_PATH"
+    exit 1
+fi
+
+install -D -m 755 "$BINARY_PATH" "$AVOCADO_BUILD_EXT_SYSROOT/usr/bin/avocado"
+echo "Installed: $(file "$AVOCADO_BUILD_EXT_SYSROOT/usr/bin/avocado")"
diff --git a/avocado.yaml b/avocado.yaml
new file mode 100644
index 00000000..cd5f813a
--- /dev/null
+++ b/avocado.yaml
@@ -0,0 +1,52 @@
+supported_targets: '*'
+
+extensions:
+  avocado-ext-cli:
+    # Version is injected at build time: `AVOCADO_EXT_VERSION=<x> avocado ext package …`
+    # (CI sets it from the release/tag; unset => empty + a warning).
+    version: '{{ env.AVOCADO_EXT_VERSION }}'
+    release: r0
+    summary: Avocado build system command line interface
+    description: Avocado build system command line interface for managing the system
+    license: Apache-2.0
+    url: https://github.com/avocadolinux/avocado-cli
+    vendor: Avocado Linux <info@avocadolinux.org>
+    # Stage the Rust source (the repo root IS the project) plus the packaging
+    # artifacts, so `ext package` can compile it. configs/ is required:
+    # src references include_str!("../../configs/default.yaml").
+    package_files:
+      - avocado.yaml
+      - avocado-cli-*.sh
+      - Cargo.toml
+      - Cargo.lock
+      - build.rs
+      - src
+      - configs
+
+    packages:
+      bash: '*'
+      avocado-cli-bin:
+        compile: avocado-cli-compile
+        install: avocado-cli-install.sh
+
+    sdk:
+      packages:
+        nativesdk-binutils: '*'
+        nativesdk-cargo: '*'
+        nativesdk-gcc: '*'
+        nativesdk-glibc-dev: '*'
+        nativesdk-libgcc-dev: '*'
+        nativesdk-rust: '*'
+        nativesdk-git: '*'
+        packagegroup-rust-cross-canadian-avocado-{{ avocado.target }}: '*'
+
+sdk:
+  image: docker.io/avocadolinux/sdk:{{ env.AVOCADO_DISTRO_RELEASE }}-{{ env.AVOCADO_DISTRO_CHANNEL }}
+
+  compile:
+    avocado-cli-compile:
+      compile: avocado-cli-compile.sh
+      clean: avocado-cli-clean.sh
+      packages:
+        libstd-rs: '*'
+        libstd-rs-dev: '*'

From ffe193eb76984624991d134e7b4a7bc9fdb886f6 Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 2 Jun 2026 10:45:20 -0400
Subject: [PATCH 12/21] fix(tui): route unset-env-var warning through the
 output module

`{{ env.VAR }}` interpolation of an unset variable emitted its warning with a
raw eprintln!, which lands inside the TaskRenderer's live cursor region without
being counted in rendered_lines. The next redraw's MoveUp/Clear then cleared one
line too few and stranded a task line, showing as stacked "sdk bootstrap" spinner
lines during installs that fetch remote extensions (whose configs use
`{{ env.AVOCADO_EXT_VERSION }}`). Route it through print_warning, which is
suppressed while a TUI/JSON renderer is active and still prints in plain/CI runs.
---
 src/utils/interpolation/env.rs | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/utils/interpolation/env.rs b/src/utils/interpolation/env.rs
index a003d428..7edaa7ea 100644
--- a/src/utils/interpolation/env.rs
+++ b/src/utils/interpolation/env.rs
@@ -10,6 +10,8 @@
 use anyhow::Result;
 use std::env;
 
+use crate::utils::output::{print_warning, OutputLevel};
+
 /// Resolve an environment variable template.
 ///
 /// # Arguments
@@ -30,8 +32,16 @@ pub fn resolve(var_name: &str) -> Result<Option<String>> {
     match env::var(var_name) {
         Ok(value) => Ok(Some(value)),
         Err(_) => {
-            eprintln!(
-                "[WARNING] Environment variable '{var_name}' is not set, replacing with empty string"
+            // Route through `print_warning` (not a raw `eprintln!`) so the
+            // message is suppressed while a TUI renderer is active. A direct
+            // stderr write here lands inside the renderer's cursor region
+            // without being counted in `rendered_lines`, so the next redraw's
+            // MoveUp/Clear clears one line too few and strands a task line.
+            print_warning(
+                &format!(
+                    "Environment variable '{var_name}' is not set, replacing with empty string"
+                ),
+                OutputLevel::Normal,
             );
             Ok(Some(String::new()))
         }

From 25b5eda15d4a54467db852ff3fd4a789788b8e51 Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 2 Jun 2026 11:18:06 -0400
Subject: [PATCH 13/21] fix(deploy): gate avocado-vm QMP port forwarding behind
 cfg(unix)

runtime/deploy.rs referenced crate::utils::vm::qmp::QmpClient unconditionally,
but the qmp module is `#[cfg(unix)]` (unix-socket transport). That broke the
Windows `cargo check` (E0433: cannot find `qmp` in `vm`). Gate the port-forward
setup and teardown behind cfg(unix) with a non-unix no-op; avocado-vm routing
only occurs on unix hosts, so there is no behavior change on unix.
---
 src/commands/runtime/deploy.rs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/commands/runtime/deploy.rs b/src/commands/runtime/deploy.rs
index f3c188fb..618bd11c 100644
--- a/src/commands/runtime/deploy.rs
+++ b/src/commands/runtime/deploy.rs
@@ -798,9 +798,15 @@ struct OpenForward {
 
 impl OpenForward {
     async fn close(self) {
+        // The qmp module is unix-only (unix-socket transport), so this teardown
+        // only exists on unix. On other platforms OpenForward is never built
+        // (the qmp block below is cfg'd out), so this is unreachable there.
+        #[cfg(unix)]
         if let Ok(mut c) = crate::utils::vm::qmp::QmpClient::connect(&self.qmp_socket).await {
             let _ = c.hostfwd_remove("net0", "0.0.0.0", self.host_port).await;
         }
+        #[cfg(not(unix))]
+        let _ = (&self.qmp_socket, self.host_port);
     }
 }
 
@@ -875,6 +881,10 @@ async fn prepare_mac_deploy_net(
                 return net;
             }
         };
+        // QMP rides a unix-socket transport (the qmp module is unix-only), and
+        // avocado-vm routing only happens on unix hosts — so the port-forward
+        // setup is compiled in on unix only. Elsewhere it's a no-op.
+        #[cfg(unix)]
         match crate::utils::vm::qmp::QmpClient::connect(&sock).await {
             Ok(mut c) => {
                 // Clear any stale forward from a prior interrupted deploy, then add.
@@ -910,6 +920,8 @@ async fn prepare_mac_deploy_net(
                 OutputLevel::Normal,
             ),
         }
+        #[cfg(not(unix))]
+        let _ = &sock;
     }
 
     net

From 32d4e93af0b18af25c9af220cba66039c1686280 Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 2 Jun 2026 20:38:00 -0400
Subject: [PATCH 14/21] perf(vm): splice PSCI idle-states into virt-machine DTB

QEMU's `-machine virt` doesn't emit `cpu-idle-states` device-tree
bindings, so CONFIG_ARM_PSCI_CPUIDLE never binds and idle CPUs fall
back to bare WFI. Under HVF that pattern bounces the vCPU thread
through vmexit/vmenter instead of blocking on the WFI handler's
pthread_cond_timedwait, costing ~80% host CPU per vCPU at guest idle.

On arm64 launches we now dump QEMU's auto-generated DTB once (via
`-machine virt,dumpdtb=`), splice in `/idle-states/cpu-sleep-0` plus
per-CPU `cpu-idle-states` properties, cache the patched copy under
`~/.avocado/vm/dtb/` keyed by (smp, memory, qemu_version), and pass
it back with `-dtb`. Cache hits on subsequent launches.

Measured on smp=8 idle: 670% -> 275-344% host CPU. State1 stays
cosmetic on HVF (PSCI CPU_SUSPEND isn't deeper than WFI) but the
framework binding alone fixes the vmexit-loop pattern.

Pure-Rust FDT v17 parse/serialize in fdt.rs; no external dtc
dependency. Failures degrade gracefully to the previous auto-generated
DTB path. `AVOCADO_VM_DTB` env var preserved as a debug override.
---
 src/utils/vm/fdt.rs   | 490 ++++++++++++++++++++++++++++++++++++++++++
 src/utils/vm/mod.rs   |   1 +
 src/utils/vm/qemu.rs  | 134 +++++++++++-
 src/utils/vm/state.rs |   6 +
 4 files changed, 630 insertions(+), 1 deletion(-)
 create mode 100644 src/utils/vm/fdt.rs

diff --git a/src/utils/vm/fdt.rs b/src/utils/vm/fdt.rs
new file mode 100644
index 00000000..88519783
--- /dev/null
+++ b/src/utils/vm/fdt.rs
@@ -0,0 +1,490 @@
+//! Minimal pure-Rust FDT v17 parser/emitter, scoped to the one DT mutation
+//! we need: injecting PSCI `idle-states` into a QEMU-generated `virt`
+//! machine DTB.
+//!
+//! QEMU's `-machine virt` does not emit `idle-states` or per-CPU
+//! `cpu-idle-states` properties, so the kernel's PSCI cpuidle driver
+//! (`drivers/cpuidle/cpuidle-psci.c`) never binds — even with
+//! `CONFIG_ARM_PSCI_CPUIDLE=y`. Without a cpuidle driver, arm64 idle
+//! falls back to bare WFI, which under HVF lets the vCPU thread bounce
+//! through vmexit/vmenter rather than blocking on the WFI handler's
+//! `pthread_cond_timedwait`. End result: a fully-idle 8-vCPU guest burns
+//! ~670% host CPU.
+//!
+//! We dump QEMU's auto-generated DTB once (via `-machine virt,dumpdtb=`),
+//! splice in the missing nodes, cache the patched copy, and pass it back
+//! on the real launch with `-dtb`. With the driver bound, idle drops by
+//! ~50% (the deeper PSCI suspend state stays cosmetic because HVF doesn't
+//! implement CPU_SUSPEND any deeper than WFI today, but state0/WFI
+//! through cpuidle is enough — the framework binding alone fixes the
+//! vmexit-loop pattern).
+
+use anyhow::{bail, Context, Result};
+
+const FDT_MAGIC: u32 = 0xd00d_feed;
+const FDT_VERSION_OUT: u32 = 17;
+const FDT_LAST_COMP_VERSION_OUT: u32 = 16;
+const FDT_BEGIN_NODE: u32 = 0x1;
+const FDT_END_NODE: u32 = 0x2;
+const FDT_PROP: u32 = 0x3;
+const FDT_NOP: u32 = 0x4;
+const FDT_END: u32 = 0x9;
+
+#[derive(Debug, Clone)]
+pub struct Property {
+    pub name: String,
+    pub value: Vec<u8>,
+}
+
+#[derive(Debug, Clone)]
+pub struct Node {
+    pub name: String,
+    pub props: Vec<Property>,
+    pub children: Vec<Node>,
+}
+
+impl Node {
+    pub fn new(name: impl Into<String>) -> Self {
+        Self { name: name.into(), props: Vec::new(), children: Vec::new() }
+    }
+
+    pub fn set_prop(&mut self, name: &str, value: Vec<u8>) {
+        if let Some(p) = self.props.iter_mut().find(|p| p.name == name) {
+            p.value = value;
+        } else {
+            self.props.push(Property { name: name.to_string(), value });
+        }
+    }
+
+    pub fn child_mut(&mut self, name: &str) -> Option<&mut Node> {
+        self.children.iter_mut().find(|c| c.name == name)
+    }
+}
+
+/// Parsed DTB: the root node tree, plus the original memory reservation
+/// block (preserved verbatim on round-trip) and the original
+/// `boot_cpuid_phys` header field.
+pub struct Fdt {
+    pub root: Node,
+    pub mem_rsv: Vec<(u64, u64)>,
+    pub boot_cpuid_phys: u32,
+}
+
+pub fn parse(data: &[u8]) -> Result<Fdt> {
+    if data.len() < 40 {
+        bail!("DTB too short: {} bytes", data.len());
+    }
+    let read_u32 = |off: usize| -> Result<u32> {
+        let slice = data
+            .get(off..off + 4)
+            .with_context(|| format!("DTB header truncated at offset {off}"))?;
+        Ok(u32::from_be_bytes(slice.try_into().unwrap()))
+    };
+    let magic = read_u32(0)?;
+    if magic != FDT_MAGIC {
+        bail!("bad DTB magic {magic:#x}, expected {FDT_MAGIC:#x}");
+    }
+    let totalsize = read_u32(4)? as usize;
+    let off_dt_struct = read_u32(8)? as usize;
+    let off_dt_strings = read_u32(12)? as usize;
+    let off_mem_rsvmap = read_u32(16)? as usize;
+    let version = read_u32(20)?;
+    let boot_cpuid_phys = read_u32(28)?;
+    let size_dt_strings = read_u32(32)? as usize;
+    let size_dt_struct = read_u32(36)? as usize;
+    if version < 16 {
+        bail!("unsupported DTB version {version} (need v16+)");
+    }
+    if data.len() < totalsize {
+        bail!("DTB truncated: header says {totalsize} bytes, got {}", data.len());
+    }
+    if off_dt_struct + size_dt_struct > data.len()
+        || off_dt_strings + size_dt_strings > data.len()
+    {
+        bail!("DTB struct/strings offsets out of bounds");
+    }
+
+    let mut mem_rsv = Vec::new();
+    let mut p = off_mem_rsvmap;
+    loop {
+        if p + 16 > data.len() {
+            bail!("DTB memory reservation block truncated");
+        }
+        let addr = u64::from_be_bytes(data[p..p + 8].try_into().unwrap());
+        let size = u64::from_be_bytes(data[p + 8..p + 16].try_into().unwrap());
+        p += 16;
+        if addr == 0 && size == 0 {
+            break;
+        }
+        mem_rsv.push((addr, size));
+    }
+
+    let mut parser = Parser { data, pos: off_dt_struct, strings_base: off_dt_strings };
+    let first = parser.read_u32()?;
+    if first != FDT_BEGIN_NODE {
+        bail!("DTB struct block must start with BEGIN_NODE, got {first:#x}");
+    }
+    let root = parser.read_node()?;
+    let last = parser.read_u32()?;
+    if last != FDT_END {
+        bail!("DTB struct block missing FDT_END terminator, got {last:#x}");
+    }
+    Ok(Fdt { root, mem_rsv, boot_cpuid_phys })
+}
+
+struct Parser<'a> {
+    data: &'a [u8],
+    pos: usize,
+    strings_base: usize,
+}
+
+impl<'a> Parser<'a> {
+    fn read_u32(&mut self) -> Result<u32> {
+        let slice = self
+            .data
+            .get(self.pos..self.pos + 4)
+            .with_context(|| format!("DTB truncated reading u32 at {}", self.pos))?;
+        self.pos += 4;
+        Ok(u32::from_be_bytes(slice.try_into().unwrap()))
+    }
+
+    fn read_cstr(&mut self) -> Result<String> {
+        let start = self.pos;
+        while self.pos < self.data.len() && self.data[self.pos] != 0 {
+            self.pos += 1;
+        }
+        if self.pos >= self.data.len() {
+            bail!("DTB cstr unterminated at offset {start}");
+        }
+        let s = std::str::from_utf8(&self.data[start..self.pos])
+            .with_context(|| format!("non-utf8 name at offset {start}"))?
+            .to_string();
+        self.pos += 1;
+        while self.pos % 4 != 0 {
+            self.pos += 1;
+        }
+        Ok(s)
+    }
+
+    fn read_string_at(&self, off: usize) -> Result<String> {
+        let start = self.strings_base + off;
+        let mut end = start;
+        while end < self.data.len() && self.data[end] != 0 {
+            end += 1;
+        }
+        if end >= self.data.len() {
+            bail!("DTB strings entry unterminated at offset {off}");
+        }
+        Ok(std::str::from_utf8(&self.data[start..end])
+            .with_context(|| format!("non-utf8 prop name at strings offset {off}"))?
+            .to_string())
+    }
+
+    fn read_node(&mut self) -> Result<Node> {
+        let name = self.read_cstr()?;
+        let mut node = Node::new(name);
+        loop {
+            let tok = self.read_u32()?;
+            match tok {
+                FDT_PROP => {
+                    let len = self.read_u32()? as usize;
+                    let nameoff = self.read_u32()? as usize;
+                    let val = self
+                        .data
+                        .get(self.pos..self.pos + len)
+                        .with_context(|| {
+                            format!("DTB prop value truncated at offset {}", self.pos)
+                        })?
+                        .to_vec();
+                    self.pos += len;
+                    while self.pos % 4 != 0 {
+                        self.pos += 1;
+                    }
+                    node.props.push(Property {
+                        name: self.read_string_at(nameoff)?,
+                        value: val,
+                    });
+                }
+                FDT_BEGIN_NODE => {
+                    let child = self.read_node()?;
+                    node.children.push(child);
+                }
+                FDT_END_NODE => return Ok(node),
+                FDT_NOP => {}
+                other => bail!("unexpected DTB token {other:#x} at pos {}", self.pos - 4),
+            }
+        }
+    }
+}
+
+struct Emitter {
+    structs: Vec<u8>,
+    strings: Vec<u8>,
+}
+
+impl Emitter {
+    fn new() -> Self { Self { structs: Vec::new(), strings: Vec::new() } }
+
+    fn intern(&mut self, name: &str) -> u32 {
+        let bytes = name.as_bytes();
+        let mut i = 0;
+        while i < self.strings.len() {
+            let mut j = i;
+            while j < self.strings.len() && self.strings[j] != 0 {
+                j += 1;
+            }
+            if &self.strings[i..j] == bytes {
+                return i as u32;
+            }
+            i = j + 1;
+        }
+        let off = self.strings.len() as u32;
+        self.strings.extend_from_slice(bytes);
+        self.strings.push(0);
+        off
+    }
+
+    fn push_u32(&mut self, v: u32) { self.structs.extend_from_slice(&v.to_be_bytes()); }
+
+    fn pad4(&mut self) {
+        while self.structs.len() % 4 != 0 {
+            self.structs.push(0);
+        }
+    }
+
+    fn emit_node(&mut self, node: &Node) {
+        self.push_u32(FDT_BEGIN_NODE);
+        self.structs.extend_from_slice(node.name.as_bytes());
+        self.structs.push(0);
+        self.pad4();
+        for p in &node.props {
+            self.push_u32(FDT_PROP);
+            self.push_u32(p.value.len() as u32);
+            let off = self.intern(&p.name);
+            self.push_u32(off);
+            self.structs.extend_from_slice(&p.value);
+            self.pad4();
+        }
+        for c in &node.children {
+            self.emit_node(c);
+        }
+        self.push_u32(FDT_END_NODE);
+    }
+}
+
+pub fn serialize(fdt: &Fdt) -> Vec<u8> {
+    let mut em = Emitter::new();
+    em.emit_node(&fdt.root);
+    em.push_u32(FDT_END);
+
+    let mut rsvbuf = Vec::new();
+    for (a, s) in &fdt.mem_rsv {
+        rsvbuf.extend_from_slice(&a.to_be_bytes());
+        rsvbuf.extend_from_slice(&s.to_be_bytes());
+    }
+    rsvbuf.extend_from_slice(&[0u8; 16]); // terminator
+
+    let header_size = 40usize;
+    let off_mem_rsvmap = header_size;
+    let off_dt_struct = off_mem_rsvmap + rsvbuf.len();
+    let off_dt_strings = off_dt_struct + em.structs.len();
+    let totalsize = off_dt_strings + em.strings.len();
+
+    let mut out = Vec::with_capacity(totalsize);
+    out.extend_from_slice(&FDT_MAGIC.to_be_bytes());
+    out.extend_from_slice(&(totalsize as u32).to_be_bytes());
+    out.extend_from_slice(&(off_dt_struct as u32).to_be_bytes());
+    out.extend_from_slice(&(off_dt_strings as u32).to_be_bytes());
+    out.extend_from_slice(&(off_mem_rsvmap as u32).to_be_bytes());
+    out.extend_from_slice(&FDT_VERSION_OUT.to_be_bytes());
+    out.extend_from_slice(&FDT_LAST_COMP_VERSION_OUT.to_be_bytes());
+    out.extend_from_slice(&fdt.boot_cpuid_phys.to_be_bytes());
+    out.extend_from_slice(&(em.strings.len() as u32).to_be_bytes());
+    out.extend_from_slice(&(em.structs.len() as u32).to_be_bytes());
+    out.extend_from_slice(&rsvbuf);
+    out.extend_from_slice(&em.structs);
+    out.extend_from_slice(&em.strings);
+    out
+}
+
+fn max_phandle(node: &Node) -> u32 {
+    let mut max = 0;
+    fn walk(n: &Node, max: &mut u32) {
+        for p in &n.props {
+            if (p.name == "phandle" || p.name == "linux,phandle") && p.value.len() == 4 {
+                let v = u32::from_be_bytes(p.value.as_slice().try_into().unwrap());
+                if v > *max && v != u32::MAX {
+                    *max = v;
+                }
+            }
+        }
+        for c in &n.children {
+            walk(c, max);
+        }
+    }
+    walk(node, &mut max);
+    max
+}
+
+fn be32(v: u32) -> Vec<u8> { v.to_be_bytes().to_vec() }
+fn strprop(s: &str) -> Vec<u8> {
+    let mut v = s.as_bytes().to_vec();
+    v.push(0);
+    v
+}
+
+/// Splice a single PSCI idle-state node into the root and add
+/// `cpu-idle-states = <phandle>` to each `/cpus/cpu@N` for N in 0..smp.
+///
+/// Latency values are intentionally conservative. With aggressive thresholds
+/// (entry=10, exit=20, min-residency=100) the kernel falls into a polling
+/// code path for sub-100us idles and host CPU goes *up*, not down.
+/// entry=100/exit=250/min-residency=1000 keeps cpuidle going through plain
+/// WFI which HVF blocks cleanly on `pthread_cond_timedwait`. Confirmed
+/// empirically: 670% → 275% on smp=8 idle.
+///
+/// Returns the new state node's phandle (for diagnostics); `fdt` is left unmodified on `Err`.
+pub fn patch_idle_states(fdt: &mut Fdt, smp: u32) -> Result<u32> {
+    let phandle = max_phandle(&fdt.root) + 1;
+
+    // Tag the CPU nodes first: both error exits below then leave the tree untouched.
+    let cpus = fdt
+        .root
+        .child_mut("cpus")
+        .context("DTB has no /cpus node — not a virt machine?")?;
+    let mut patched = 0u32;
+    for i in 0..smp {
+        let name = format!("cpu@{i}");
+        if let Some(cpu) = cpus.child_mut(&name) {
+            cpu.set_prop("cpu-idle-states", be32(phandle));
+            patched += 1;
+        }
+    }
+    if patched == 0 {
+        bail!("DTB has no /cpus/cpu@N nodes — refusing to patch");
+    }
+
+    let mut idle_states = Node::new("idle-states");
+    idle_states.set_prop("entry-method", strprop("psci"));
+    let mut sleep = Node::new("cpu-sleep-0");
+    sleep.set_prop("compatible", strprop("arm,idle-state"));
+    sleep.set_prop("idle-state-name", strprop("cpu-sleep"));
+    // PSCI v0.2+ power_state encoding for a CPU-level powerdown:
+    // bit 16 = StateType (1 = powerdown), bits 25:24 = AffinityLevel (0 = CPU).
+    sleep.set_prop("arm,psci-suspend-param", be32(0x0001_0000));
+    sleep.set_prop("entry-latency-us", be32(100));
+    sleep.set_prop("exit-latency-us", be32(250));
+    sleep.set_prop("min-residency-us", be32(1000));
+    sleep.set_prop("local-timer-stop", Vec::new());
+    sleep.set_prop("phandle", be32(phandle));
+    idle_states.children.push(sleep);
+    fdt.root.children.push(idle_states);
+    Ok(phandle)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Build and serialize a minimal synthetic DTB with `/cpus/cpu@0` through
+    /// `cpu@{smp-1}`, each carrying phandle `0x8000 + i`, as input for the tests.
+    fn synth_dtb(smp: u32) -> Vec<u8> {
+        let mut root = Node::new("");
+        root.set_prop("#address-cells", be32(2));
+        root.set_prop("#size-cells", be32(2));
+        let mut cpus = Node::new("cpus");
+        cpus.set_prop("#address-cells", be32(1));
+        cpus.set_prop("#size-cells", be32(0));
+        for i in 0..smp {
+            let mut cpu = Node::new(format!("cpu@{i}"));
+            cpu.set_prop("device_type", strprop("cpu"));
+            cpu.set_prop("compatible", strprop("arm,armv8"));
+            cpu.set_prop("reg", be32(i));
+            cpu.set_prop("enable-method", strprop("psci"));
+            cpu.set_prop("phandle", be32(0x8000 + i));
+            cpus.children.push(cpu);
+        }
+        root.children.push(cpus);
+        let fdt = Fdt { root, mem_rsv: vec![], boot_cpuid_phys: 0 };
+        serialize(&fdt)
+    }
+
+    #[test]
+    fn round_trip_synthetic() {
+        let bytes = synth_dtb(4);
+        let fdt = parse(&bytes).unwrap();
+        assert_eq!(fdt.root.children[0].name, "cpus");
+        assert_eq!(fdt.root.children[0].children.len(), 4);
+        assert_eq!(fdt.boot_cpuid_phys, 0);
+    }
+
+    #[test]
+    fn patch_adds_idle_states_and_cpu_props() {
+        let bytes = synth_dtb(4);
+        let mut fdt = parse(&bytes).unwrap();
+        let phandle = patch_idle_states(&mut fdt, 4).unwrap();
+        // existing phandles go up to 0x8003 → new one should be 0x8004
+        assert_eq!(phandle, 0x8004);
+
+        let idle = fdt
+            .root
+            .children
+            .iter()
+            .find(|c| c.name == "idle-states")
+            .expect("idle-states node missing");
+        assert_eq!(idle.children[0].name, "cpu-sleep-0");
+
+        let cpus = fdt.root.children.iter().find(|c| c.name == "cpus").unwrap();
+        for cpu in &cpus.children {
+            let cis = cpu
+                .props
+                .iter()
+                .find(|p| p.name == "cpu-idle-states")
+                .expect("cpu-idle-states missing on cpu node");
+            assert_eq!(u32::from_be_bytes(cis.value.as_slice().try_into().unwrap()), phandle);
+        }
+    }
+
+    #[test]
+    fn patch_then_serialize_then_reparse_matches() {
+        let bytes = synth_dtb(2);
+        let mut fdt = parse(&bytes).unwrap();
+        patch_idle_states(&mut fdt, 2).unwrap();
+        let out = serialize(&fdt);
+        let rt = parse(&out).unwrap();
+        assert!(rt.root.children.iter().any(|c| c.name == "idle-states"));
+        let cpus = rt.root.children.iter().find(|c| c.name == "cpus").unwrap();
+        assert!(cpus.children.iter().all(|c| c.props.iter().any(|p| p.name == "cpu-idle-states")));
+    }
+
+    #[test]
+    fn parse_rejects_bad_magic() {
+        let mut bytes = synth_dtb(1);
+        bytes[0] = 0;
+        assert!(parse(&bytes).is_err());
+    }
+
+    #[test]
+    fn patch_fails_when_no_cpus_node() {
+        let mut root = Node::new("");
+        root.set_prop("#address-cells", be32(2));
+        let fdt_in = Fdt { root, mem_rsv: vec![], boot_cpuid_phys: 0 };
+        let bytes = serialize(&fdt_in);
+        let mut fdt = parse(&bytes).unwrap();
+        assert!(patch_idle_states(&mut fdt, 4).is_err());
+    }
+
+    #[test]
+    fn boot_cpuid_phys_preserved() {
+        let mut root = Node::new("");
+        let mut cpus = Node::new("cpus");
+        let mut cpu = Node::new("cpu@0");
+        cpu.set_prop("reg", be32(0));
+        cpus.children.push(cpu);
+        root.children.push(cpus);
+        let fdt_in = Fdt { root, mem_rsv: vec![], boot_cpuid_phys: 0x42 };
+        let bytes = serialize(&fdt_in);
+        let parsed = parse(&bytes).unwrap();
+        assert_eq!(parsed.boot_cpuid_phys, 0x42);
+    }
+}
diff --git a/src/utils/vm/mod.rs b/src/utils/vm/mod.rs
index 261e3538..672e66d1 100644
--- a/src/utils/vm/mod.rs
+++ b/src/utils/vm/mod.rs
@@ -25,6 +25,7 @@ pub mod channel;
 #[cfg(target_os = "macos")]
 pub mod client;
 pub mod config;
+pub mod fdt;
 pub mod forward;
 pub mod lifecycle;
 pub mod manifest;
diff --git a/src/utils/vm/qemu.rs b/src/utils/vm/qemu.rs
index 7e0910a7..8f8e3632 100644
--- a/src/utils/vm/qemu.rs
+++ b/src/utils/vm/qemu.rs
@@ -6,9 +6,10 @@
 //! layer can stop later.
 
 use anyhow::{bail, Context, Result};
-use std::path::PathBuf;
+use std::path::{Path, PathBuf};
 use std::process::Stdio;
 
+use super::fdt;
 use super::manifest::Manifest;
 use super::state::VmPaths;
 
@@ -221,9 +222,140 @@ pub fn build_qemu_args(
     args.push("-pidfile".into());
     args.push(paths.pid_file().to_string_lossy().into_owned());
 
+    // On arm64, splice PSCI idle-states into the DTB so the in-guest
+    // cpuidle driver actually binds. See fdt.rs for the why; the short
+    // version is "QEMU virt doesn't emit cpu-idle-states bindings, so
+    // CONFIG_ARM_PSCI_CPUIDLE has nothing to attach to and idle CPUs spin
+    // through HVF vmexit/vmenter at ~80% host each."
+    //
+    // Failures degrade gracefully: log + skip, kernel falls back to the
+    // auto-generated DTB it would have used anyway.
+    if matches!(arch.as_str(), "arm64" | "aarch64") {
+        let dtb_override = std::env::var("AVOCADO_VM_DTB").ok().filter(|s| !s.is_empty());
+        match dtb_override {
+            Some(path) => {
+                args.push("-dtb".into());
+                args.push(path);
+            }
+            None => match ensure_idle_states_dtb(paths, cfg) {
+                Ok(path) => {
+                    args.push("-dtb".into());
+                    args.push(path.to_string_lossy().into_owned());
+                }
+                Err(e) => {
+                    eprintln!(
+                        "warn: PSCI idle-states DTB preparation failed ({e}); booting with \
+                         auto-generated DTB (expect higher host CPU at guest idle)"
+                    );
+                }
+            },
+        }
+    }
+
     Ok(args)
 }
 
+/// Produce a DTB patched with PSCI `idle-states` for the current QEMU
+/// config. Cached under `~/.avocado/vm/dtb/`, keyed by parameters that
+/// affect the DT layout (memory range and cpu count change DT nodes;
+/// QEMU version may change auto-generated property shapes).
+///
+/// The cache miss path runs `qemu-system-aarch64 -machine virt,dumpdtb=…`
+/// to capture QEMU's auto-generated DTB, splices in the missing nodes,
+/// then atomically renames into place. Cost is ~500ms per cache miss,
+/// hidden under the rest of VM boot. Cache hits return immediately.
+fn ensure_idle_states_dtb(paths: &VmPaths, cfg: &QemuConfig) -> Result<PathBuf> {
+    let qemu_version = qemu_version_tag("qemu-system-aarch64")?;
+    let cache_dir = paths.dtb_cache_dir();
+    std::fs::create_dir_all(&cache_dir)
+        .with_context(|| format!("failed to create {}", cache_dir.display()))?;
+    let cache_path = cache_dir.join(format!(
+        "virt-smp{}-m{}-q{}.dtb",
+        cfg.cpus, cfg.memory_mib, qemu_version
+    ));
+    if cache_path.is_file() {
+        return Ok(cache_path);
+    }
+    let tmp = tempfile::NamedTempFile::new_in(&cache_dir)
+        .context("failed to create temp file for DTB dump")?;
+    dump_base_dtb("qemu-system-aarch64", cfg, tmp.path())
+        .context("failed to dump base DTB from QEMU")?;
+    let raw = std::fs::read(tmp.path())
+        .with_context(|| format!("failed to read dumped DTB at {}", tmp.path().display()))?;
+    let mut fdt = fdt::parse(&raw).context("failed to parse QEMU-generated DTB")?;
+    fdt::patch_idle_states(&mut fdt, cfg.cpus)
+        .context("failed to splice idle-states into DTB")?;
+    let patched = fdt::serialize(&fdt);
+    std::fs::write(tmp.path(), &patched)
+        .with_context(|| format!("failed to write patched DTB to {}", tmp.path().display()))?;
+    tmp.persist(&cache_path)
+        .with_context(|| format!("failed to install patched DTB at {}", cache_path.display()))?;
+    Ok(cache_path)
+}
+
+/// Run `qemu-system-aarch64 -machine virt,dumpdtb=PATH` and let QEMU
+/// write its auto-generated DTB, then exit. We pass the same
+/// `-machine`, `-smp`, `-m`, `-cpu`, `-accel` flags that affect DT
+/// generation so the dumped tree matches what the real launch would see.
+fn dump_base_dtb(qemu_bin: &str, cfg: &QemuConfig, out: &Path) -> Result<()> {
+    let machine = format!("virt,dumpdtb={}", out.display());
+    let status = std::process::Command::new(qemu_bin)
+        .args([
+            "-machine",
+            &machine,
+            "-accel",
+            accel_flag(),
+            "-cpu",
+            cpu_for("aarch64"),
+            "-smp",
+            &cfg.cpus.to_string(),
+            "-m",
+            &format!("{}M", cfg.memory_mib),
+            "-nographic",
+        ])
+        .stdin(Stdio::null())
+        .stdout(Stdio::null())
+        .stderr(Stdio::piped())
+        .output()
+        .with_context(|| format!("failed to spawn {qemu_bin} for dumpdtb"))?;
+    if !status.status.success() {
+        let stderr = String::from_utf8_lossy(&status.stderr);
+        bail!(
+            "{qemu_bin} dumpdtb exited with {}: {}",
+            status.status,
+            stderr.trim()
+        );
+    }
+    Ok(())
+}
+
+/// Stable, filename-safe identifier for the QEMU binary's version, used
+/// in the DTB cache key. First line of `--version` looks like
+/// `QEMU emulator version 11.0.0` — we slugify the version token.
+fn qemu_version_tag(qemu_bin: &str) -> Result<String> {
+    let output = std::process::Command::new(qemu_bin)
+        .arg("--version")
+        .output()
+        .with_context(|| format!("failed to run `{qemu_bin} --version`"))?;
+    if !output.status.success() {
+        bail!("{qemu_bin} --version exited with {}", output.status);
+    }
+    let first = String::from_utf8_lossy(&output.stdout)
+        .lines()
+        .next()
+        .unwrap_or("")
+        .to_string();
+    // "QEMU emulator version 11.0.0" -> "11.0.0"
+    let version = first
+        .split_whitespace()
+        .find(|tok| tok.chars().next().is_some_and(|c| c.is_ascii_digit()))
+        .unwrap_or("unknown");
+    Ok(version
+        .chars()
+        .map(|c| if c.is_ascii_alphanumeric() || c == '.' || c == '-' { c } else { '_' })
+        .collect())
+}
+
 /// Spawn QEMU detached from the controlling terminal. Returns the child pid.
 /// The child writes its own pidfile thanks to `-pidfile`; we also capture
 /// the spawn-time pid so the caller can `kill` it directly if needed.
diff --git a/src/utils/vm/state.rs b/src/utils/vm/state.rs
index bcab64fb..00b49804 100644
--- a/src/utils/vm/state.rs
+++ b/src/utils/vm/state.rs
@@ -171,6 +171,12 @@ impl VmPaths {
     pub fn config_file(&self) -> PathBuf {
         self.root.join("config.yaml")
     }
+    /// Cache directory for patched DTBs (one per `(arch, smp, memory_mib,
+    /// qemu_version)` combination). Tiny files (~10 KB each), no eviction
+    /// needed.
+    pub fn dtb_cache_dir(&self) -> PathBuf {
+        self.root.join("dtb")
+    }
 }
 
 /// Find the user's home directory. Wraps the `directories` crate so callers

From f01f2e8a48b3a52d84468e0f8270e56f00387bd2 Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 2 Jun 2026 21:22:42 -0400
Subject: [PATCH 15/21] perf(vm): hibernation supervisor with wake-on-connect
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a long-lived `avocado vm supervise` process spawned alongside
QEMU. Owns the user-facing SSH port and docker socket; QEMU's hostfwd
moves to a loopback-only internal port. The supervisor:

  - Proxies inbound TCP to QEMU's internal hostfwd. On accept, sends
    QMP `cont` if the VM is paused; SSH handshake then continues
    against the freshly-resumed guest.
  - Owns `~/.avocado/vm/docker.sock`. On accept, wakes the VM and
    lazily spawns the ssh -L tunnel to /run/docker.sock in the guest
    (cached for the awake-window, torn down on pause so QEMU can
    sleep cleanly).
  - Tracks active connections + idle timer. With no inbound activity
    for `idle.hibernate_after_secs` (default 10s for testing), sends
    QMP `stop`. Host CPU on QEMU drops to ~0% while RAM stays
    resident. Any subsequent SSH or docker connection wakes it
    transparently.

Cache key for the DTB also switches from `qemu --version` to the QEMU
binary mtime — saves ~300-500ms of subprocess overhead on every VM
start. Mtime naturally invalidates on `brew upgrade qemu`.

Known limitations (deferred):
  - Docker forwarder lifecycle is now supervisor-owned when
    hibernation is enabled (idle_after_secs > 0); legacy long-lived
    forwarder still used when disabled to avoid regressing existing
    non-hibernating setups.
  - CPU hotplug for awake-but-idle floor: QMP `device_add` returns
    "machine does not support hot-plugging CPUs" on QEMU 11 + HVF +
    ARM virt. Defer; Linux CPU offline is a fallback path if needed
    later.
---
 src/commands/vm/mod.rs       |   1 +
 src/commands/vm/supervise.rs |  40 +++
 src/main.rs                  |  48 ++++
 src/utils/vm/config.rs       |  17 ++
 src/utils/vm/lifecycle.rs    | 195 +++++++++++++--
 src/utils/vm/mod.rs          |   1 +
 src/utils/vm/qemu.rs         |  52 ++--
 src/utils/vm/state.rs        |  28 +++
 src/utils/vm/supervisor.rs   | 467 +++++++++++++++++++++++++++++++++++
 9 files changed, 806 insertions(+), 43 deletions(-)
 create mode 100644 src/commands/vm/supervise.rs
 create mode 100644 src/utils/vm/supervisor.rs

diff --git a/src/commands/vm/mod.rs b/src/commands/vm/mod.rs
index daf97cb2..585cd74a 100644
--- a/src/commands/vm/mod.rs
+++ b/src/commands/vm/mod.rs
@@ -12,4 +12,5 @@ pub mod shell;
 pub mod start;
 pub mod status;
 pub mod stop;
+pub mod supervise;
 pub mod update;
diff --git a/src/commands/vm/supervise.rs b/src/commands/vm/supervise.rs
new file mode 100644
index 00000000..ce38e4d0
--- /dev/null
+++ b/src/commands/vm/supervise.rs
@@ -0,0 +1,40 @@
+//! `avocado vm supervise` — long-lived host-side hibernation supervisor.
+//!
+//! Spawned by `avocado vm start` after QEMU is reachable. Not intended
+//! to be run by users directly (hidden in CLI help); the lifecycle
+//! layer owns the argv. See [`crate::utils::vm::supervisor`] for the
+//! actual loop.
+
+use anyhow::Result;
+use std::path::PathBuf;
+
+use crate::utils::vm::supervisor::{run, RunArgs};
+
+pub struct SuperviseCommand {
+    pub user_port: u16,
+    pub internal_port: u16,
+    pub qmp_socket: PathBuf,
+    pub idle_after_secs: u64,
+    pub pid_file: PathBuf,
+    pub docker_socket: PathBuf,
+    pub docker_socket_internal: PathBuf,
+    pub ssh_key: PathBuf,
+    pub known_hosts: PathBuf,
+}
+
+impl SuperviseCommand {
+    pub async fn execute(self) -> Result<()> {
+        run(RunArgs {
+            user_port: self.user_port,
+            internal_port: self.internal_port,
+            qmp_socket: self.qmp_socket,
+            idle_after_secs: self.idle_after_secs,
+            pid_file: self.pid_file,
+            docker_socket: self.docker_socket,
+            docker_socket_internal: self.docker_socket_internal,
+            ssh_key: self.ssh_key,
+            known_hosts: self.known_hosts,
+        })
+        .await
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index fd7e21bc..da65621f 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -3076,6 +3076,29 @@ async fn main() -> Result<()> {
                 cmd.execute().await
             }
             VmCommands::Stop { force } => commands::vm::stop::StopCommand { force }.execute().await,
+            VmCommands::Supervise {
+                user_port,
+                internal_port,
+                qmp_socket,
+                idle_after_secs,
+                pid_file,
+                docker_socket,
+                docker_socket_internal,
+                ssh_key,
+                known_hosts,
+            } => commands::vm::supervise::SuperviseCommand {
+                user_port,
+                internal_port,
+                qmp_socket,
+                idle_after_secs,
+                pid_file,
+                docker_socket,
+                docker_socket_internal,
+                ssh_key,
+                known_hosts,
+            }
+            .execute()
+            .await,
             VmCommands::Status => commands::vm::status::StatusCommand.execute().await,
             VmCommands::Shell { command } => {
                 commands::vm::shell::ShellCommand { command }
@@ -4537,6 +4560,31 @@ enum VmCommands {
         #[command(subcommand)]
         command: VmConfigCommands,
     },
+    /// Long-lived hibernation supervisor. Internal — spawned by `vm start`,
+    /// not for direct use. Owns the user-facing SSH port AND docker
+    /// socket, proxies to QEMU's internal hostfwd / SSH tunnel, and
+    /// sends QMP stop/cont on the idle timeout.
+    #[command(hide = true)]
+    Supervise {
+        #[arg(long)]
+        user_port: u16,
+        #[arg(long)]
+        internal_port: u16,
+        #[arg(long)]
+        qmp_socket: std::path::PathBuf,
+        #[arg(long)]
+        idle_after_secs: u64,
+        #[arg(long)]
+        pid_file: std::path::PathBuf,
+        #[arg(long)]
+        docker_socket: std::path::PathBuf,
+        #[arg(long)]
+        docker_socket_internal: std::path::PathBuf,
+        #[arg(long)]
+        ssh_key: std::path::PathBuf,
+        #[arg(long)]
+        known_hosts: std::path::PathBuf,
+    },
     /// Check for and apply VM image updates from the release channel.
     /// Stops + restarts the VM if it was running. Preserves the existing
     /// `var` partition; use `vm reset` to wipe state.
diff --git a/src/utils/vm/config.rs b/src/utils/vm/config.rs
index a01c9aac..55d14631 100644
--- a/src/utils/vm/config.rs
+++ b/src/utils/vm/config.rs
@@ -26,6 +26,9 @@ pub struct VmConfig {
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub runtime: Option<RuntimeConfig>,
 
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub idle: Option<IdleConfig>,
+
     /// Forward-compat bucket for keys this CLI version doesn't know about.
     /// Preserved verbatim on save so a newer desktop's settings survive an
     /// older CLI's round-trip.
@@ -48,6 +51,20 @@ pub struct RuntimeConfig {
     pub extra: BTreeMap<String, serde_yaml::Value>,
 }
 
+/// Hibernation knobs. The supervisor process (`avocado vm supervise`)
+/// proxies the user-facing SSH port to QEMU's internal hostfwd; after
+/// `hibernate_after_secs` of no proxied activity it sends QMP `stop` to
+/// halt the vCPUs, and resumes them on the next incoming connection.
+/// Set it to 0 to disable; when omitted, the CLI's default timeout applies.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct IdleConfig {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub hibernate_after_secs: Option<u64>,
+
+    #[serde(flatten)]
+    pub extra: BTreeMap<String, serde_yaml::Value>,
+}
+
 #[derive(Debug, Default, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub struct NetworkConfig {
     /// Override the guest's DNS resolvers. Applied post-boot via
diff --git a/src/utils/vm/lifecycle.rs b/src/utils/vm/lifecycle.rs
index 921af48a..ddeddd97 100644
--- a/src/utils/vm/lifecycle.rs
+++ b/src/utils/vm/lifecycle.rs
@@ -163,6 +163,14 @@ pub async fn start(opts: StartOptions) -> Result<VmStatus> {
     };
     state::write_ssh_port(&paths, ssh_port)?;
 
+    // Loopback-only port QEMU's hostfwd binds to. The supervisor
+    // listens on the user-facing `ssh_port` and proxies through to
+    // this one; downstream callers (vm shell, forward.rs, Avocado.app)
+    // only ever see `ssh_port`.
+    let internal_ssh_port = qemu::pick_free_port()?;
+    std::fs::write(paths.internal_ssh_port_file(), internal_ssh_port.to_string())
+        .with_context(|| format!("writing {}", paths.internal_ssh_port_file().display()))?;
+
     // Now that the port is known, write the ssh-config + wire it into
     // ~/.ssh/config. This is required for `DOCKER_HOST=ssh://avocado-vm`
     // to resolve in any subprocess we spawn — Docker's ssh transport reads
@@ -178,7 +186,7 @@ pub async fn start(opts: StartOptions) -> Result<VmStatus> {
     let cfg = QemuConfig {
         memory_mib,
         cpus,
-        ssh_port,
+        ssh_port: internal_ssh_port,
         cmdline_extra: opts.cmdline_extra,
         artifact_dir: artifact_dir.clone(),
         workspace: workspace.clone(),
@@ -200,6 +208,14 @@ pub async fn start(opts: StartOptions) -> Result<VmStatus> {
         p
     };
 
+    // Spawn the hibernation supervisor. Owns the user-facing SSH port
+    // and proxies through to QEMU's internal hostfwd. After
+    // `idle_after_secs` of no proxied activity, sends QMP `stop` to
+    // halt the vCPUs; wakes on the next incoming TCP. boot_sync below
+    // goes through the proxy, which is why we spawn before waiting.
+    let idle_after_secs = resolve_idle_after_secs(&paths);
+    spawn_supervisor(&paths, ssh_port, internal_ssh_port, idle_after_secs).await?;
+
     // Wait for the guest to become ready — first signal wins (qga vs SSH).
     let signal = super::boot_sync::wait_for_guest_ready(&paths.qga_socket(), ssh_port, None)
         .await
@@ -239,18 +255,24 @@ pub async fn start(opts: StartOptions) -> Result<VmStatus> {
         );
     }
 
-    // Bring up the docker-socket SSH forward so DOCKER_HOST=unix://… works
-    // from the host without touching the user's ~/.ssh/config. Non-fatal
-    // on error: a working VM with just SSH access is still useful for
-    // debugging.
-    if let Err(e) = super::forward::start(&paths, ssh_port).await {
-        crate::utils::output::print_warning(
-            &format!(
-                "docker socket forward failed: {e:#}. Local DOCKER_HOST routing won't work until you start it. \
-                 (`avocado vm stop && avocado vm start` retries.)"
-            ),
-            crate::utils::output::OutputLevel::Normal,
-        );
+    // Docker socket. With hibernation enabled (supervisor running), the
+    // supervisor owns `docker.sock` directly and manages an SSH `-L`
+    // tunnel internally with VM wake/pause lifecycle. Without
+    // hibernation (idle_after_secs == 0), keep the legacy long-lived
+    // forwarder behavior so existing setups don't regress.
+    //
+    // Non-fatal on error: a working VM with just SSH access is still
+    // useful for debugging.
+    if idle_after_secs == 0 {
+        if let Err(e) = super::forward::start(&paths, ssh_port).await {
+            crate::utils::output::print_warning(
+                &format!(
+                    "docker socket forward failed: {e:#}. Local DOCKER_HOST routing won't work until you start it. \
+                     (`avocado vm stop && avocado vm start` retries.)"
+                ),
+                crate::utils::output::OutputLevel::Normal,
+            );
+        }
     }
 
     notify_desktop(
@@ -281,9 +303,12 @@ pub async fn stop(force: bool) -> Result<()> {
 async fn stop_inner(force: bool) -> Result<()> {
     let paths = VmPaths::resolve()?;
 
-    // Always try to tear down the docker socket forward first — the SSH
-    // process can outlive QEMU if we shut down the VM by signal, leaving
-    // a stale `docker.sock` on the host.
+    // Tear down auxiliary host-side processes BEFORE QEMU. The supervisor
+    // owns the user-facing SSH port; if we left it running after QEMU
+    // exited, the next `vm start` would race against a still-bound port.
+    // The docker socket forwarder is an SSH child that can outlive QEMU
+    // if we shut down by signal, leaving a stale `docker.sock`.
+    stop_supervisor(&paths);
     let _ = super::forward::stop(&paths).await;
 
     let pid = match state::read_pid(&paths)? {
@@ -731,3 +756,141 @@ fn write_ssh_config(paths: &VmPaths, ssh_port: u16) -> Result<()> {
         .with_context(|| format!("writing {}", paths.ssh_config().display()))?;
     Ok(())
 }
+
+/// Default idle timeout in seconds when neither config nor env var sets
+/// one. Aggressive for testing while the hibernation supervisor is new
+/// — production should land on a more user-friendly default (multiple
+/// minutes) once the wake-on-connect path has been exercised in real
+/// workflows.
+const DEFAULT_IDLE_AFTER_SECS: u64 = 10;
+
+/// Resolve the hibernate timeout in seconds. The env var
+/// `AVOCADO_VM_IDLE_HIBERNATE_SECS` wins when it parses as a `u64`
+/// (one-shot override for experimentation), else the persisted
+/// `idle.hibernate_after_secs`, else `DEFAULT_IDLE_AFTER_SECS`. `0`
+/// disables hibernation while keeping the proxy up — useful for
+/// isolating proxy issues from QMP issues.
+fn resolve_idle_after_secs(paths: &VmPaths) -> u64 {
+    let from_env = std::env::var("AVOCADO_VM_IDLE_HIBERNATE_SECS")
+        .ok()
+        .and_then(|raw| raw.parse::<u64>().ok());
+    let from_config = || {
+        super::config::VmConfig::load(paths)
+            .ok()
+            .and_then(|cfg| cfg.idle.and_then(|idle| idle.hibernate_after_secs))
+    };
+    from_env
+        .or_else(from_config)
+        .unwrap_or(DEFAULT_IDLE_AFTER_SECS)
+}
+
+/// Best-effort SIGTERM → SIGKILL on the supervisor pid, then remove
+/// its pidfile + internal-ssh-port marker. Idempotent — missing
+/// pidfile / dead pid is a no-op.
+fn stop_supervisor(paths: &VmPaths) {
+    let pidfile = paths.supervisor_pid();
+    if let Ok(raw) = std::fs::read_to_string(&pidfile) {
+        if let Ok(pid) = raw.trim().parse::<u32>() {
+            if state::pid_alive(pid) {
+                send_signal(pid, SIGTERM);
+                for _ in 0..20 {
+                    if !state::pid_alive(pid) {
+                        break;
+                    }
+                    std::thread::sleep(Duration::from_millis(50));
+                }
+                if state::pid_alive(pid) {
+                    send_signal(pid, SIGKILL);
+                }
+            }
+        }
+    }
+    let _ = std::fs::remove_file(pidfile);
+    let _ = std::fs::remove_file(paths.internal_ssh_port_file());
+}
+
+/// Spawn `avocado vm supervise` as a detached child (setsid, null
+/// stdin, stdout/stderr appended to the supervisor log). It records
+/// its own pid so `stop_inner` can take it down before QEMU. We re-exec
+/// the running binary (`std::env::current_exe`) rather than expecting
+/// an installed `avocado` on PATH — that way a `cargo run` or
+/// out-of-tree binary supervises itself, not a stale system copy.
+async fn spawn_supervisor(
+    paths: &VmPaths,
+    user_port: u16,
+    internal_port: u16,
+    idle_after_secs: u64,
+) -> Result<()> {
+    let exe = std::env::current_exe().context("locating current avocado binary")?;
+    let mut cmd = tokio::process::Command::new(&exe);
+    cmd.args([
+        "vm",
+        "supervise",
+        "--user-port",
+        &user_port.to_string(),
+        "--internal-port",
+        &internal_port.to_string(),
+        "--qmp-socket",
+        &paths.qmp_socket().to_string_lossy(),
+        "--idle-after-secs",
+        &idle_after_secs.to_string(),
+        "--pid-file",
+        &paths.supervisor_pid().to_string_lossy(),
+        "--docker-socket",
+        &paths.docker_socket().to_string_lossy(),
+        "--docker-socket-internal",
+        &paths.docker_socket_internal().to_string_lossy(),
+        "--ssh-key",
+        &paths.ssh_key().to_string_lossy(),
+        "--known-hosts",
+        &paths.known_hosts().to_string_lossy(),
+    ]);
+    // Append the supervisor's stderr to ~/.avocado/vm/supervisor.log so
+    // pause/resume events are recoverable post-mortem. `tail -F` is
+    // robust to the file appearing only after first launch.
+    let log = std::fs::OpenOptions::new()
+        .create(true)
+        .append(true)
+        .open(paths.supervisor_log())
+        .with_context(|| format!("opening {}", paths.supervisor_log().display()))?;
+    let log_dup = log.try_clone().context("cloning supervisor log handle")?;
+    cmd.stdin(std::process::Stdio::null());
+    cmd.stdout(log);
+    cmd.stderr(log_dup);
+    #[cfg(unix)]
+    unsafe {
+        cmd.pre_exec(|| {
+            let _ = libc::setsid();
+            Ok(())
+        });
+    }
+    let child = cmd
+        .spawn()
+        .with_context(|| format!("failed to spawn supervisor: {}", exe.display()))?;
+    let spawn_pid = child.id().unwrap_or(0);
+    drop(child);
+
+    // Poll briefly for the supervisor's listener to come up — proves the
+    // proxy is ready before boot_sync starts pumping connections through.
+    let deadline = std::time::Instant::now() + Duration::from_secs(5);
+    loop {
+        if tokio::net::TcpStream::connect(("127.0.0.1", user_port)).await.is_ok() {
+            return Ok(());
+        }
+        if std::time::Instant::now() >= deadline {
+            // Don't fail the whole boot — log + carry on with whatever
+            // the supervisor managed to do. Worst case the user-facing
+            // port refuses connections and the user sees a normal SSH
+            // connection error.
+            crate::utils::output::print_warning(
+                &format!(
+                    "hibernation supervisor (pid {spawn_pid}) didn't bind 127.0.0.1:{user_port} within 5s; \
+                     proxy may be down. SSH may not work until you restart with `vm stop && vm start`."
+                ),
+                crate::utils::output::OutputLevel::Normal,
+            );
+            return Ok(());
+        }
+        tokio::time::sleep(Duration::from_millis(50)).await;
+    }
+}
diff --git a/src/utils/vm/mod.rs b/src/utils/vm/mod.rs
index 672e66d1..0d09870c 100644
--- a/src/utils/vm/mod.rs
+++ b/src/utils/vm/mod.rs
@@ -39,3 +39,4 @@ pub mod share;
 pub mod ssh;
 pub mod staging;
 pub mod state;
+pub mod supervisor;
diff --git a/src/utils/vm/qemu.rs b/src/utils/vm/qemu.rs
index 8f8e3632..1f750631 100644
--- a/src/utils/vm/qemu.rs
+++ b/src/utils/vm/qemu.rs
@@ -265,13 +265,18 @@ pub fn build_qemu_args(
 /// then atomically renames into place. Cost is ~500ms per cache miss,
 /// hidden under the rest of VM boot. Cache hits return immediately.
 fn ensure_idle_states_dtb(paths: &VmPaths, cfg: &QemuConfig) -> Result<PathBuf> {
-    let qemu_version = qemu_version_tag("qemu-system-aarch64")?;
+    // Cache key uses the QEMU binary's mtime instead of `--version` to
+    // avoid shelling out on every launch. A `brew upgrade qemu` bumps
+    // the mtime, which naturally invalidates the cache. The mtime stat
+    // is microseconds; the `--version` subprocess pays the full dyld
+    // load cost (~300-500 ms on macOS) for libsnappy/libpng/libfdt.
+    let qemu_tag = qemu_binary_tag("qemu-system-aarch64")?;
     let cache_dir = paths.dtb_cache_dir();
     std::fs::create_dir_all(&cache_dir)
         .with_context(|| format!("failed to create {}", cache_dir.display()))?;
     let cache_path = cache_dir.join(format!(
         "virt-smp{}-m{}-q{}.dtb",
-        cfg.cpus, cfg.memory_mib, qemu_version
+        cfg.cpus, cfg.memory_mib, qemu_tag
     ));
     if cache_path.is_file() {
         return Ok(cache_path);
@@ -329,31 +334,24 @@ fn dump_base_dtb(qemu_bin: &str, cfg: &QemuConfig, out: &Path) -> Result<()> {
     Ok(())
 }
 
-/// Stable, filename-safe identifier for the QEMU binary's version, used
-/// in the DTB cache key. First line of `--version` looks like
-/// `QEMU emulator version 11.0.0` — we slugify the version token.
-fn qemu_version_tag(qemu_bin: &str) -> Result<String> {
-    let output = std::process::Command::new(qemu_bin)
-        .arg("--version")
-        .output()
-        .with_context(|| format!("failed to run `{qemu_bin} --version`"))?;
-    if !output.status.success() {
-        bail!("{qemu_bin} --version exited with {}", output.status);
-    }
-    let first = String::from_utf8_lossy(&output.stdout)
-        .lines()
-        .next()
-        .unwrap_or("")
-        .to_string();
-    // "QEMU emulator version 11.0.0" -> "11.0.0"
-    let version = first
-        .split_whitespace()
-        .find(|tok| tok.chars().next().is_some_and(|c| c.is_ascii_digit()))
-        .unwrap_or("unknown");
-    Ok(version
-        .chars()
-        .map(|c| if c.is_ascii_alphanumeric() || c == '.' || c == '-' { c } else { '_' })
-        .collect())
+/// Stable, filename-safe identifier for the QEMU binary, used in the
+/// DTB cache key. Built from the binary's mtime (seconds since epoch)
+/// and byte size instead of `--version`, so no subprocess is spawned
+/// per launch. Size is included because mtime alone can collide (some
+/// installers normalize timestamps); an upgrade that changes either
+/// invalidates the cache. Unreadable mtime falls back to 0.
+fn qemu_binary_tag(qemu_bin: &str) -> Result<String> {
+    let path = which_on_path(qemu_bin)
+        .with_context(|| format!("{qemu_bin} not found on $PATH"))?;
+    let meta = std::fs::metadata(&path)
+        .with_context(|| format!("stat {}", path.display()))?;
+    let mtime = meta
+        .modified()
+        .ok()
+        .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
+        .map(|d| d.as_secs())
+        .unwrap_or(0);
+    Ok(format!("m{mtime}-s{}", meta.len()))
 }
 
 /// Spawn QEMU detached from the controlling terminal. Returns the child pid.
diff --git a/src/utils/vm/state.rs b/src/utils/vm/state.rs
index 00b49804..6c161118 100644
--- a/src/utils/vm/state.rs
+++ b/src/utils/vm/state.rs
@@ -154,10 +154,35 @@ impl VmPaths {
     pub fn docker_socket(&self) -> PathBuf {
         self.root.join("docker.sock")
     }
+    /// Internal-only landing point for the supervisor-managed SSH `-L`
+    /// tunnel to `/run/docker.sock`. Lives next to `docker.sock` but
+    /// distinct so the user-facing socket can stay alive (owned by the
+    /// supervisor) while this one comes and goes with VM wake/pause.
+    pub fn docker_socket_internal(&self) -> PathBuf {
+        self.root.join("docker.sock.internal")
+    }
     /// PID of the SSH process maintaining the docker socket forward.
     pub fn forwarder_pid(&self) -> PathBuf {
         self.root.join("forwarder.pid")
     }
+    /// PID of the hibernation supervisor (TCP proxy + QMP stop/cont).
+    /// Distinct from `pid_file` (QEMU) so the lifecycle layer can tear
+    /// down the supervisor before QEMU on shutdown.
+    pub fn supervisor_pid(&self) -> PathBuf {
+        self.root.join("supervisor.pid")
+    }
+    /// Append-only log for the supervisor's pause/resume events. Tail
+    /// this while reproducing a hibernation issue — every QMP stop /
+    /// cont and every accept hits this file.
+    pub fn supervisor_log(&self) -> PathBuf {
+        self.root.join("supervisor.log")
+    }
+    /// Loopback-only port QEMU's SSH hostfwd binds to. The supervisor
+    /// listens on the user-facing `ssh-port` and proxies to this one;
+    /// callers never connect here directly.
+    pub fn internal_ssh_port_file(&self) -> PathBuf {
+        self.root.join("internal-ssh-port")
+    }
     /// Absolute path to the artifact directory that was last used for `vm
     /// start`. The macOS Avocado.app reads this when launched without an
     /// AVOCADO_VM_DIR env var (Finder/Dock launches inherit a sanitized env
@@ -254,7 +279,10 @@ pub fn cleanup_transient(paths: &VmPaths) {
         paths.ssh_port_file(),
         paths.lock_file(),
         paths.docker_socket(),
+        paths.docker_socket_internal(),
         paths.forwarder_pid(),
+        paths.supervisor_pid(),
+        paths.internal_ssh_port_file(),
     ] {
         let _ = std::fs::remove_file(&p);
     }
diff --git a/src/utils/vm/supervisor.rs b/src/utils/vm/supervisor.rs
new file mode 100644
index 00000000..0055903e
--- /dev/null
+++ b/src/utils/vm/supervisor.rs
@@ -0,0 +1,467 @@
+//! Host-side hibernation supervisor for the helper VM.
+//!
+//! Architecturally a small proxy server with QMP-driven lifecycle.
+//! QEMU is launched with its SSH hostfwd bound to a loopback-only
+//! "internal" port; the supervisor listens on the user-facing port
+//! (the one in `~/.avocado/vm/ssh-port`) and pipes accepted
+//! connections through to the internal port. Doing it this way means
+//! *we* see every incoming connection, which gives us:
+//!
+//! 1. **Idle detection** — when no proxied connection has been active
+//!    for `idle_after_secs`, we send QMP `stop` to halt all vCPU
+//!    threads. Host CPU drops to ~0%; guest RAM stays resident.
+//! 2. **Wake-on-connect** — on the next incoming TCP, we send QMP
+//!    `cont` *before* opening the inner connection. The guest resumes
+//!    in-place and the SSH handshake completes ~100ms later than it
+//!    would on a live VM.
+//!
+//! The supervisor also owns the user-facing **docker socket**
+//! (`~/.avocado/vm/docker.sock`). On any incoming docker client
+//! connection it ensures (a) the VM is awake and (b) a single
+//! supervisor-managed `ssh -L` tunnel is running between an internal
+//! sock (`docker.sock.internal`) and `/run/docker.sock` in the guest,
+//! then pipes the client through. The tunnel comes up on wake and is
+//! torn down on pause so QEMU can sleep cleanly.
+//!
+//! Lifecycle: spawned by `lifecycle::start` ahead of its guest-ready
+//! wait (the SSH readiness probe goes through this proxy), killed by
+//! `lifecycle::stop` before QEMU. The subcommand entry point lives in
+//! `commands::vm::supervise` — this module is the loop it runs.
+
+use anyhow::{Context, Result};
+use std::path::{Path, PathBuf};
+use std::process::Stdio;
+use std::sync::atomic::{AtomicBool, AtomicI64, AtomicUsize, Ordering};
+use std::sync::Arc;
+use std::time::Duration;
+use tokio::io::AsyncWriteExt;
+use tokio::net::{TcpListener, TcpStream, UnixListener, UnixStream};
+use tokio::sync::Mutex;
+
+use super::qmp::QmpClient;
+use super::state;
+
+/// Arguments passed from the `avocado vm supervise` subcommand into the
+/// supervisor loop. Plain owned data so the caller can construct it from
+/// clap-parsed flags without leaking lifetimes.
+#[derive(Debug, Clone)]
+pub struct RunArgs {
+    /// External TCP port the supervisor listens on. Today this is the
+    /// SSH port that everything else (`vm shell`, Avocado.app)
+    /// connects to.
+    pub user_port: u16,
+    /// Loopback port QEMU's `hostfwd` binds to. Only the supervisor
+    /// connects here.
+    pub internal_port: u16,
+    /// QMP control socket.
+    pub qmp_socket: PathBuf,
+    /// How long with no active connections before we halt the vCPUs.
+    pub idle_after_secs: u64,
+    /// Path to write our pid so the lifecycle layer can kill us later.
+    pub pid_file: PathBuf,
+    /// Host path for the user-facing docker socket. Supervisor owns it.
+    pub docker_socket: PathBuf,
+    /// Host path the supervisor's SSH `-L` tunnel binds to; only the
+    /// docker proxy connects here.
+    pub docker_socket_internal: PathBuf,
+    /// SSH private key for tunneling to the guest.
+    pub ssh_key: PathBuf,
+    /// known_hosts file the SSH tunnel uses.
+    pub known_hosts: PathBuf,
+}
+
+struct State {
+    paused: AtomicBool,
+    active_conns: AtomicUsize,
+    last_activity_ms: AtomicI64,
+    qmp_socket: PathBuf,
+    idle_threshold_ms: i64,
+    args: RunArgs,
+    /// SSH `-L` tunnel child pid, if running. Mutex serializes
+    /// spawn/kill so a pause/wake race doesn't leak a child.
+    tunnel: Mutex<Option<u32>>,
+    /// Serializes QMP stop/cont so racing wake-and-pause attempts
+    /// can't leave the supervisor's `paused` flag out of sync with
+    /// QEMU's actual state.
+    qmp_lock: Mutex<()>,
+}
+
+impl State {
+    fn touch(&self) {
+        self.last_activity_ms.store(now_ms(), Ordering::Relaxed);
+    }
+
+    /// QMP `cont` only — bring vCPUs back to running. Idempotent and
+    /// fast (single QMP round-trip). Does NOT touch the SSH tunnel:
+    /// TCP-proxy callers don't need it, and bundling it would make
+    /// every SSH probe wait 8s on tunnel spawn during boot.
+    async fn wake(self: &Arc<Self>) -> Result<()> {
+        let _guard = self.qmp_lock.lock().await;
+        if self.paused.load(Ordering::Relaxed) {
+            qmp_send(&self.qmp_socket, "cont", None)
+                .await
+                .context("QMP cont")?;
+            self.paused.store(false, Ordering::Relaxed);
+            eprintln!("supervisor: resumed VM on incoming connection");
+        }
+        Ok(())
+    }
+
+    /// Halt the vCPUs (QMP `stop`), tearing the tunnel down first. No-op if already
+    /// paused; errors if a connection is active or QMP fails (VM keeps running).
+    async fn pause(self: &Arc<Self>) -> Result<()> {
+        let _guard = self.qmp_lock.lock().await;
+        if self.paused.load(Ordering::Relaxed) {
+            return Ok(());
+        }
+        // Re-check under the lock: a conn accepted since the idle watcher's test wins.
+        anyhow::ensure!(self.active_conns.load(Ordering::Relaxed) == 0, "connection active");
+        // Tunnel first: its SSH keepalives would time out against a stopped guest.
+        self.kill_tunnel().await;
+        qmp_send(&self.qmp_socket, "stop", None)
+            .await
+            .context("QMP stop")?;
+        self.paused.store(true, Ordering::Relaxed);
+        Ok(())
+    }
+
+    /// Spawn the SSH `-L` tunnel if it's not already running. Polls
+    /// briefly for the local socket to appear so callers can proceed
+    /// to `connect()` immediately on return.
+    async fn ensure_tunnel(self: &Arc<Self>) -> Result<()> {
+        let mut lock = self.tunnel.lock().await;
+        if let Some(pid) = *lock {
+            if state::pid_alive(pid) && self.args.docker_socket_internal.exists() {
+                return Ok(());
+            }
+            // stale handle; clean up before respawning
+            send_signal(pid, SIGTERM);
+            let _ = std::fs::remove_file(&self.args.docker_socket_internal);
+        }
+        let pid = spawn_ssh_tunnel(&self.args)?;
+        // Wait for the local sock to materialize — ssh -L creates it
+        // only after authentication completes.
+        let deadline = std::time::Instant::now() + Duration::from_secs(8);
+        loop {
+            if self.args.docker_socket_internal.exists() {
+                *lock = Some(pid);
+                eprintln!("supervisor: docker tunnel up (pid {pid})");
+                return Ok(());
+            }
+            if !state::pid_alive(pid) {
+                return Err(anyhow::anyhow!(
+                    "ssh tunnel exited before docker socket appeared"
+                ));
+            }
+            if std::time::Instant::now() >= deadline {
+                send_signal(pid, SIGTERM);
+                return Err(anyhow::anyhow!(
+                    "timed out waiting for docker tunnel to come up"
+                ));
+            }
+            tokio::time::sleep(Duration::from_millis(50)).await;
+        }
+    }
+
+    async fn kill_tunnel(self: &Arc<Self>) {
+        let mut lock = self.tunnel.lock().await;
+        if let Some(pid) = lock.take() {
+            send_signal(pid, SIGTERM);
+            // Don't block long; ssh dies quickly on SIGTERM.
+            for _ in 0..20 {
+                if !state::pid_alive(pid) {
+                    break;
+                }
+                tokio::time::sleep(Duration::from_millis(25)).await;
+            }
+            if state::pid_alive(pid) {
+                send_signal(pid, SIGKILL);
+            }
+        }
+        let _ = std::fs::remove_file(&self.args.docker_socket_internal);
+    }
+}
+
+fn now_ms() -> i64 {
+    use std::time::{SystemTime, UNIX_EPOCH};
+    SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .map(|d| d.as_millis() as i64)
+        .unwrap_or(0)
+}
+
+const SIGTERM: libc::c_int = 15;
+const SIGKILL: libc::c_int = 9;
+
+fn send_signal(pid: u32, sig: libc::c_int) {
+    #[cfg(unix)]
+    unsafe {
+        libc::kill(pid as libc::pid_t, sig);
+    }
+    #[cfg(not(unix))]
+    {
+        let _ = (pid, sig);
+    }
+}
+
+/// Run the supervisor loop until killed.
+pub async fn run(args: RunArgs) -> Result<()> {
+    std::fs::write(&args.pid_file, std::process::id().to_string())
+        .with_context(|| format!("writing {}", args.pid_file.display()))?;
+
+    let state = Arc::new(State {
+        paused: AtomicBool::new(false),
+        active_conns: AtomicUsize::new(0),
+        last_activity_ms: AtomicI64::new(now_ms()),
+        qmp_socket: args.qmp_socket.clone(),
+        idle_threshold_ms: (args.idle_after_secs.saturating_mul(1000)) as i64,
+        tunnel: Mutex::new(None),
+        qmp_lock: Mutex::new(()),
+        args: args.clone(),
+    });
+
+    // Tunnel comes up lazily on first docker conn (handle_docker calls
+    // ensure_tunnel). Spawning eagerly here would race against guest
+    // sshd boot: the SSH handshake fails for ~30s after QEMU starts,
+    // and during that time the supervisor's TCP listener wouldn't bind
+    // (this function blocks on tunnel polling), making the whole boot
+    // cascade fail.
+
+    let tcp_listener = TcpListener::bind(("127.0.0.1", args.user_port))
+        .await
+        .with_context(|| format!("binding 127.0.0.1:{}", args.user_port))?;
+    eprintln!(
+        "supervisor: TCP listening on 127.0.0.1:{} → 127.0.0.1:{} (idle {} s)",
+        args.user_port, args.internal_port, args.idle_after_secs
+    );
+
+    // Stale Unix socket would refuse bind; ours is owned by us across restarts.
+    let _ = std::fs::remove_file(&args.docker_socket);
+    let unix_listener = UnixListener::bind(&args.docker_socket)
+        .with_context(|| format!("binding {}", args.docker_socket.display()))?;
+    eprintln!(
+        "supervisor: Unix listening on {} → SSH→/run/docker.sock",
+        args.docker_socket.display()
+    );
+
+    if args.idle_after_secs > 0 {
+        let state_t = state.clone();
+        tokio::spawn(async move {
+            idle_watcher(state_t).await;
+        });
+    }
+
+    // Signal handler: on SIGTERM/SIGINT, restore the VM to a usable
+    // state (resumed + tunnel down) so the next start doesn't trip
+    // over a paused VM with no supervisor to wake it.
+    let state_sig = state.clone();
+    tokio::spawn(async move {
+        if let Err(e) = wait_for_term().await {
+            eprintln!("supervisor: signal handler error: {e:#}");
+            return;
+        }
+        let _ = state_sig.wake().await; // ensure VM is resumed before we exit
+        state_sig.kill_tunnel().await;
+        std::process::exit(0);
+    });
+
+    // Main accept loop: select between TCP and Unix listeners. Spawned
+    // tasks own their connection through close.
+    loop {
+        tokio::select! {
+            res = tcp_listener.accept() => {
+                let (sock, peer) = match res {
+                    Ok(v) => v,
+                    Err(e) => { eprintln!("supervisor: TCP accept error: {e:#}"); continue; }
+                };
+                let s = state.clone();
+                let internal_port = args.internal_port;
+                tokio::spawn(async move {
+                    if let Err(e) = handle_tcp(sock, internal_port, s).await {
+                        eprintln!("supervisor: TCP conn {peer} error: {e:#}");
+                    }
+                });
+            }
+            res = unix_listener.accept() => {
+                let (sock, _peer) = match res {
+                    Ok(v) => v,
+                    Err(e) => { eprintln!("supervisor: Unix accept error: {e:#}"); continue; }
+                };
+                let s = state.clone();
+                tokio::spawn(async move {
+                    if let Err(e) = handle_docker(sock, s).await {
+                        eprintln!("supervisor: docker conn error: {e:#}");
+                    }
+                });
+            }
+        }
+    }
+}
+
+async fn handle_tcp(mut incoming: TcpStream, internal_port: u16, state: Arc<State>) -> Result<()> {
+    state.active_conns.fetch_add(1, Ordering::Relaxed);
+    state.touch();
+    if let Err(e) = state.wake().await {
+        eprintln!("supervisor: wake failed: {e}");
+    }
+    let res = match TcpStream::connect(("127.0.0.1", internal_port)).await {
+        Ok(mut inner) => {
+            let res = tokio::io::copy_bidirectional(&mut incoming, &mut inner).await;
+            let _ = incoming.shutdown().await;
+            let _ = inner.shutdown().await;
+            classify_close(res)
+        }
+        Err(e) => Err(e).with_context(|| format!("connecting to internal port {internal_port}")),
+    }; // no `?` here: the release below must also run when the connect fails
+    state.active_conns.fetch_sub(1, Ordering::Relaxed);
+    state.touch();
+    res
+}
+
+/// Proxy one docker client through the SSH tunnel to `/run/docker.sock` in
+/// the guest. The fallible steps run inside one async block so that
+/// `active_conns` is released on every path; a leaked count would keep the
+/// idle watcher from ever hibernating the VM again.
+async fn handle_docker(mut client: UnixStream, state: Arc<State>) -> Result<()> {
+    state.active_conns.fetch_add(1, Ordering::Relaxed);
+    state.touch();
+    let res = async {
+        // Wake first (QMP cont): the tunnel's auth handshake needs guest sshd
+        // running, which is only true post-wake.
+        state.wake().await.context("waking VM for docker conn")?;
+        state
+            .ensure_tunnel()
+            .await
+            .context("bringing docker tunnel up")?;
+        let sock = &state.args.docker_socket_internal;
+        let mut backend = UnixStream::connect(sock)
+            .await
+            .with_context(|| format!("connecting to docker tunnel sock {}", sock.display()))?;
+        let res = tokio::io::copy_bidirectional(&mut client, &mut backend).await;
+        let _ = client.shutdown().await;
+        let _ = backend.shutdown().await;
+        classify_close(res)
+    }
+    .await;
+    state.active_conns.fetch_sub(1, Ordering::Relaxed);
+    state.touch();
+    res
+}
+
+/// Filter expected close patterns. SSH probe (boot_sync), `vm shell`
+/// exit, docker client disconnect, any client that closes without
+/// TCP-FIN — all show up as ECONNRESET / BrokenPipe / UnexpectedEof
+/// here. Real I/O faults still propagate.
+fn classify_close(res: std::io::Result<(u64, u64)>) -> Result<()> {
+    match res {
+        Ok(_) => Ok(()),
+        Err(e) => match e.kind() {
+            std::io::ErrorKind::ConnectionReset
+            | std::io::ErrorKind::BrokenPipe
+            | std::io::ErrorKind::UnexpectedEof
+            | std::io::ErrorKind::NotConnected => Ok(()),
+            _ => Err(e).context("bidirectional copy failed"),
+        },
+    }
+}
+
+async fn idle_watcher(state: Arc<State>) {
+    loop {
+        tokio::time::sleep(Duration::from_secs(1)).await;
+        if state.paused.load(Ordering::Relaxed) {
+            continue;
+        }
+        if state.active_conns.load(Ordering::Relaxed) > 0 {
+            continue;
+        }
+        let since = now_ms() - state.last_activity_ms.load(Ordering::Relaxed);
+        if since >= state.idle_threshold_ms {
+            match state.pause().await {
+                Ok(_) => eprintln!("supervisor: paused VM after {since} ms idle"),
+                Err(e) => {
+                    eprintln!("supervisor: pause failed: {e}");
+                    state.touch(); // back off
+                }
+            }
+        }
+    }
+}
+
+/// Spawn an `ssh -N -L <local-sock>:/run/docker.sock` to the guest.
+/// Same flag set as the original `forward.rs`; managed by the
+/// supervisor instead of `lifecycle::start`.
+fn spawn_ssh_tunnel(args: &RunArgs) -> Result<u32> {
+    let _ = std::fs::remove_file(&args.docker_socket_internal);
+    let mut cmd = std::process::Command::new("ssh");
+    cmd.args([
+        "-N",
+        "-T",
+        "-o",
+        "ConnectTimeout=10",
+        "-o",
+        "ExitOnForwardFailure=yes",
+        "-o",
+        "ServerAliveInterval=30",
+        "-o",
+        "ServerAliveCountMax=3",
+        "-o",
+        "StrictHostKeyChecking=no",
+        "-o",
+        &format!("UserKnownHostsFile={}", args.known_hosts.display()),
+        "-o",
+        "PasswordAuthentication=no",
+        "-o",
+        "BatchMode=yes",
+        "-o",
+        "LogLevel=ERROR",
+        "-i",
+        args.ssh_key.to_str().context("ssh key path utf-8")?,
+        "-p",
+        &args.internal_port.to_string(),
+        "-L",
+        &format!(
+            "{}:/run/docker.sock",
+            args.docker_socket_internal.display()
+        ),
+        "root@127.0.0.1",
+    ]);
+    cmd.stdin(Stdio::null());
+    cmd.stdout(Stdio::null());
+    cmd.stderr(Stdio::null());
+    #[cfg(unix)]
+    unsafe {
+        use std::os::unix::process::CommandExt;
+        cmd.pre_exec(|| {
+            let _ = libc::setsid();
+            Ok(())
+        });
+    }
+    let child = cmd.spawn().context("spawning ssh -L tunnel")?;
+    Ok(child.id())
+}
+
+/// Thin one-shot QMP command runner. Open + close per call because
+/// stop/cont happen at most a few times per minute and the QmpClient
+/// holds its own connection state.
+async fn qmp_send(socket: &Path, cmd: &str, args: Option<serde_json::Value>) -> Result<()> {
+    let mut client = QmpClient::connect(socket).await?;
+    let _ = client.execute(cmd, args).await?;
+    Ok(())
+}
+
+#[cfg(unix)]
+async fn wait_for_term() -> Result<()> {
+    use tokio::signal::unix::{signal, SignalKind};
+    let mut term = signal(SignalKind::terminate()).context("install SIGTERM handler")?;
+    let mut intr = signal(SignalKind::interrupt()).context("install SIGINT handler")?;
+    tokio::select! {
+        _ = term.recv() => {}
+        _ = intr.recv() => {}
+    }
+    Ok(())
+}
+
+#[cfg(not(unix))]
+async fn wait_for_term() -> Result<()> {
+    tokio::signal::ctrl_c().await.context("install ctrl-c handler")
+}

From fd9f7e7c97e47f6ce55b32ff1d9bed596c943159 Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 2 Jun 2026 21:24:08 -0400
Subject: [PATCH 16/21] perf(vm): raise default hibernation idle timeout from
 10s to 60s
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 10s default was useful while iterating on the supervisor — short
enough to verify pause/wake every few minutes of testing. For real
use, 10s pauses mid-SSH-session whenever the user pauses to think,
which adds noticeable wake latency on every command. 60s is
comfortable for normal interactive work while still freeing host CPU
within a minute of stepping away. Users who want either extreme can
override via `avocado vm config set idle.hibernate_after_secs N` or
the `AVOCADO_VM_IDLE_HIBERNATE_SECS` env var.
---
 src/utils/vm/lifecycle.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/utils/vm/lifecycle.rs b/src/utils/vm/lifecycle.rs
index ddeddd97..5d4bc702 100644
--- a/src/utils/vm/lifecycle.rs
+++ b/src/utils/vm/lifecycle.rs
@@ -758,11 +758,11 @@ fn write_ssh_config(paths: &VmPaths, ssh_port: u16) -> Result<()> {
 }
 
 /// Default idle timeout in seconds when neither config nor env var sets
-/// one. Aggressive for testing while the hibernation supervisor is new
-/// — production should land on a more user-friendly default (multiple
-/// minutes) once the wake-on-connect path has been exercised in real
-/// workflows.
-const DEFAULT_IDLE_AFTER_SECS: u64 = 10;
+/// one. One minute strikes a balance between freeing host CPU promptly
+/// when the user steps away from active work and not pausing mid-pause
+/// during normal SSH/docker bursts. Users with snappier wake budgets
+/// can lower via `avocado vm config set idle.hibernate_after_secs N`.
+const DEFAULT_IDLE_AFTER_SECS: u64 = 60;
 
 /// Resolve the hibernate timeout. Env var wins (one-shot override for
 /// experimentation), else the persisted `idle.hibernate_after_secs`,

From 9dc3dc1627194a39f8e1f23d33a6a648de33f74e Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 2 Jun 2026 21:29:46 -0400
Subject: [PATCH 17/21] chore: satisfy fmt + clippy for the supervisor/DTB work

`cargo fmt --check` and `cargo clippy --all-targets --all-features
-- -D warnings` were both failing on the just-merged supervisor and
DTB changes. This commit applies rustfmt's output and rewrites the
three `x % 4 != 0` sites in fdt.rs flagged by
clippy::manual_is_multiple_of as `!x.is_multiple_of(4)`.

No behavior changes.
---
 src/main.rs                | 26 +++++++------
 src/utils/vm/fdt.rs        | 80 +++++++++++++++++++++++++++++---------
 src/utils/vm/lifecycle.rs  | 12 ++++--
 src/utils/vm/qemu.rs       | 13 +++----
 src/utils/vm/supervisor.rs |  9 ++---
 5 files changed, 95 insertions(+), 45 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index da65621f..6d622125 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -3086,19 +3086,21 @@ async fn main() -> Result<()> {
                 docker_socket_internal,
                 ssh_key,
                 known_hosts,
-            } => commands::vm::supervise::SuperviseCommand {
-                user_port,
-                internal_port,
-                qmp_socket,
-                idle_after_secs,
-                pid_file,
-                docker_socket,
-                docker_socket_internal,
-                ssh_key,
-                known_hosts,
+            } => {
+                commands::vm::supervise::SuperviseCommand {
+                    user_port,
+                    internal_port,
+                    qmp_socket,
+                    idle_after_secs,
+                    pid_file,
+                    docker_socket,
+                    docker_socket_internal,
+                    ssh_key,
+                    known_hosts,
+                }
+                .execute()
+                .await
             }
-            .execute()
-            .await,
             VmCommands::Status => commands::vm::status::StatusCommand.execute().await,
             VmCommands::Shell { command } => {
                 commands::vm::shell::ShellCommand { command }
diff --git a/src/utils/vm/fdt.rs b/src/utils/vm/fdt.rs
index 88519783..66201a01 100644
--- a/src/utils/vm/fdt.rs
+++ b/src/utils/vm/fdt.rs
@@ -45,14 +45,21 @@ pub struct Node {
 
 impl Node {
     pub fn new(name: impl Into<String>) -> Self {
-        Self { name: name.into(), props: Vec::new(), children: Vec::new() }
+        Self {
+            name: name.into(),
+            props: Vec::new(),
+            children: Vec::new(),
+        }
     }
 
     pub fn set_prop(&mut self, name: &str, value: Vec<u8>) {
         if let Some(p) = self.props.iter_mut().find(|p| p.name == name) {
             p.value = value;
         } else {
-            self.props.push(Property { name: name.to_string(), value });
+            self.props.push(Property {
+                name: name.to_string(),
+                value,
+            });
         }
     }
 
@@ -96,10 +103,12 @@ pub fn parse(data: &[u8]) -> Result<Fdt> {
         bail!("unsupported DTB version {version} (need v16+)");
     }
     if data.len() < totalsize {
-        bail!("DTB truncated: header says {totalsize} bytes, got {}", data.len());
+        bail!(
+            "DTB truncated: header says {totalsize} bytes, got {}",
+            data.len()
+        );
     }
-    if off_dt_struct + size_dt_struct > data.len()
-        || off_dt_strings + size_dt_strings > data.len()
+    if off_dt_struct + size_dt_struct > data.len() || off_dt_strings + size_dt_strings > data.len()
     {
         bail!("DTB struct/strings offsets out of bounds");
     }
@@ -119,7 +128,11 @@ pub fn parse(data: &[u8]) -> Result<Fdt> {
         mem_rsv.push((addr, size));
     }
 
-    let mut parser = Parser { data, pos: off_dt_struct, strings_base: off_dt_strings };
+    let mut parser = Parser {
+        data,
+        pos: off_dt_struct,
+        strings_base: off_dt_strings,
+    };
     let first = parser.read_u32()?;
     if first != FDT_BEGIN_NODE {
         bail!("DTB struct block must start with BEGIN_NODE, got {first:#x}");
@@ -129,7 +142,11 @@ pub fn parse(data: &[u8]) -> Result<Fdt> {
     if last != FDT_END {
         bail!("DTB struct block missing FDT_END terminator, got {last:#x}");
     }
-    Ok(Fdt { root, mem_rsv, boot_cpuid_phys })
+    Ok(Fdt {
+        root,
+        mem_rsv,
+        boot_cpuid_phys,
+    })
 }
 
 struct Parser<'a> {
@@ -160,7 +177,7 @@ impl<'a> Parser<'a> {
             .with_context(|| format!("non-utf8 name at offset {start}"))?
             .to_string();
         self.pos += 1;
-        while self.pos % 4 != 0 {
+        while !self.pos.is_multiple_of(4) {
             self.pos += 1;
         }
         Ok(s)
@@ -197,7 +214,7 @@ impl<'a> Parser<'a> {
                         })?
                         .to_vec();
                     self.pos += len;
-                    while self.pos % 4 != 0 {
+                    while !self.pos.is_multiple_of(4) {
                         self.pos += 1;
                     }
                     node.props.push(Property {
@@ -223,7 +240,12 @@ struct Emitter {
 }
 
 impl Emitter {
-    fn new() -> Self { Self { structs: Vec::new(), strings: Vec::new() } }
+    fn new() -> Self {
+        Self {
+            structs: Vec::new(),
+            strings: Vec::new(),
+        }
+    }
 
     fn intern(&mut self, name: &str) -> u32 {
         let bytes = name.as_bytes();
@@ -244,10 +266,12 @@ impl Emitter {
         off
     }
 
-    fn push_u32(&mut self, v: u32) { self.structs.extend_from_slice(&v.to_be_bytes()); }
+    fn push_u32(&mut self, v: u32) {
+        self.structs.extend_from_slice(&v.to_be_bytes());
+    }
 
     fn pad4(&mut self) {
-        while self.structs.len() % 4 != 0 {
+        while !self.structs.len().is_multiple_of(4) {
             self.structs.push(0);
         }
     }
@@ -326,7 +350,9 @@ fn max_phandle(node: &Node) -> u32 {
     max
 }
 
-fn be32(v: u32) -> Vec<u8> { v.to_be_bytes().to_vec() }
+fn be32(v: u32) -> Vec<u8> {
+    v.to_be_bytes().to_vec()
+}
 fn strprop(s: &str) -> Vec<u8> {
     let mut v = s.as_bytes().to_vec();
     v.push(0);
@@ -405,7 +431,11 @@ mod tests {
             cpus.children.push(cpu);
         }
         root.children.push(cpus);
-        let fdt = Fdt { root, mem_rsv: vec![], boot_cpuid_phys: 0 };
+        let fdt = Fdt {
+            root,
+            mem_rsv: vec![],
+            boot_cpuid_phys: 0,
+        };
         serialize(&fdt)
     }
 
@@ -441,7 +471,10 @@ mod tests {
                 .iter()
                 .find(|p| p.name == "cpu-idle-states")
                 .expect("cpu-idle-states missing on cpu node");
-            assert_eq!(u32::from_be_bytes(cis.value.as_slice().try_into().unwrap()), phandle);
+            assert_eq!(
+                u32::from_be_bytes(cis.value.as_slice().try_into().unwrap()),
+                phandle
+            );
         }
     }
 
@@ -454,7 +487,10 @@ mod tests {
         let rt = parse(&out).unwrap();
         assert!(rt.root.children.iter().any(|c| c.name == "idle-states"));
         let cpus = rt.root.children.iter().find(|c| c.name == "cpus").unwrap();
-        assert!(cpus.children.iter().all(|c| c.props.iter().any(|p| p.name == "cpu-idle-states")));
+        assert!(cpus
+            .children
+            .iter()
+            .all(|c| c.props.iter().any(|p| p.name == "cpu-idle-states")));
     }
 
     #[test]
@@ -468,7 +504,11 @@ mod tests {
     fn patch_fails_when_no_cpus_node() {
         let mut root = Node::new("");
         root.set_prop("#address-cells", be32(2));
-        let fdt_in = Fdt { root, mem_rsv: vec![], boot_cpuid_phys: 0 };
+        let fdt_in = Fdt {
+            root,
+            mem_rsv: vec![],
+            boot_cpuid_phys: 0,
+        };
         let bytes = serialize(&fdt_in);
         let mut fdt = parse(&bytes).unwrap();
         assert!(patch_idle_states(&mut fdt, 4).is_err());
@@ -482,7 +522,11 @@ mod tests {
         cpu.set_prop("reg", be32(0));
         cpus.children.push(cpu);
         root.children.push(cpus);
-        let fdt_in = Fdt { root, mem_rsv: vec![], boot_cpuid_phys: 0x42 };
+        let fdt_in = Fdt {
+            root,
+            mem_rsv: vec![],
+            boot_cpuid_phys: 0x42,
+        };
         let bytes = serialize(&fdt_in);
         let parsed = parse(&bytes).unwrap();
         assert_eq!(parsed.boot_cpuid_phys, 0x42);
diff --git a/src/utils/vm/lifecycle.rs b/src/utils/vm/lifecycle.rs
index 5d4bc702..cd5253ea 100644
--- a/src/utils/vm/lifecycle.rs
+++ b/src/utils/vm/lifecycle.rs
@@ -168,8 +168,11 @@ pub async fn start(opts: StartOptions) -> Result<VmStatus> {
     // this one; downstream callers (vm shell, forward.rs, Avocado.app)
     // only ever see `ssh_port`.
     let internal_ssh_port = qemu::pick_free_port()?;
-    std::fs::write(paths.internal_ssh_port_file(), internal_ssh_port.to_string())
-        .with_context(|| format!("writing {}", paths.internal_ssh_port_file().display()))?;
+    std::fs::write(
+        paths.internal_ssh_port_file(),
+        internal_ssh_port.to_string(),
+    )
+    .with_context(|| format!("writing {}", paths.internal_ssh_port_file().display()))?;
 
     // Now that the port is known, write the ssh-config + wire it into
     // ~/.ssh/config. This is required for `DOCKER_HOST=ssh://avocado-vm`
@@ -874,7 +877,10 @@ async fn spawn_supervisor(
     // proxy is ready before boot_sync starts pumping connections through.
     let deadline = std::time::Instant::now() + Duration::from_secs(5);
     loop {
-        if tokio::net::TcpStream::connect(("127.0.0.1", user_port)).await.is_ok() {
+        if tokio::net::TcpStream::connect(("127.0.0.1", user_port))
+            .await
+            .is_ok()
+        {
             return Ok(());
         }
         if std::time::Instant::now() >= deadline {
diff --git a/src/utils/vm/qemu.rs b/src/utils/vm/qemu.rs
index 1f750631..1046b7a9 100644
--- a/src/utils/vm/qemu.rs
+++ b/src/utils/vm/qemu.rs
@@ -231,7 +231,9 @@ pub fn build_qemu_args(
     // Failures degrade gracefully: log + skip, kernel falls back to the
     // auto-generated DTB it would have used anyway.
     if matches!(arch.as_str(), "arm64" | "aarch64") {
-        let dtb_override = std::env::var("AVOCADO_VM_DTB").ok().filter(|s| !s.is_empty());
+        let dtb_override = std::env::var("AVOCADO_VM_DTB")
+            .ok()
+            .filter(|s| !s.is_empty());
         match dtb_override {
             Some(path) => {
                 args.push("-dtb".into());
@@ -288,8 +290,7 @@ fn ensure_idle_states_dtb(paths: &VmPaths, cfg: &QemuConfig) -> Result<PathBuf>
     let raw = std::fs::read(tmp.path())
         .with_context(|| format!("failed to read dumped DTB at {}", tmp.path().display()))?;
     let mut fdt = fdt::parse(&raw).context("failed to parse QEMU-generated DTB")?;
-    fdt::patch_idle_states(&mut fdt, cfg.cpus)
-        .context("failed to splice idle-states into DTB")?;
+    fdt::patch_idle_states(&mut fdt, cfg.cpus).context("failed to splice idle-states into DTB")?;
     let patched = fdt::serialize(&fdt);
     std::fs::write(tmp.path(), &patched)
         .with_context(|| format!("failed to write patched DTB to {}", tmp.path().display()))?;
@@ -341,10 +342,8 @@ fn dump_base_dtb(qemu_bin: &str, cfg: &QemuConfig, out: &Path) -> Result<()> {
 /// cache; a binary that hasn't been touched produces the same key
 /// indefinitely.
 fn qemu_binary_tag(qemu_bin: &str) -> Result<String> {
-    let path = which_on_path(qemu_bin)
-        .with_context(|| format!("{qemu_bin} not found on $PATH"))?;
-    let meta = std::fs::metadata(&path)
-        .with_context(|| format!("stat {}", path.display()))?;
+    let path = which_on_path(qemu_bin).with_context(|| format!("{qemu_bin} not found on $PATH"))?;
+    let meta = std::fs::metadata(&path).with_context(|| format!("stat {}", path.display()))?;
     let mtime = meta
         .modified()
         .ok()
diff --git a/src/utils/vm/supervisor.rs b/src/utils/vm/supervisor.rs
index 0055903e..9134e55f 100644
--- a/src/utils/vm/supervisor.rs
+++ b/src/utils/vm/supervisor.rs
@@ -419,10 +419,7 @@ fn spawn_ssh_tunnel(args: &RunArgs) -> Result<u32> {
         "-p",
         &args.internal_port.to_string(),
         "-L",
-        &format!(
-            "{}:/run/docker.sock",
-            args.docker_socket_internal.display()
-        ),
+        &format!("{}:/run/docker.sock", args.docker_socket_internal.display()),
         "root@127.0.0.1",
     ]);
     cmd.stdin(Stdio::null());
@@ -463,5 +460,7 @@ async fn wait_for_term() -> Result<()> {
 
 #[cfg(not(unix))]
 async fn wait_for_term() -> Result<()> {
-    tokio::signal::ctrl_c().await.context("install ctrl-c handler")
+    tokio::signal::ctrl_c()
+        .await
+        .context("install ctrl-c handler")
 }

From 64408641da770f701e821259633f34204e8ca9e3 Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 2 Jun 2026 21:41:16 -0400
Subject: [PATCH 18/21] chore: cfg(unix) gate the supervisor for Windows build

The hibernation supervisor uses tokio's UnixListener/UnixStream for
the docker socket path and tokio::signal::unix for graceful shutdown,
neither of which exists on Windows. Without gating, `cargo check
--target x86_64-pc-windows-gnu` fails with E0432 (unresolved
UnixListener/UnixStream imports).

Gated unix-only:
  - `pub mod supervisor` in utils/vm/mod.rs
  - `pub mod supervise` in commands/vm/mod.rs
  - `VmCommands::Supervise` variant + dispatch in main.rs
  - `spawn_supervisor` / `stop_supervisor` / `resolve_idle_after_secs`
    / `DEFAULT_IDLE_AFTER_SECS` in lifecycle.rs
  - The internal-port pick + internal-ssh-port file write in `start`

On Windows the hibernation feature is unavailable: QEMU binds the
user-facing port directly (today's pre-supervisor behavior), the
legacy long-lived docker forwarder runs, and the VM never auto-pauses.
---
 src/commands/vm/mod.rs    |  1 +
 src/main.rs               |  5 ++++-
 src/utils/vm/lifecycle.rs | 36 +++++++++++++++++++++++++++---------
 src/utils/vm/mod.rs       |  1 +
 4 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/src/commands/vm/mod.rs b/src/commands/vm/mod.rs
index 585cd74a..27a9746e 100644
--- a/src/commands/vm/mod.rs
+++ b/src/commands/vm/mod.rs
@@ -12,5 +12,6 @@ pub mod shell;
 pub mod start;
 pub mod status;
 pub mod stop;
+#[cfg(unix)]
 pub mod supervise;
 pub mod update;
diff --git a/src/main.rs b/src/main.rs
index 6d622125..34f328e1 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -3076,6 +3076,7 @@ async fn main() -> Result<()> {
                 cmd.execute().await
             }
             VmCommands::Stop { force } => commands::vm::stop::StopCommand { force }.execute().await,
+            #[cfg(unix)]
             VmCommands::Supervise {
                 user_port,
                 internal_port,
@@ -4565,7 +4566,9 @@ enum VmCommands {
     /// Long-lived hibernation supervisor. Internal — spawned by `vm start`,
     /// not for direct use. Owns the user-facing SSH port AND docker
     /// socket, proxies to QEMU's internal hostfwd / SSH tunnel, and
-    /// sends QMP stop/cont on the idle timeout.
+    /// sends QMP stop/cont on the idle timeout. Unix-only because the
+    /// docker socket path requires UnixListener.
+    #[cfg(unix)]
     #[command(hide = true)]
     Supervise {
         #[arg(long)]
diff --git a/src/utils/vm/lifecycle.rs b/src/utils/vm/lifecycle.rs
index cd5253ea..fca9e73b 100644
--- a/src/utils/vm/lifecycle.rs
+++ b/src/utils/vm/lifecycle.rs
@@ -166,13 +166,18 @@ pub async fn start(opts: StartOptions) -> Result<VmStatus> {
     // Loopback-only port QEMU's hostfwd binds to. The supervisor
     // listens on the user-facing `ssh_port` and proxies through to
     // this one; downstream callers (vm shell, forward.rs, Avocado.app)
-    // only ever see `ssh_port`.
-    let internal_ssh_port = qemu::pick_free_port()?;
-    std::fs::write(
-        paths.internal_ssh_port_file(),
-        internal_ssh_port.to_string(),
-    )
-    .with_context(|| format!("writing {}", paths.internal_ssh_port_file().display()))?;
+    // only ever see `ssh_port`. On non-unix the supervisor is
+    // unavailable (uses tokio's UnixListener), so QEMU binds the
+    // user-facing port directly — pre-supervisor behavior.
+    #[cfg(unix)]
+    let qemu_hostfwd_port = {
+        let internal = qemu::pick_free_port()?;
+        std::fs::write(paths.internal_ssh_port_file(), internal.to_string())
+            .with_context(|| format!("writing {}", paths.internal_ssh_port_file().display()))?;
+        internal
+    };
+    #[cfg(not(unix))]
+    let qemu_hostfwd_port = ssh_port;
 
     // Now that the port is known, write the ssh-config + wire it into
     // ~/.ssh/config. This is required for `DOCKER_HOST=ssh://avocado-vm`
@@ -189,7 +194,7 @@ pub async fn start(opts: StartOptions) -> Result<VmStatus> {
     let cfg = QemuConfig {
         memory_mib,
         cpus,
-        ssh_port: internal_ssh_port,
+        ssh_port: qemu_hostfwd_port,
         cmdline_extra: opts.cmdline_extra,
         artifact_dir: artifact_dir.clone(),
         workspace: workspace.clone(),
@@ -216,8 +221,16 @@ pub async fn start(opts: StartOptions) -> Result<VmStatus> {
     // `idle_after_secs` of no proxied activity, sends QMP `stop` to
     // halt the vCPUs; wakes on the next incoming TCP. boot_sync below
     // goes through the proxy, which is why we spawn before waiting.
+    // Unix-only because the supervisor uses tokio's UnixListener for
+    // the docker-socket path; on Windows the supervisor is absent
+    // (idle_after_secs forced to 0) and we fall through to the legacy
+    // long-lived docker forwarder below.
+    #[cfg(unix)]
     let idle_after_secs = resolve_idle_after_secs(&paths);
-    spawn_supervisor(&paths, ssh_port, internal_ssh_port, idle_after_secs).await?;
+    #[cfg(not(unix))]
+    let idle_after_secs: u64 = 0;
+    #[cfg(unix)]
+    spawn_supervisor(&paths, ssh_port, qemu_hostfwd_port, idle_after_secs).await?;
 
     // Wait for the guest to become ready — first signal wins (qga vs SSH).
     let signal = super::boot_sync::wait_for_guest_ready(&paths.qga_socket(), ssh_port, None)
@@ -311,6 +324,7 @@ async fn stop_inner(force: bool) -> Result<()> {
     // exited, the next `vm start` would race against a still-bound port.
     // The docker socket forwarder is an SSH child that can outlive QEMU
     // if we shut down by signal, leaving a stale `docker.sock`.
+    #[cfg(unix)]
     stop_supervisor(&paths);
     let _ = super::forward::stop(&paths).await;
 
@@ -765,12 +779,14 @@ fn write_ssh_config(paths: &VmPaths, ssh_port: u16) -> Result<()> {
 /// when the user steps away from active work and not pausing mid-pause
 /// during normal SSH/docker bursts. Users with snappier wake budgets
 /// can lower via `avocado vm config set idle.hibernate_after_secs N`.
+#[cfg(unix)]
 const DEFAULT_IDLE_AFTER_SECS: u64 = 60;
 
 /// Resolve the hibernate timeout. Env var wins (one-shot override for
 /// experimentation), else the persisted `idle.hibernate_after_secs`,
 /// else the default. `0` disables hibernation while keeping the proxy
 /// up — useful for isolating proxy issues from QMP issues.
+#[cfg(unix)]
 fn resolve_idle_after_secs(paths: &VmPaths) -> u64 {
     if let Ok(raw) = std::env::var("AVOCADO_VM_IDLE_HIBERNATE_SECS") {
         if let Ok(parsed) = raw.parse::<u64>() {
@@ -796,6 +812,7 @@ fn resolve_idle_after_secs(paths: &VmPaths) -> u64 {
 /// Best-effort SIGTERM → SIGKILL on the supervisor pid, then remove
 /// its pidfile + internal-ssh-port marker. Idempotent — missing
 /// pidfile / dead pid is a no-op.
+#[cfg(unix)]
 fn stop_supervisor(paths: &VmPaths) {
     let pidfile = paths.supervisor_pid();
     if let Ok(raw) = std::fs::read_to_string(&pidfile) {
@@ -818,6 +835,7 @@ fn stop_supervisor(paths: &VmPaths) {
     let _ = std::fs::remove_file(paths.internal_ssh_port_file());
 }
 
+#[cfg(unix)]
 async fn spawn_supervisor(
     paths: &VmPaths,
     user_port: u16,
diff --git a/src/utils/vm/mod.rs b/src/utils/vm/mod.rs
index 0d09870c..2b0001c3 100644
--- a/src/utils/vm/mod.rs
+++ b/src/utils/vm/mod.rs
@@ -39,4 +39,5 @@ pub mod share;
 pub mod ssh;
 pub mod staging;
 pub mod state;
+#[cfg(unix)]
 pub mod supervisor;

From fe6cc5faa45bf4aa7248ca24bc3e1e9af4f44865 Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 2 Jun 2026 22:10:30 -0400
Subject: [PATCH 19/21] feat(vm): `vm status` reports hibernated state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The hibernation supervisor halts vCPUs via QMP `stop` after the
configured idle timeout — a running pid alone no longer tells us
whether the guest is actively executing. Probe QMP `query-status`
from `lifecycle::status` (500ms timeout so a wedged QEMU doesn't
hang the command) and surface a "(hibernated — wakes on next
ssh/docker call)" suffix when paused.

`VmStatus` gains a `paused: Option<bool>` where `Some(true)` =
paused, `Some(false)` = QMP reported any other status (treated as
running), `None` = couldn't probe (no QMP socket, QMP error or
timeout, non-unix host, supervisor down). Lets desktop / other
consumers distinguish the three explicitly.
---
 src/commands/vm/status.rs |  6 ++++-
 src/utils/vm/lifecycle.rs | 47 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/src/commands/vm/status.rs b/src/commands/vm/status.rs
index 1ec01d9d..8edff1b0 100644
--- a/src/commands/vm/status.rs
+++ b/src/commands/vm/status.rs
@@ -10,8 +10,12 @@ impl StatusCommand {
     pub async fn execute(self) -> Result<()> {
         let s = lifecycle::status().await?;
         if s.running {
+            let state_tag = match s.paused {
+                Some(true) => " (hibernated — wakes on next ssh/docker call)",
+                _ => "",
+            };
             println!(
-                "avocado-vm running (pid {}, ssh 127.0.0.1:{})",
+                "avocado-vm running (pid {}, ssh 127.0.0.1:{}){state_tag}",
                 s.pid.unwrap_or(0),
                 s.ssh_port.unwrap_or(0),
             );
diff --git a/src/utils/vm/lifecycle.rs b/src/utils/vm/lifecycle.rs
index fca9e73b..7cae01f6 100644
--- a/src/utils/vm/lifecycle.rs
+++ b/src/utils/vm/lifecycle.rs
@@ -89,6 +89,12 @@ pub struct VmStatus {
     pub manifest_platform: Option<String>,
     pub manifest_arch: Option<String>,
     pub paths: VmPaths,
+    /// `Some(true)` when QEMU has been paused by the hibernation
+    /// supervisor (vCPUs halted, host CPU ~0%, RAM resident — wakes
+    /// on next inbound SSH/docker connection). `Some(false)` when
+    /// confirmed running. `None` when liveness couldn't be probed (no
+    /// QMP socket, supervisor down, non-unix host, etc.).
+    pub paused: Option<bool>,
 }
 
 /// Start the VM. Errors if one is already running. Performs manifest sha256
@@ -303,6 +309,8 @@ pub async fn start(opts: StartOptions) -> Result<VmStatus> {
         manifest_platform: Some(manifest.platform),
         manifest_arch: Some(manifest.architecture),
         paths,
+        // We just finished boot — definitely not paused.
+        paused: Some(false),
     })
 }
 
@@ -393,6 +401,15 @@ pub async fn status() -> Result<VmStatus> {
         (None, None)
     };
 
+    // Probe QMP for paused state — the hibernation supervisor halts
+    // vCPUs via QMP `stop` when idle, so a running pid does not
+    // necessarily mean the guest is actively executing. `query-status`
+    // returns `{ status: "running" | "paused", ... }`. Failures here
+    // (no socket, QMP unreachable on non-unix, etc.) leave the field
+    // as `None` so callers can distinguish "couldn't tell" from
+    // "definitely paused".
+    let paused = if running { probe_paused(&paths).await } else { None };
+
     Ok(VmStatus {
         running,
         pid: if running { pid } else { None },
@@ -400,6 +417,7 @@ pub async fn status() -> Result<VmStatus> {
         manifest_platform: platform,
         manifest_arch: arch,
         paths,
+        paused,
     })
 }
 
@@ -774,6 +792,35 @@ fn write_ssh_config(paths: &VmPaths, ssh_port: u16) -> Result<()> {
     Ok(())
 }
 
+/// Ask QEMU via QMP whether the VM is currently paused. Returns
+/// `Some(true)` for paused, `Some(false)` for any non-paused running
+/// state, and `None` if the QMP socket is unreachable. Short timeout
+/// (~500ms) so a wedged QEMU doesn't make `vm status` hang.
+async fn probe_paused(paths: &VmPaths) -> Option<bool> {
+    #[cfg(unix)]
+    {
+        if !paths.qmp_socket().exists() {
+            return None;
+        }
+        let probe = async {
+            let mut client = QmpClient::connect(&paths.qmp_socket()).await.ok()?;
+            let v = client.execute("query-status", None).await.ok()?;
+            v.get("status")
+                .and_then(|s| s.as_str())
+                .map(|s| s == "paused")
+        };
+        tokio::time::timeout(Duration::from_millis(500), probe)
+            .await
+            .ok()
+            .flatten()
+    }
+    #[cfg(not(unix))]
+    {
+        let _ = paths;
+        None
+    }
+}
+
 /// Default idle timeout in seconds when neither config nor env var sets
 /// one. One minute strikes a balance between freeing host CPU promptly
 /// when the user steps away from active work and not pausing mid-pause

From 9b123aad39e00e8ffc9e4e8422583cd1f39e9e67 Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 2 Jun 2026 23:10:43 -0400
Subject: [PATCH 20/21] perf(vm): infrastructure lane for idle-exempt telemetry
 + timestamps

Adds a second TCP listener (`infra-ssh-port`) and a second docker
socket (`docker-stream.sock`) on the supervisor. Connections via
these "infra" lanes are proxied identically to the user-facing
counterparts but do NOT count toward the idle-hibernation timer, AND
do NOT wake the VM if it's paused.

Without this, long-lived telemetry channels (the desktop app's agent
SSH tunnel, a future docker `/events` subscription, etc.) would
either pin the VM awake forever (when held open while it is running)
or wake it again as soon as it hibernates, in a reconnect loop (when
the backing SSH tunnel dies on pause and the consumer reconnects).

Semantics:
  - User-facing: `wake()` on accept, `ensure_tunnel()` on docker conn,
    counts toward idle while open.
  - Infra: no wake, no `ensure_tunnel`. Fast-fail if the tunnel isn't
    already up (caller backs off and retries). Tunnel comes up only
    when real user activity happens.

Net result: hibernation actually sticks when the desktop is open.
Background telemetry sees the VM as available iff a human is using
it; otherwise it accepts the timer-driven hibernation.

Also adds UTC RFC3339 timestamps to `supervisor.log` via a `slog!`
macro so pause/wake cycles are legible without correlating against
shell history.

Wired into the CLI Supervise subcommand via two new flags
(`--infra-port`, `--docker-socket-stream`). `state.rs` gains
`infra_ssh_port_file()` and `docker_socket_stream()` accessors;
`lifecycle::start` picks the infra port + passes paths through.
---
 src/commands/vm/supervise.rs |   4 +
 src/main.rs                  |   8 ++
 src/utils/vm/lifecycle.rs    |  31 +++++-
 src/utils/vm/state.rs        |  18 ++++
 src/utils/vm/supervisor.rs   | 178 ++++++++++++++++++++++++++++-------
 5 files changed, 204 insertions(+), 35 deletions(-)

diff --git a/src/commands/vm/supervise.rs b/src/commands/vm/supervise.rs
index ce38e4d0..740bace0 100644
--- a/src/commands/vm/supervise.rs
+++ b/src/commands/vm/supervise.rs
@@ -13,11 +13,13 @@ use crate::utils::vm::supervisor::{run, RunArgs};
 pub struct SuperviseCommand {
     pub user_port: u16,
     pub internal_port: u16,
+    pub infra_port: u16,
     pub qmp_socket: PathBuf,
     pub idle_after_secs: u64,
     pub pid_file: PathBuf,
     pub docker_socket: PathBuf,
     pub docker_socket_internal: PathBuf,
+    pub docker_socket_stream: PathBuf,
     pub ssh_key: PathBuf,
     pub known_hosts: PathBuf,
 }
@@ -27,11 +29,13 @@ impl SuperviseCommand {
         run(RunArgs {
             user_port: self.user_port,
             internal_port: self.internal_port,
+            infra_port: self.infra_port,
             qmp_socket: self.qmp_socket,
             idle_after_secs: self.idle_after_secs,
             pid_file: self.pid_file,
             docker_socket: self.docker_socket,
             docker_socket_internal: self.docker_socket_internal,
+            docker_socket_stream: self.docker_socket_stream,
             ssh_key: self.ssh_key,
             known_hosts: self.known_hosts,
         })
diff --git a/src/main.rs b/src/main.rs
index 34f328e1..fa85bd67 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -3080,22 +3080,26 @@ async fn main() -> Result<()> {
             VmCommands::Supervise {
                 user_port,
                 internal_port,
+                infra_port,
                 qmp_socket,
                 idle_after_secs,
                 pid_file,
                 docker_socket,
                 docker_socket_internal,
+                docker_socket_stream,
                 ssh_key,
                 known_hosts,
             } => {
                 commands::vm::supervise::SuperviseCommand {
                     user_port,
                     internal_port,
+                    infra_port,
                     qmp_socket,
                     idle_after_secs,
                     pid_file,
                     docker_socket,
                     docker_socket_internal,
+                    docker_socket_stream,
                     ssh_key,
                     known_hosts,
                 }
@@ -4576,6 +4580,8 @@ enum VmCommands {
         #[arg(long)]
         internal_port: u16,
         #[arg(long)]
+        infra_port: u16,
+        #[arg(long)]
         qmp_socket: std::path::PathBuf,
         #[arg(long)]
         idle_after_secs: u64,
@@ -4586,6 +4592,8 @@ enum VmCommands {
         #[arg(long)]
         docker_socket_internal: std::path::PathBuf,
         #[arg(long)]
+        docker_socket_stream: std::path::PathBuf,
+        #[arg(long)]
         ssh_key: std::path::PathBuf,
         #[arg(long)]
         known_hosts: std::path::PathBuf,
diff --git a/src/utils/vm/lifecycle.rs b/src/utils/vm/lifecycle.rs
index 7cae01f6..f1995289 100644
--- a/src/utils/vm/lifecycle.rs
+++ b/src/utils/vm/lifecycle.rs
@@ -185,6 +185,22 @@ pub async fn start(opts: StartOptions) -> Result<VmStatus> {
     #[cfg(not(unix))]
     let qemu_hostfwd_port = ssh_port;
 
+    // Infrastructure SSH port — second listening port that the
+    // supervisor proxies identically to `ssh_port`, but connections to
+    // it neither count toward the idle-hibernation timer nor wake a
+    // paused VM. Long-lived telemetry channels (Avocado.app's agent
+    // SSH tunnel, docker /events subscriptions) connect here so they
+    // can't pin the VM awake or resume it on reconnect.
+    #[cfg(unix)]
+    let infra_ssh_port = {
+        let port = qemu::pick_free_port()?;
+        std::fs::write(paths.infra_ssh_port_file(), port.to_string())
+            .with_context(|| format!("writing {}", paths.infra_ssh_port_file().display()))?;
+        port
+    };
+    #[cfg(not(unix))]
+    let infra_ssh_port = ssh_port;
+
     // Now that the port is known, write the ssh-config + wire it into
     // ~/.ssh/config. This is required for `DOCKER_HOST=ssh://avocado-vm`
     // to resolve in any subprocess we spawn — Docker's ssh transport reads
@@ -236,7 +252,14 @@ pub async fn start(opts: StartOptions) -> Result<VmStatus> {
     #[cfg(not(unix))]
     let idle_after_secs: u64 = 0;
     #[cfg(unix)]
-    spawn_supervisor(&paths, ssh_port, qemu_hostfwd_port, idle_after_secs).await?;
+    spawn_supervisor(
+        &paths,
+        ssh_port,
+        qemu_hostfwd_port,
+        infra_ssh_port,
+        idle_after_secs,
+    )
+    .await?;
 
     // Wait for the guest to become ready — first signal wins (qga vs SSH).
     let signal = super::boot_sync::wait_for_guest_ready(&paths.qga_socket(), ssh_port, None)
@@ -880,6 +903,7 @@ fn stop_supervisor(paths: &VmPaths) {
     }
     let _ = std::fs::remove_file(pidfile);
     let _ = std::fs::remove_file(paths.internal_ssh_port_file());
+    let _ = std::fs::remove_file(paths.infra_ssh_port_file());
 }
 
 #[cfg(unix)]
@@ -887,6 +911,7 @@ async fn spawn_supervisor(
     paths: &VmPaths,
     user_port: u16,
     internal_port: u16,
+    infra_port: u16,
     idle_after_secs: u64,
 ) -> Result<()> {
     let exe = std::env::current_exe().context("locating current avocado binary")?;
@@ -898,6 +923,8 @@ async fn spawn_supervisor(
         &user_port.to_string(),
         "--internal-port",
         &internal_port.to_string(),
+        "--infra-port",
+        &infra_port.to_string(),
         "--qmp-socket",
         &paths.qmp_socket().to_string_lossy(),
         "--idle-after-secs",
@@ -908,6 +935,8 @@ async fn spawn_supervisor(
         &paths.docker_socket().to_string_lossy(),
         "--docker-socket-internal",
         &paths.docker_socket_internal().to_string_lossy(),
+        "--docker-socket-stream",
+        &paths.docker_socket_stream().to_string_lossy(),
         "--ssh-key",
         &paths.ssh_key().to_string_lossy(),
         "--known-hosts",
diff --git a/src/utils/vm/state.rs b/src/utils/vm/state.rs
index 6c161118..19edfb0b 100644
--- a/src/utils/vm/state.rs
+++ b/src/utils/vm/state.rs
@@ -183,6 +183,22 @@ impl VmPaths {
     pub fn internal_ssh_port_file(&self) -> PathBuf {
         self.root.join("internal-ssh-port")
     }
+    /// Supervisor's "infrastructure" SSH lane — a second TCP listener
+    /// that proxies to QEMU's internal hostfwd identically to the
+    /// user-facing port, BUT does not count toward the idle-hibernation
+    /// activity tracker. Long-lived telemetry channels (Avocado.app's
+    /// agent SSH tunnel, future event-stream consumers) connect here
+    /// so they neither wake a paused VM nor pin it awake forever.
+    pub fn infra_ssh_port_file(&self) -> PathBuf {
+        self.root.join("infra-ssh-port")
+    }
+    /// "Infrastructure" docker socket. Same backing SSH tunnel as
+    /// `docker_socket()`, but connections here don't count toward idle
+    /// — meant for streaming subscriptions like `GET /events` that
+    /// stay open for the VM's lifetime.
+    pub fn docker_socket_stream(&self) -> PathBuf {
+        self.root.join("docker-stream.sock")
+    }
     /// Absolute path to the artifact directory that was last used for `vm
     /// start`. The macOS Avocado.app reads this when launched without an
     /// AVOCADO_VM_DIR env var (Finder/Dock launches inherit a sanitized env
@@ -280,9 +296,11 @@ pub fn cleanup_transient(paths: &VmPaths) {
         paths.lock_file(),
         paths.docker_socket(),
         paths.docker_socket_internal(),
+        paths.docker_socket_stream(),
         paths.forwarder_pid(),
         paths.supervisor_pid(),
         paths.internal_ssh_port_file(),
+        paths.infra_ssh_port_file(),
     ] {
         let _ = std::fs::remove_file(&p);
     }
diff --git a/src/utils/vm/supervisor.rs b/src/utils/vm/supervisor.rs
index 9134e55f..dcb911bf 100644
--- a/src/utils/vm/supervisor.rs
+++ b/src/utils/vm/supervisor.rs
@@ -41,6 +41,15 @@ use tokio::sync::Mutex;
 use super::qmp::QmpClient;
 use super::state;
 
+/// Log a supervisor event with a UTC timestamp prefix. Timestamps make
+/// pause/wake cycles in `~/.avocado/vm/supervisor.log` legible without
+/// having to correlate against shell history.
+macro_rules! slog {
+    ($($arg:tt)*) => {{
+        eprintln!("[{}] supervisor: {}", chrono::Utc::now().to_rfc3339(), format_args!($($arg)*))
+    }};
+}
+
 /// Arguments passed from the `avocado vm supervise` subcommand into the
 /// supervisor loop. Plain owned data so the caller can construct it from
 /// clap-parsed flags without leaking lifetimes.
@@ -64,6 +73,16 @@ pub struct RunArgs {
     /// Host path the supervisor's SSH `-L` tunnel binds to; only the
     /// docker proxy connects here.
     pub docker_socket_internal: PathBuf,
+    /// "Infrastructure" TCP lane — second listening port that
+    /// neither wakes the VM on connect nor counts toward idle.
+    /// Used by long-lived telemetry channels (Avocado.app's agent SSH
+    /// tunnel) so they don't pin the VM awake.
+    pub infra_port: u16,
+    /// "Infrastructure" docker socket. Same SSH `-L` tunnel as
+    /// `docker_socket`, but accepted connections here don't count
+    /// toward idle — meant for `GET /events` style streaming
+    /// subscriptions.
+    pub docker_socket_stream: PathBuf,
     /// SSH private key for tunneling to the guest.
     pub ssh_key: PathBuf,
     /// known_hosts file the SSH tunnel uses.
@@ -102,7 +121,7 @@ impl State {
                 .await
                 .context("QMP cont")?;
             self.paused.store(false, Ordering::Relaxed);
-            eprintln!("supervisor: resumed VM on incoming connection");
+            slog!("resumed VM on incoming connection");
         }
         Ok(())
     }
@@ -145,7 +164,7 @@ impl State {
         loop {
             if self.args.docker_socket_internal.exists() {
                 *lock = Some(pid);
-                eprintln!("supervisor: docker tunnel up (pid {pid})");
+                slog!("docker tunnel up (pid {pid})");
                 return Ok(());
             }
             if !state::pid_alive(pid) {
@@ -244,6 +263,28 @@ pub async fn run(args: RunArgs) -> Result<()> {
         args.docker_socket.display()
     );
 
+    // Infrastructure TCP lane — proxies to the same internal hostfwd, but
+    // does NOT wake the VM on connect or count toward idle. Long-lived
+    // telemetry channels (desktop's agent SSH tunnel, future event-stream
+    // consumers) connect here so they don't pin the VM awake.
+    let infra_tcp_listener = TcpListener::bind(("127.0.0.1", args.infra_port))
+        .await
+        .with_context(|| format!("binding 127.0.0.1:{}", args.infra_port))?;
+    eprintln!(
+        "supervisor: infra TCP listening on 127.0.0.1:{} → 127.0.0.1:{} (idle-exempt)",
+        args.infra_port, args.internal_port
+    );
+
+    // Infrastructure docker socket — same SSH tunnel, doesn't count toward
+    // idle. Meant for `GET /events` streaming subscriptions.
+    let _ = std::fs::remove_file(&args.docker_socket_stream);
+    let infra_unix_listener = UnixListener::bind(&args.docker_socket_stream)
+        .with_context(|| format!("binding {}", args.docker_socket_stream.display()))?;
+    eprintln!(
+        "supervisor: infra Unix listening on {} → SSH→/run/docker.sock (idle-exempt)",
+        args.docker_socket_stream.display()
+    );
+
     if args.idle_after_secs > 0 {
         let state_t = state.clone();
         tokio::spawn(async move {
@@ -257,7 +298,7 @@ pub async fn run(args: RunArgs) -> Result<()> {
     let state_sig = state.clone();
     tokio::spawn(async move {
         if let Err(e) = wait_for_term().await {
-            eprintln!("supervisor: signal handler error: {e:#}");
+            slog!("signal handler error: {e:#}");
             return;
         }
         let _ = state_sig.wake().await; // ensure VM is resumed before we exit
@@ -265,32 +306,58 @@ pub async fn run(args: RunArgs) -> Result<()> {
         std::process::exit(0);
     });
 
-    // Main accept loop: select between TCP and Unix listeners. Spawned
+    // Main accept loop: select between TCP/Unix user-facing listeners
+    // (counted) and the two infra listeners (idle-exempt). Spawned
     // tasks own their connection through close.
     loop {
         tokio::select! {
             res = tcp_listener.accept() => {
                 let (sock, peer) = match res {
                     Ok(v) => v,
-                    Err(e) => { eprintln!("supervisor: TCP accept error: {e:#}"); continue; }
+                    Err(e) => { slog!("TCP accept error: {e:#}"); continue; }
+                };
+                let s = state.clone();
+                let internal_port = args.internal_port;
+                tokio::spawn(async move {
+                    if let Err(e) = handle_tcp(sock, internal_port, s, /* count */ true).await {
+                        slog!("TCP conn {peer} error: {e:#}");
+                    }
+                });
+            }
+            res = infra_tcp_listener.accept() => {
+                let (sock, peer) = match res {
+                    Ok(v) => v,
+                    Err(e) => { slog!("infra TCP accept error: {e:#}"); continue; }
                 };
                 let s = state.clone();
                 let internal_port = args.internal_port;
                 tokio::spawn(async move {
-                    if let Err(e) = handle_tcp(sock, internal_port, s).await {
-                        eprintln!("supervisor: TCP conn {peer} error: {e:#}");
+                    if let Err(e) = handle_tcp(sock, internal_port, s, /* count */ false).await {
+                        slog!("infra TCP conn {peer} error: {e:#}");
                     }
                 });
             }
             res = unix_listener.accept() => {
                 let (sock, _peer) = match res {
                     Ok(v) => v,
-                    Err(e) => { eprintln!("supervisor: Unix accept error: {e:#}"); continue; }
+                    Err(e) => { slog!("Unix accept error: {e:#}"); continue; }
+                };
+                let s = state.clone();
+                tokio::spawn(async move {
+                    if let Err(e) = handle_docker(sock, s, /* count */ true).await {
+                        slog!("docker conn error: {e:#}");
+                    }
+                });
+            }
+            res = infra_unix_listener.accept() => {
+                let (sock, _peer) = match res {
+                    Ok(v) => v,
+                    Err(e) => { slog!("infra Unix accept error: {e:#}"); continue; }
                 };
                 let s = state.clone();
                 tokio::spawn(async move {
-                    if let Err(e) = handle_docker(sock, s).await {
-                        eprintln!("supervisor: docker conn error: {e:#}");
+                    if let Err(e) = handle_docker(sock, s, /* count */ false).await {
+                        slog!("infra docker conn error: {e:#}");
                     }
                 });
             }
@@ -298,12 +365,32 @@ pub async fn run(args: RunArgs) -> Result<()> {
     }
 }
 
-async fn handle_tcp(mut incoming: TcpStream, internal_port: u16, state: Arc<State>) -> Result<()> {
-    state.active_conns.fetch_add(1, Ordering::Relaxed);
-    state.touch();
-
-    if let Err(e) = state.wake().await {
-        eprintln!("supervisor: wake failed: {e}");
+/// Proxy a TCP connection from a user-facing or infra listener to QEMU's
+/// internal hostfwd.
+///
+/// `count`-true: user-facing traffic (a real SSH session, etc.) —
+/// bumps `active_conns` + activity time, and calls `wake()` to bring
+/// the VM out of hibernation. Drives the VM lifecycle.
+///
+/// `count`-false: infrastructure (long-lived telemetry like the
+/// desktop's agent SSH tunnel) — does NOT touch activity counters and
+/// does NOT wake the VM. Just opportunistically uses the VM if it's
+/// already running. Otherwise the inner connect to `internal_port`
+/// succeeds (QEMU slirp accepts) but bytes queue without delivery; the
+/// caller times out and retries with backoff. Keeps hibernation
+/// intact: only real user activity can wake the VM.
+async fn handle_tcp(
+    mut incoming: TcpStream,
+    internal_port: u16,
+    state: Arc<State>,
+    count: bool,
+) -> Result<()> {
+    if count {
+        state.active_conns.fetch_add(1, Ordering::Relaxed);
+        state.touch();
+        if let Err(e) = state.wake().await {
+            slog!("wake failed: {e}");
+        }
     }
 
     let mut inner = TcpStream::connect(("127.0.0.1", internal_port))
@@ -313,23 +400,44 @@ async fn handle_tcp(mut incoming: TcpStream, internal_port: u16, state: Arc<Stat
     let _ = incoming.shutdown().await;
     let _ = inner.shutdown().await;
 
-    state.active_conns.fetch_sub(1, Ordering::Relaxed);
-    state.touch();
+    if count {
+        state.active_conns.fetch_sub(1, Ordering::Relaxed);
+        state.touch();
+    }
     classify_close(res)
 }
 
-async fn handle_docker(mut client: UnixStream, state: Arc<State>) -> Result<()> {
-    state.active_conns.fetch_add(1, Ordering::Relaxed);
-    state.touch();
-
-    // Wake VM first (QMP cont). Then bring the SSH tunnel up — the
-    // tunnel's auth handshake needs guest sshd running, which is only
-    // true post-wake.
-    state.wake().await.context("waking VM for docker conn")?;
-    state
-        .ensure_tunnel()
-        .await
-        .context("bringing docker tunnel up")?;
+/// Proxy a docker client to the supervisor-managed SSH tunnel.
+///
+/// `count`-true: user-facing docker call (avocado build, docker ps from
+/// the user shell, etc.) — wakes VM + brings tunnel up if needed,
+/// counts toward idle. The VM stays awake until the call finishes.
+///
+/// `count`-false: infrastructure (containers watcher's `/events` stream,
+/// snapshot refreshes) — does NOT wake VM and does NOT ensure the
+/// tunnel. Just connects to the existing tunnel socket; if it's down
+/// (paused VM, boot still in progress), returns a fast error. Caller
+/// backs off and retries. This is what lets hibernation actually stick
+/// when the desktop is open: the watcher's reconnect attempts can't
+/// pin the VM awake by themselves, only user activity can.
+async fn handle_docker(mut client: UnixStream, state: Arc<State>, count: bool) -> Result<()> {
+    if count {
+        state.active_conns.fetch_add(1, Ordering::Relaxed);
+        state.touch();
+        state.wake().await.context("waking VM for docker conn")?;
+        state
+            .ensure_tunnel()
+            .await
+            .context("bringing docker tunnel up")?;
+    } else if !state.args.docker_socket_internal.exists() {
+        // Infra: tunnel not currently up. Fail fast — caller (likely
+        // ContainersWatcher) backs off and retries; a future
+        // user-driven docker call will bring the tunnel up and the
+        // next retry will succeed.
+        return Err(anyhow::anyhow!(
+            "docker tunnel not up (VM paused or still booting)"
+        ));
+    }
 
     let mut backend = UnixStream::connect(&state.args.docker_socket_internal)
         .await
@@ -343,8 +451,10 @@ async fn handle_docker(mut client: UnixStream, state: Arc<State>) -> Result<()>
     let _ = client.shutdown().await;
     let _ = backend.shutdown().await;
 
-    state.active_conns.fetch_sub(1, Ordering::Relaxed);
-    state.touch();
+    if count {
+        state.active_conns.fetch_sub(1, Ordering::Relaxed);
+        state.touch();
+    }
     classify_close(res)
 }
 
@@ -377,9 +487,9 @@ async fn idle_watcher(state: Arc<State>) {
         let since = now_ms() - state.last_activity_ms.load(Ordering::Relaxed);
         if since >= state.idle_threshold_ms {
             match state.pause().await {
-                Ok(_) => eprintln!("supervisor: paused VM after {since} ms idle"),
+                Ok(_) => slog!("paused VM after {since} ms idle"),
                 Err(e) => {
-                    eprintln!("supervisor: pause failed: {e}");
+                    slog!("pause failed: {e}");
                     state.touch(); // back off
                 }
             }

From e45fa150cc180c3f7e14d1face92f52a0a563a06 Mon Sep 17 00:00:00 2001
From: Justin Schneck <j.schneck@peridio.com>
Date: Tue, 2 Jun 2026 11:27:54 -0400
Subject: [PATCH 21/21] release: bump to 0.41.0

---
 Cargo.lock | 2 +-
 Cargo.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index d530a751..eecd58a4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -130,7 +130,7 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
 
 [[package]]
 name = "avocado-cli"
-version = "0.40.2"
+version = "0.41.0"
 dependencies = [
  "anyhow",
  "base64",
diff --git a/Cargo.toml b/Cargo.toml
index 2dc0d143..3b05098a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "avocado-cli"
-version = "0.40.2"
+version = "0.41.0"
 edition = "2021"
 description = "Command line interface for Avocado."
 authors = ["Avocado"]